Merge branch 'master' into imatrix
This commit is contained in:
commit
adbff66394
|
|
@ -0,0 +1,130 @@
|
|||
# ==============================================================================
|
||||
# ARGUMENTS
|
||||
# ==============================================================================
|
||||
|
||||
# Define the CANN base image for easier version updates later
|
||||
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
|
||||
|
||||
# ==============================================================================
|
||||
# BUILD STAGE
|
||||
# Compile all binary files and libraries
|
||||
# ==============================================================================
|
||||
FROM ${CANN_BASE_IMAGE} AS build
|
||||
|
||||
# Define the Ascend chip model for compilation. Default is Ascend910B3
|
||||
ARG ASCEND_SOC_TYPE=Ascend910B3
|
||||
|
||||
# -- Install build dependencies --
|
||||
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
|
||||
yum clean all && \
|
||||
rm -rf /var/cache/yum
|
||||
|
||||
# -- Set the working directory --
|
||||
WORKDIR /app
|
||||
|
||||
# -- Copy project files --
|
||||
COPY . .
|
||||
|
||||
# -- Set CANN environment variables (required for compilation) --
|
||||
# Using ENV instead of `source` allows environment variables to persist across the entire image layer
|
||||
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
|
||||
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
|
||||
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
|
||||
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
|
||||
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
|
||||
# ... You can add other environment variables from the original file as needed ...
|
||||
# For brevity, only core variables are listed here. You can paste the original ENV list here.
|
||||
|
||||
# -- Build llama.cpp --
|
||||
# Use the passed ASCEND_SOC_TYPE argument and add general build options
|
||||
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
|
||||
&& \
|
||||
cmake -B build \
|
||||
-DGGML_CANN=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DSOC_TYPE=${ASCEND_SOC_TYPE} \
|
||||
. && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
# -- Organize build artifacts for copying in later stages --
|
||||
# Create a lib directory to store all .so files
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
# Create a full directory to store all executables and Python scripts
|
||||
RUN mkdir -p /app/full && \
|
||||
cp build/bin/* /app/full/ && \
|
||||
cp *.py /app/full/ && \
|
||||
cp -r gguf-py /app/full/ && \
|
||||
cp -r requirements /app/full/ && \
|
||||
cp requirements.txt /app/full/
|
||||
# If you have a tools.sh script, make sure it is copied here
|
||||
# cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
# ==============================================================================
|
||||
# BASE STAGE
|
||||
# Create a minimal base image with CANN runtime and common libraries
|
||||
# ==============================================================================
|
||||
FROM ${CANN_BASE_IMAGE} AS base
|
||||
|
||||
# -- Install runtime dependencies --
|
||||
RUN yum install -y libgomp curl && \
|
||||
yum clean all && \
|
||||
rm -rf /var/cache/yum
|
||||
|
||||
# -- Set CANN environment variables (required for runtime) --
|
||||
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
|
||||
ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
|
||||
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
|
||||
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
|
||||
# ... You can add other environment variables from the original file as needed ...
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Copy compiled .so files from the build stage
|
||||
COPY --from=build /app/lib/ /app
|
||||
|
||||
# ==============================================================================
|
||||
# FINAL STAGES (TARGETS)
|
||||
# ==============================================================================
|
||||
|
||||
### Target: full
|
||||
# Complete image with all tools, Python bindings, and dependencies
|
||||
# ==============================================================================
|
||||
FROM base AS full
|
||||
|
||||
COPY --from=build /app/full /app
|
||||
|
||||
# Install Python dependencies
|
||||
RUN yum install -y git python3 python3-pip && \
|
||||
pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
|
||||
pip3 install --no-cache-dir -r requirements.txt && \
|
||||
yum clean all && \
|
||||
rm -rf /var/cache/yum
|
||||
|
||||
# You need to provide a tools.sh script as the entrypoint
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
# If there is no tools.sh, you can set the default to start the server
|
||||
# ENTRYPOINT ["/app/llama-server"]
|
||||
|
||||
### Target: light
|
||||
# Lightweight image containing only llama-cli
|
||||
# ==============================================================================
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Target: server
|
||||
# Dedicated server image containing only llama-server
|
||||
# ==============================================================================
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
||||
|
|
@ -1,8 +1,8 @@
|
|||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG ROCM_VERSION=6.3
|
||||
ARG AMDGPU_VERSION=6.3
|
||||
ARG ROCM_VERSION=6.4
|
||||
ARG AMDGPU_VERSION=6.4
|
||||
|
||||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
|
|
|||
|
|
@ -0,0 +1,45 @@
|
|||
name: Check Pre-Tokenizer Hashes
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'convert_hf_to_gguf.py'
|
||||
- 'convert_hf_to_gguf_update.py'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'convert_hf_to_gguf.py'
|
||||
- 'convert_hf_to_gguf_update.py'
|
||||
|
||||
jobs:
|
||||
pre-tokenizer-hashes:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
python3 -m venv .venv
|
||||
.venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
|
||||
|
||||
- name: Update pre-tokenizer hashes
|
||||
run: |
|
||||
cp convert_hf_to_gguf.py /tmp
|
||||
.venv/bin/python convert_hf_to_gguf_update.py --check-missing
|
||||
|
||||
- name: Check if committed pre-tokenizer hashes matches generated version
|
||||
run: |
|
||||
if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
|
||||
echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
|
||||
echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
|
||||
echo "Differences found:"
|
||||
diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
|
||||
exit 1
|
||||
fi
|
||||
echo "Model pre-tokenizer hashes are up to date."
|
||||
|
|
@ -82,6 +82,7 @@ models/*
|
|||
models-mnt
|
||||
!models/.editorconfig
|
||||
!models/ggml-vocab-*.gguf*
|
||||
!models/templates
|
||||
|
||||
# Zig
|
||||
zig-out/
|
||||
|
|
|
|||
|
|
@ -977,6 +977,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|||
for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
|
||||
string_process_escapes(seq_breaker);
|
||||
}
|
||||
for (auto & pair : params.speculative.replacements) {
|
||||
string_process_escapes(pair.first);
|
||||
string_process_escapes(pair.second);
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.kv_overrides.empty()) {
|
||||
|
|
@ -2091,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
params.no_kv_offload = true;
|
||||
}
|
||||
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
|
||||
add_opt(common_arg(
|
||||
{"-nr", "--no-repack"},
|
||||
"disable weight repacking",
|
||||
[](common_params & params) {
|
||||
params.no_extra_bufts = true;
|
||||
}
|
||||
).set_env("LLAMA_ARG_NO_REPACK"));
|
||||
add_opt(common_arg(
|
||||
{"-ctk", "--cache-type-k"}, "TYPE",
|
||||
string_format(
|
||||
|
|
@ -2369,6 +2380,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
}
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--cpu-moe"},
|
||||
"use CPU for Mixture of Experts (MoE) weights",
|
||||
[](common_params & params) {
|
||||
params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
|
||||
params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
|
||||
params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
|
||||
}
|
||||
).set_env("LLAMA_ARG_CPU_MOE"));
|
||||
add_opt(common_arg(
|
||||
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
||||
"number of layers to store in VRAM",
|
||||
|
|
@ -2627,6 +2647,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
params.n_out_freq = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||
add_opt(common_arg(
|
||||
{"--output-format"}, "{gguf,dat}",
|
||||
string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
|
||||
[](common_params & params, const std::string & value) {
|
||||
/**/ if (value == "gguf") { params.imat_dat = false; }
|
||||
else if (value == "dat") { params.imat_dat = true; }
|
||||
else { throw std::invalid_argument("invalid output format"); }
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||
add_opt(common_arg(
|
||||
{"--save-frequency"}, "N",
|
||||
string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
|
||||
|
|
@ -3249,6 +3278,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
params.speculative.model.path = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
|
||||
add_opt(common_arg(
|
||||
{"--spec-replace"}, "TARGET", "DRAFT",
|
||||
"translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
|
||||
[](common_params & params, const std::string & tgt, const std::string & dft) {
|
||||
params.speculative.replacements.push_back({ tgt, dft });
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
|
||||
string_format(
|
||||
|
|
@ -3438,28 +3474,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
|
||||
// diffusion parameters
|
||||
add_opt(common_arg(
|
||||
{ "--diffusion-steps" }, "N",
|
||||
string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
|
||||
[](common_params & params, int value) { params.diffusion.steps = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||
add_opt(common_arg(
|
||||
{ "--diffusion-eps" }, "F",
|
||||
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
|
||||
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||
add_opt(common_arg(
|
||||
{ "--diffusion-algorithm" }, "N",
|
||||
string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
|
||||
params.diffusion.algorithm),
|
||||
[](common_params & params, int value) { params.diffusion.algorithm = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||
add_opt(common_arg(
|
||||
{ "--diffusion-alg-temp" }, "F",
|
||||
string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
|
||||
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||
add_opt(common_arg(
|
||||
{ "--diffusion-visual" },
|
||||
string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
|
||||
|
|
@ -3467,5 +3486,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
[](common_params & params) { params.diffusion.visual_mode = true; }
|
||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||
|
||||
add_opt(common_arg(
|
||||
{ "--diffusion-eps" }, "F",
|
||||
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
|
||||
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||
add_opt(common_arg(
|
||||
{ "--diffusion-algorithm" }, "N",
|
||||
string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
|
||||
params.diffusion.algorithm),
|
||||
[](common_params & params, int value) { params.diffusion.algorithm = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||
add_opt(common_arg(
|
||||
{ "--diffusion-alg-temp" }, "F",
|
||||
string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
|
||||
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||
|
||||
add_opt(common_arg(
|
||||
{ "--diffusion-block-length" }, "N",
|
||||
string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
|
||||
[](common_params & params, int value) { params.diffusion.block_length = value; }
|
||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||
add_opt(common_arg(
|
||||
{ "--diffusion-cfg-scale" }, "F",
|
||||
string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
|
||||
[](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||
add_opt(common_arg(
|
||||
{ "--diffusion-add-gumbel-noise" }, "F",
|
||||
string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
|
||||
[](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
|
||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||
|
||||
|
||||
return ctx_arg;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1646,7 +1646,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
|
|||
"|<function name=\"([^\"]+)\">" // match 5 (function name again)
|
||||
);
|
||||
|
||||
if (auto res = builder.try_find_regex(open_regex)) {
|
||||
while (auto res = builder.try_find_regex(open_regex)) {
|
||||
const auto & block_start = res->groups[1];
|
||||
std::string block_end = block_start.empty() ? "" : "```";
|
||||
|
||||
|
|
@ -1668,7 +1668,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
|
|||
builder.consume_literal(block_end);
|
||||
builder.consume_spaces();
|
||||
}
|
||||
builder.add_content(builder.consume_rest());
|
||||
} else {
|
||||
throw common_chat_msg_partial_exception("failed to parse tool call");
|
||||
}
|
||||
|
|
@ -1693,11 +1692,10 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
|
|||
builder.consume_spaces();
|
||||
}
|
||||
}
|
||||
builder.add_content(builder.consume_rest());
|
||||
}
|
||||
} else {
|
||||
builder.add_content(builder.consume_rest());
|
||||
}
|
||||
|
||||
builder.add_content(builder.consume_rest());
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
|
|
@ -1944,6 +1942,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
|
|||
}
|
||||
}
|
||||
auto msg = builder.result();
|
||||
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
|
||||
if (!is_partial) {
|
||||
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
|
||||
}
|
||||
return msg;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1122,6 +1122,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
|||
mparams.use_mmap = params.use_mmap;
|
||||
mparams.use_mlock = params.use_mlock;
|
||||
mparams.check_tensors = params.check_tensors;
|
||||
mparams.use_extra_bufts = !params.no_extra_bufts;
|
||||
|
||||
if (params.kv_overrides.empty()) {
|
||||
mparams.kv_overrides = NULL;
|
||||
|
|
|
|||
|
|
@ -201,6 +201,7 @@ struct common_params_speculative {
|
|||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
||||
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
|
||||
|
||||
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
||||
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
||||
|
|
@ -220,11 +221,17 @@ struct common_params_vocoder {
|
|||
};
|
||||
|
||||
struct common_params_diffusion {
|
||||
int32_t steps = 64; // number of diffusion steps
|
||||
float eps = 1e-3f; // epsilon for timesteps
|
||||
int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
|
||||
float alg_temp = 0.0f; // algorithm temperature
|
||||
bool visual_mode = false; // show progressive diffusion on screen
|
||||
int32_t steps = 128;
|
||||
bool visual_mode = false;
|
||||
|
||||
float eps = 0; // epsilon for timesteps
|
||||
int32_t block_length = 0; // block length for generation
|
||||
|
||||
int32_t algorithm = 4; // default algorithm: low-confidence
|
||||
float alg_temp = 0.0f; // algorithm temperature
|
||||
|
||||
float cfg_scale = 0; // classifier-free guidance scale
|
||||
bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
|
||||
};
|
||||
|
||||
enum common_reasoning_format {
|
||||
|
|
@ -352,6 +359,7 @@ struct common_params {
|
|||
bool warmup = true; // warmup run
|
||||
bool check_tensors = false; // validate tensor data
|
||||
bool no_op_offload = false; // globally disable offload host tensor operations to device
|
||||
bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
|
||||
|
||||
bool single_turn = false; // single turn chat conversation
|
||||
|
||||
|
|
@ -431,6 +439,7 @@ struct common_params {
|
|||
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
|
||||
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
|
||||
int32_t i_chunk = 0; // start processing from this chunk
|
||||
bool imat_dat = false; // whether the legacy imatrix.dat format should be output
|
||||
|
||||
bool process_output = false; // collect data for the output tensor
|
||||
bool compute_ppl = true; // whether to compute perplexity
|
||||
|
|
|
|||
|
|
@ -1,30 +1,39 @@
|
|||
#include "speculative.h"
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
|
||||
struct common_speculative {
|
||||
struct llama_context * ctx;
|
||||
struct llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
|
||||
struct llama_context * ctx_dft;
|
||||
struct common_sampler * smpl;
|
||||
|
||||
llama_batch batch;
|
||||
llama_tokens prompt;
|
||||
llama_tokens prompt_dft;
|
||||
bool vocab_dft_compatible = true; // whether retokenization is needed
|
||||
std::map<std::string, std::string> tgt_dft_replacements = {};
|
||||
};
|
||||
|
||||
struct common_speculative * common_speculative_init(
|
||||
struct llama_context * ctx_tgt,
|
||||
struct llama_context * ctx_dft) {
|
||||
auto * result = new common_speculative {
|
||||
/* .ctx = */ ctx_dft,
|
||||
/* .smpl = */ nullptr,
|
||||
/* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
|
||||
/* .prompt = */ {},
|
||||
/* .ctx_tgt = */ ctx_tgt,
|
||||
/* .ctx_dft = */ ctx_dft,
|
||||
/* .smpl = */ nullptr,
|
||||
/* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
|
||||
/* .prompt_dft = */ {},
|
||||
/* .vocab_dft_compatible = */ false,
|
||||
};
|
||||
|
||||
// TODO: optimize or pass from outside?
|
||||
|
|
@ -59,6 +68,9 @@ struct common_speculative * common_speculative_init(
|
|||
}
|
||||
#endif
|
||||
|
||||
result->vocab_dft_compatible = common_speculative_are_compatible(ctx_tgt, ctx_dft);
|
||||
LOG_DBG("vocab_dft_compatible = %d\n", result->vocab_dft_compatible);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
@ -75,8 +87,8 @@ void common_speculative_free(struct common_speculative * spec) {
|
|||
}
|
||||
|
||||
bool common_speculative_are_compatible(
|
||||
const struct llama_context * ctx_tgt,
|
||||
const struct llama_context * ctx_dft) {
|
||||
const struct llama_context * ctx_tgt,
|
||||
const struct llama_context * ctx_dft) {
|
||||
const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
|
||||
const struct llama_model * model_dft = llama_get_model(ctx_dft);
|
||||
|
||||
|
|
@ -90,31 +102,32 @@ bool common_speculative_are_compatible(
|
|||
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
|
||||
|
||||
if (vocab_type_tgt != vocab_type_dft) {
|
||||
LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
|
||||
"vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
|
||||
LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
|
||||
LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
|
||||
if (
|
||||
llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
|
||||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
|
||||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
|
||||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
|
||||
LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
|
||||
LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
|
||||
LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
|
||||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
|
||||
) {
|
||||
LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
{
|
||||
const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
|
||||
const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
|
||||
|
||||
const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
|
||||
const int vocab_diff = n_vocab_tgt > n_vocab_dft
|
||||
? n_vocab_tgt - n_vocab_dft
|
||||
: n_vocab_dft - n_vocab_tgt;
|
||||
|
||||
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
|
||||
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
|
||||
"target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
|
||||
__func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
|
||||
LOG_DBG("%s: draft model vocab must closely match target model to use speculation but ", __func__);
|
||||
LOG_DBG("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
|
||||
n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -122,8 +135,8 @@ bool common_speculative_are_compatible(
|
|||
const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
|
||||
const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
|
||||
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
|
||||
LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but "
|
||||
"token %d content differs - target '%s', draft '%s'\n", __func__, i,
|
||||
LOG_DBG("%s: draft model vocab must match target model to use speculation but ", __func__);
|
||||
LOG_DBG("token %d content differs - target '%s', draft '%s'\n", i,
|
||||
common_token_to_piece(ctx_tgt, i).c_str(),
|
||||
common_token_to_piece(ctx_dft, i).c_str());
|
||||
return false;
|
||||
|
|
@ -134,32 +147,93 @@ bool common_speculative_are_compatible(
|
|||
return true;
|
||||
}
|
||||
|
||||
void common_speculative_add_replacement_tgt_dft(
|
||||
struct common_speculative * spec,
|
||||
const char *source, const char *dest) {
|
||||
spec->tgt_dft_replacements[source] = dest;
|
||||
}
|
||||
|
||||
static std::string replace_to_dft(
|
||||
struct common_speculative * spec,
|
||||
const std::string& input) {
|
||||
std::string result = input;
|
||||
for (const auto & pair : spec->tgt_dft_replacements) {
|
||||
size_t pos = result.find(pair.first);
|
||||
while (pos != std::string::npos) {
|
||||
result.replace(pos, pair.first.length(), pair.second);
|
||||
pos = result.find(pair.first, pos + pair.second.length());
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string replace_to_tgt(
|
||||
struct common_speculative * spec,
|
||||
const std::string& input) {
|
||||
std::string result = input;
|
||||
for (const auto& pair : spec->tgt_dft_replacements) {
|
||||
size_t pos = result.find(pair.second);
|
||||
while (pos != std::string::npos) {
|
||||
result.replace(pos, pair.second.length(), pair.first);
|
||||
pos = result.find(pair.second, pos + pair.first.length());
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
llama_tokens common_speculative_gen_draft(
|
||||
struct common_speculative * spec,
|
||||
struct common_speculative_params params,
|
||||
const llama_tokens & prompt_tgt,
|
||||
const llama_tokens & prompt_tgt_main_model, // specified in target model vocab
|
||||
llama_token id_last) {
|
||||
auto & batch = spec->batch;
|
||||
auto & ctx = spec->ctx;
|
||||
auto & ctx_tgt = spec->ctx_tgt;
|
||||
auto & ctx_dft = spec->ctx_dft;
|
||||
auto & smpl = spec->smpl;
|
||||
auto & prompt = spec->prompt;
|
||||
auto & prompt_dft = spec->prompt_dft;
|
||||
|
||||
auto * mem = llama_get_memory(ctx);
|
||||
auto * mem_dft = llama_get_memory(ctx_dft);
|
||||
|
||||
int reuse_i = 0;
|
||||
int reuse_n = 0;
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
|
||||
const int n_ctx = llama_n_ctx(ctx_dft) - params.n_draft;
|
||||
|
||||
llama_tokens prompt_tgt_draft_model;
|
||||
if (!spec->vocab_dft_compatible) {
|
||||
std::string text;
|
||||
text = common_detokenize(ctx_tgt, prompt_tgt_main_model, true);
|
||||
text = replace_to_dft(spec, text);
|
||||
LOG_DBG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
|
||||
prompt_tgt_draft_model = common_tokenize(ctx_dft, text, false, true);
|
||||
|
||||
// convert id_last to draft vocab. llama_detokenize is called directly to avoid an allocation
|
||||
const auto * model_tgt = llama_get_model(ctx_tgt);
|
||||
const auto * vocab_tgt = llama_model_get_vocab(model_tgt);
|
||||
|
||||
int32_t n_chars = llama_detokenize(vocab_tgt, &id_last, 1, nullptr, 0, false, false);
|
||||
GGML_ASSERT(n_chars < 0 && "failed to detokenize id_last");
|
||||
text.resize(-n_chars);
|
||||
llama_detokenize(vocab_tgt, &id_last, 1, text.data(), text.size(), false, false);
|
||||
text = replace_to_dft(spec, text);
|
||||
|
||||
LOG_DBG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
|
||||
id_last = common_tokenize(ctx_dft, text, false, true)[0];
|
||||
}
|
||||
// prompt_tgt's tokens will always be compatible with ctx_dft
|
||||
const llama_tokens &prompt_tgt =
|
||||
spec->vocab_dft_compatible ? prompt_tgt_main_model : prompt_tgt_draft_model;
|
||||
|
||||
const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
|
||||
|
||||
// reuse as much as possible from the old draft context
|
||||
// ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
|
||||
for (int i = 0; i < (int) prompt.size(); ++i) {
|
||||
for (int i = 0; i < (int) prompt_dft.size(); ++i) {
|
||||
int cur = 0;
|
||||
while (i_start + cur < (int) prompt_tgt.size() &&
|
||||
i + cur < (int) prompt.size() &&
|
||||
prompt_tgt[i_start + cur] == prompt[i + cur]) {
|
||||
i + cur < (int) prompt_dft.size() &&
|
||||
prompt_tgt[i_start + cur] == prompt_dft[i + cur]) {
|
||||
cur++;
|
||||
}
|
||||
|
||||
|
|
@ -169,21 +243,20 @@ llama_tokens common_speculative_gen_draft(
|
|||
}
|
||||
}
|
||||
|
||||
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
|
||||
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
|
||||
|
||||
llama_tokens result;
|
||||
result.reserve(params.n_draft);
|
||||
|
||||
if (reuse_n == 0) {
|
||||
llama_memory_clear(mem, false);
|
||||
|
||||
prompt.clear();
|
||||
llama_memory_clear(mem_dft, false);
|
||||
prompt_dft.clear();
|
||||
} else {
|
||||
// this happens when a previous draft has been discarded (for example, due to being too small), but the
|
||||
// target model agreed with it. in this case, we simply pass back the previous results to save compute
|
||||
if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
|
||||
for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
|
||||
result.push_back(prompt[i]);
|
||||
if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
|
||||
for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
|
||||
result.push_back(prompt_dft[i]);
|
||||
|
||||
if (params.n_draft <= (int) result.size()) {
|
||||
break;
|
||||
|
|
@ -194,16 +267,15 @@ llama_tokens common_speculative_gen_draft(
|
|||
}
|
||||
|
||||
if (reuse_i > 0) {
|
||||
llama_memory_seq_rm (mem, 0, 0, reuse_i);
|
||||
llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
|
||||
llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
|
||||
llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
|
||||
|
||||
prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
|
||||
prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
|
||||
}
|
||||
|
||||
if (reuse_n < (int) prompt.size()) {
|
||||
llama_memory_seq_rm (mem, 0, reuse_n, -1);
|
||||
|
||||
prompt.erase(prompt.begin() + reuse_n, prompt.end());
|
||||
if (reuse_n < (int) prompt_dft.size()) {
|
||||
llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
|
||||
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -214,28 +286,28 @@ llama_tokens common_speculative_gen_draft(
|
|||
//LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
|
||||
common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
|
||||
|
||||
prompt.push_back(prompt_tgt[i]);
|
||||
prompt_dft.push_back(prompt_tgt[i]);
|
||||
}
|
||||
|
||||
// we should rarely end-up here during normal decoding
|
||||
if (batch.n_tokens > 0) {
|
||||
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
|
||||
|
||||
llama_decode(ctx, batch);
|
||||
llama_decode(ctx_dft, batch);
|
||||
}
|
||||
|
||||
const llama_pos n_past = prompt.size();
|
||||
const llama_pos n_past = prompt_dft.size();
|
||||
|
||||
LOG_DBG("%s: n_past = %d\n", __func__, n_past);
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add (batch, id_last, n_past, { 0 }, true);
|
||||
|
||||
prompt.push_back(id_last);
|
||||
prompt_dft.push_back(id_last);
|
||||
|
||||
//LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
|
||||
LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
|
||||
|
||||
llama_decode(ctx, batch);
|
||||
llama_decode(ctx_dft, batch);
|
||||
|
||||
common_sampler_reset(smpl);
|
||||
|
||||
|
|
@ -243,13 +315,13 @@ llama_tokens common_speculative_gen_draft(
|
|||
for (int i = 0; i < params.n_draft; ++i) {
|
||||
common_batch_clear(batch);
|
||||
|
||||
common_sampler_sample(smpl, ctx, 0, true);
|
||||
common_sampler_sample(smpl, ctx_dft, 0, true);
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(smpl);
|
||||
|
||||
for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
|
||||
LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||
k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
|
||||
k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
|
||||
}
|
||||
|
||||
// add drafted token for each sequence
|
||||
|
|
@ -271,10 +343,19 @@ llama_tokens common_speculative_gen_draft(
|
|||
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
|
||||
|
||||
// evaluate the drafted tokens on the draft model
|
||||
llama_decode(ctx, batch);
|
||||
llama_decode(ctx_dft, batch);
|
||||
|
||||
prompt.push_back(id);
|
||||
prompt_dft.push_back(id);
|
||||
}
|
||||
|
||||
if (!spec->vocab_dft_compatible) {
|
||||
std::string detokenized = common_detokenize(ctx_dft, result, true);
|
||||
detokenized = replace_to_tgt(spec, detokenized);
|
||||
LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
|
||||
result = common_tokenize(ctx_tgt, detokenized, false, true);
|
||||
if (result.size() > (size_t)params.n_draft) {
|
||||
result.resize(params.n_draft);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,7 +12,10 @@ struct common_speculative_params {
|
|||
float p_min = 0.75f; // min probability required to accept a token in the draft
|
||||
};
|
||||
|
||||
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
|
||||
struct common_speculative * common_speculative_init(
|
||||
struct llama_context * ctx_tgt,
|
||||
struct llama_context * ctx_dft
|
||||
);
|
||||
|
||||
void common_speculative_free(struct common_speculative * spec);
|
||||
|
||||
|
|
@ -20,6 +23,10 @@ bool common_speculative_are_compatible(
|
|||
const struct llama_context * ctx_tgt,
|
||||
const struct llama_context * ctx_dft);
|
||||
|
||||
void common_speculative_add_replacement_tgt_dft(
|
||||
struct common_speculative * spec,
|
||||
const char *source, const char *dest);
|
||||
|
||||
// sample up to n_draft tokens and add them to the batch using the draft model
|
||||
llama_tokens common_speculative_gen_draft(
|
||||
struct common_speculative * spec,
|
||||
|
|
|
|||
|
|
@ -684,6 +684,9 @@ class TextModel(ModelBase):
|
|||
if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
|
||||
# ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
|
||||
res = "hunyuan"
|
||||
if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6":
|
||||
# ref: https://huggingface.co/tencent/Hunyuan-4B-Instruct
|
||||
res = "hunyuan-dense"
|
||||
if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
|
||||
# ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
|
||||
res = "falcon-h1"
|
||||
|
|
@ -699,6 +702,9 @@ class TextModel(ModelBase):
|
|||
if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
|
||||
# ref: https://huggingface.co/moonshotai/Kimi-K2-Base
|
||||
res = "kimi-k2"
|
||||
if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
|
||||
# ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
|
||||
res = "qwen2"
|
||||
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
||||
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
res = "llama-bpe"
|
||||
|
|
@ -846,6 +852,9 @@ class TextModel(ModelBase):
|
|||
if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
|
||||
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
|
||||
res = "exaone4"
|
||||
if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
|
||||
# ref: https://huggingface.co/JetBrains/Mellum-4b-base
|
||||
res = "mellum"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
|
|
@ -1900,6 +1909,7 @@ class StableLMModel(TextModel):
|
|||
"MixtralForCausalLM",
|
||||
"VLlama3ForCausalLM",
|
||||
"LlavaForConditionalGeneration",
|
||||
"VoxtralForConditionalGeneration",
|
||||
"LlamaModel")
|
||||
class LlamaModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
|
|
@ -1912,6 +1922,11 @@ class LlamaModel(TextModel):
|
|||
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
|
||||
|
||||
def set_vocab(self):
|
||||
path_tekken_json = self.dir_model / "tekken.json"
|
||||
path_tokenizer_json = self.dir_model / "tokenizer.json"
|
||||
if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
|
||||
return self.set_vocab_tekken()
|
||||
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
|
|
@ -1944,6 +1959,52 @@ class LlamaModel(TextModel):
|
|||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
def set_vocab_tekken(self):
|
||||
vocab = gguf.vocab.MistralVocab(self.dir_model)
|
||||
self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
|
||||
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
|
||||
for text, score, toktype in vocab.all_tokens():
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
assert len(tokens) == vocab.vocab_size, (
|
||||
f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
|
||||
)
|
||||
|
||||
if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken:
|
||||
self.gguf_writer.add_tokenizer_pre("tekken")
|
||||
self.gguf_writer.add_token_merges(
|
||||
vocab.extract_vocab_merges_from_model()
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
|
||||
)
|
||||
|
||||
self.gguf_writer.add_bos_token_id(vocab.bos_id)
|
||||
self.gguf_writer.add_eos_token_id(vocab.eos_id)
|
||||
self.gguf_writer.add_unk_token_id(vocab.unk_id)
|
||||
self.gguf_writer.add_pad_token_id(vocab.pad_id)
|
||||
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
self.gguf_writer.add_vocab_size(vocab.vocab_size)
|
||||
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(False)
|
||||
|
||||
script_dir = Path(__file__).parent
|
||||
template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
|
||||
with open(template_path, "r", encoding="utf-8") as f:
|
||||
template = f.read()
|
||||
self.gguf_writer.add_chat_template(template)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
|
|
@ -1971,12 +2032,13 @@ class LlamaModel(TextModel):
|
|||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
is_vision_tensor = "vision_tower" in name \
|
||||
is_multimodal_tensor = "vision_tower" in name \
|
||||
or "vision_model" in name \
|
||||
or "audio_tower" in name \
|
||||
or "model.connector" in name \
|
||||
or "multi_modal_projector" in name
|
||||
|
||||
if is_vision_tensor:
|
||||
if is_multimodal_tensor:
|
||||
return [] # skip vision tensors
|
||||
elif self.hf_arch == "LlamaModel":
|
||||
name = "model." + name
|
||||
|
|
@ -2851,6 +2913,107 @@ class DreamModel(TextModel):
|
|||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("LLaDAModelLM")
|
||||
class LLaDAModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLADA
|
||||
undo_permute = True
|
||||
|
||||
def get_vocab_base(self) -> tuple[list[str], list[int], str]:
|
||||
tokens: list[str] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||
|
||||
vocab_dict = tokenizer.get_vocab()
|
||||
vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
|
||||
assert max(vocab_dict.values()) < vocab_size
|
||||
|
||||
tokpre = self.get_vocab_base_pre(tokenizer)
|
||||
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
# Check if it's a special token - treat special tokens as CONTROL tokens
|
||||
if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder:
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
# Fallback: treat all added vocab as control tokens for special tokens like <|im_start|>
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
return tokens, toktypes, tokpre
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
# LLaDA specific parameters
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self._try_set_pooling_type()
|
||||
|
||||
# Add parameters similar to LlamaModel
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
|
||||
if (rope_dim := hparams.get("head_dim")) is None:
|
||||
n_heads = hparams.get("num_attention_heads", hparams.get("n_heads"))
|
||||
rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
# Set context length for LLaDA
|
||||
context_length = self.hparams.get("max_sequence_length", 4096)
|
||||
self.gguf_writer.add_context_length(context_length)
|
||||
|
||||
# Set embedding length (dimension size)
|
||||
embedding_length = self.hparams.get("d_model", 4096)
|
||||
self.gguf_writer.add_embedding_length(embedding_length)
|
||||
|
||||
# Set feed forward length (MLP hidden size)
|
||||
feed_forward_length = self.hparams.get("mlp_hidden_size", 12288)
|
||||
self.gguf_writer.add_feed_forward_length(feed_forward_length)
|
||||
|
||||
# LLaDA models use non-causal attention for diffusion, similar to Dream
|
||||
self.gguf_writer.add_causal_attention(False)
|
||||
|
||||
# LLaDA models don't shift their logits
|
||||
self.gguf_writer.add_diffusion_shift_logits(False)
|
||||
|
||||
@staticmethod
|
||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
n_head = n_head_kv
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads"))
|
||||
n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads"))
|
||||
|
||||
if self.undo_permute:
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
data_torch = LLaDAModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
data_torch = LLaDAModel.permute(data_torch, n_head, n_kv_head)
|
||||
|
||||
# LLaDA model tensors should be mapped directly since it's the base model
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Ernie4_5_ForCausalLM")
|
||||
class Ernie4_5Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.ERNIE4_5
|
||||
|
|
@ -3791,7 +3954,7 @@ class Plamo2Model(TextModel):
|
|||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
|
||||
self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
|
||||
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
|
||||
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
|
||||
|
||||
# Mamba parameters
|
||||
self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
|
||||
|
|
@ -3802,7 +3965,7 @@ class Plamo2Model(TextModel):
|
|||
self.gguf_writer.add_ssm_group_count(0)
|
||||
|
||||
# MLP feed forward parameters (for attention layers)
|
||||
self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
|
||||
self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
|
|
@ -5899,6 +6062,7 @@ class DeepseekModel(TextModel):
|
|||
|
||||
@ModelBase.register("DeepseekV2ForCausalLM")
|
||||
@ModelBase.register("DeepseekV3ForCausalLM")
|
||||
@ModelBase.register("KimiVLForConditionalGeneration")
|
||||
class DeepseekV2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
||||
|
||||
|
|
@ -6001,6 +6165,13 @@ class DeepseekV2Model(TextModel):
|
|||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# skip vision tensors and remove "language_model." for Kimi-VL
|
||||
if "vision_tower" in name or "multi_modal_projector" in name:
|
||||
return []
|
||||
|
||||
if name.startswith("language_model."):
|
||||
name = name.replace("language_model.", "")
|
||||
|
||||
# rename e_score_correction_bias tensors
|
||||
if name.endswith("e_score_correction_bias"):
|
||||
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
|
||||
|
|
@ -7231,9 +7402,10 @@ class WhisperEncoderModel(MmprojModel):
|
|||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.hparams["hidden_size"] = self.hparams["d_model"]
|
||||
self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
|
||||
self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
|
||||
if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
|
||||
self.hparams["hidden_size"] = self.hparams["d_model"]
|
||||
self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
|
||||
self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
|
@ -7272,9 +7444,21 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
|
|||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
|
||||
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
|
||||
|
||||
|
||||
@ModelBase.register("VoxtralForConditionalGeneration")
|
||||
class VoxtralWhisperEncoderModel(WhisperEncoderModel):
|
||||
has_vision_encoder = False # no vision encoder
|
||||
has_audio_encoder = True
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL)
|
||||
self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size
|
||||
|
||||
|
||||
@ModelBase.register("FalconH1ForCausalLM")
|
||||
class FalconH1Model(Mamba2Model):
|
||||
model_arch = gguf.MODEL_ARCH.FALCON_H1
|
||||
|
|
@ -7386,11 +7570,6 @@ class FalconH1Model(Mamba2Model):
|
|||
class HunYuanMoEModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
# For handling tied embeddings
|
||||
self._tok_embd = None
|
||||
|
||||
def set_vocab(self):
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||
|
|
@ -7484,9 +7663,6 @@ class HunYuanMoEModel(TextModel):
|
|||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name == "model.embed_tokens.weight":
|
||||
self._tok_embd = data_torch.clone()
|
||||
|
||||
if name == "lm_head.weight":
|
||||
if self.hparams.get("tie_word_embeddings", False):
|
||||
logger.info("Skipping tied output layer 'lm_head.weight'")
|
||||
|
|
@ -7531,6 +7707,98 @@ class HunYuanMoEModel(TextModel):
|
|||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("HunYuanDenseV1ForCausalLM")
|
||||
class HunYuanModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
|
||||
|
||||
def set_vocab(self):
|
||||
if (self.dir_model / "tokenizer.json").is_file():
|
||||
self._set_vocab_gpt2()
|
||||
else:
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||
|
||||
# 1. Get the pre-tokenizer identifier hash
|
||||
tokpre = self.get_vocab_base_pre(tokenizer)
|
||||
|
||||
# 2. Reverse-engineer the merges list from mergeable_ranks
|
||||
merges = []
|
||||
vocab = {}
|
||||
mergeable_ranks = tokenizer.mergeable_ranks
|
||||
for token, rank in mergeable_ranks.items():
|
||||
vocab[QwenModel.token_bytes_to_string(token)] = rank
|
||||
if len(token) == 1:
|
||||
continue
|
||||
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
|
||||
if len(merged) == 2:
|
||||
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
|
||||
|
||||
# 3. Generate the tokens and toktypes lists
|
||||
vocab_size = self.hparams["vocab_size"]
|
||||
assert tokenizer.vocab_size == vocab_size
|
||||
special_tokens = tokenizer.special_tokens
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
|
||||
tokens: list[str] = []
|
||||
toktypes: list[int] = []
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
else:
|
||||
token = reverse_vocab[i]
|
||||
tokens.append(token)
|
||||
if i in special_tokens.values():
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
# 4. Write all vocab-related fields to the GGUF writer
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
self.gguf_writer.add_token_merges(merges)
|
||||
|
||||
# 5. Add special tokens and chat templates
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
# FIX for BOS token: Overwrite incorrect id read from config.json
|
||||
if self.hparams['hidden_size'] == 4096:
|
||||
self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
|
||||
# Rope
|
||||
rope_scaling = hparams.get("rope_scaling", {})
|
||||
if rope_scaling.get("type") == "dynamic":
|
||||
# HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
|
||||
# 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
|
||||
alpha = rope_scaling.get("alpha", 50)
|
||||
base = hparams.get("rope_theta", 10000.0)
|
||||
dim = hparams["head_dim"]
|
||||
scaled_base = base * (alpha ** (dim / (dim - 2)))
|
||||
self.gguf_writer.add_rope_freq_base(scaled_base)
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_rope_scaling_factor(1)
|
||||
# There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
|
||||
self.gguf_writer.add_context_length(256 * 1024) # 256k context length
|
||||
|
||||
# if any of our assumptions about the values are wrong, something has changed and this may need to be updated
|
||||
assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
|
||||
"HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name == "lm_head.weight":
|
||||
if self.hparams.get("tie_word_embeddings", False):
|
||||
logger.info("Skipping tied output layer 'lm_head.weight'")
|
||||
return []
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("SmolLM3ForCausalLM")
|
||||
class SmolLM3Model(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.SMOLLM3
|
||||
|
|
@ -7589,6 +7857,88 @@ class LFM2Model(TextModel):
|
|||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("SmallThinkerForCausalLM")
|
||||
class SmallThinkerModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.SMALLTHINKER
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
if (n_experts := self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))) is not None:
|
||||
self.gguf_writer.add_expert_count(n_experts)
|
||||
if (n_experts_used := self.hparams.get("num_experts_per_tok", self.hparams.get("moe_num_active_primary_experts"))) is not None:
|
||||
self.gguf_writer.add_expert_used_count(n_experts_used)
|
||||
if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None:
|
||||
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
|
||||
self.gguf_writer.add_feed_forward_length(moe_intermediate_size)
|
||||
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
|
||||
if (self.hparams.get('moe_primary_router_apply_softmax')):
|
||||
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
|
||||
else:
|
||||
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
|
||||
# YaRN is not enabled by default
|
||||
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
sliding_window_layout = self.hparams.get("sliding_window_layout")
|
||||
if sliding_window_layout:
|
||||
for i in sliding_window_layout:
|
||||
if i != 0:
|
||||
sliding_window = self.hparams.get("sliding_window_size")
|
||||
if sliding_window:
|
||||
self.gguf_writer.add_sliding_window(sliding_window)
|
||||
break
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# process the experts separately
|
||||
if name.find("experts") != -1:
|
||||
n_experts = self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
# merge the experts into a single 3d tensor
|
||||
for w_name in ["down", "gate", "up"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
return tensors
|
||||
else:
|
||||
return []
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
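As a side note, the expert-merging logic above gathers the per-expert `down`/`gate`/`up` matrices and stacks them into one 3D tensor per projection. A minimal sketch of that reshaping with small, hypothetical dimensions:

```python
import torch

# hypothetical sizes only: 4 experts, hidden size 8, expert FFN size 16
n_experts, n_embd, n_ff = 4, 8, 16

# one 2D weight per expert, as stored in the original checkpoint
per_expert_gate = [torch.randn(n_ff, n_embd) for _ in range(n_experts)]

# stack along a new leading expert dimension, mirroring modify_tensors() above
merged = torch.stack(per_expert_gate, dim=0)
print(merged.shape)  # torch.Size([4, 16, 8]) -> a single 3D tensor covering all experts
```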
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -59,6 +59,10 @@ parser.add_argument(
|
|||
"--full", action="store_true",
|
||||
help="download full list of models - make sure you have access to all of them",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--check-missing", action="store_true",
|
||||
help="only check for missing pre-tokenizer hashes",
|
||||
)
|
||||
parser.add_argument(
|
||||
"hf_token",
|
||||
help="optional HF token",
|
||||
|
|
@ -70,6 +74,10 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
|
|||
if hf_token is None:
|
||||
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
|
||||
|
||||
if args.check_missing and args.full:
|
||||
logger.warning("Downloading full list of models requested, ignoring --check-missing!")
|
||||
args.check_missing = False
|
||||
|
||||
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
||||
# will be updated with time - contributions welcome
|
||||
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||
|
|
@ -130,6 +138,7 @@ models = [
|
|||
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
|
||||
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
|
||||
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
|
||||
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
|
||||
]
|
||||
|
||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||
|
|
@ -140,12 +149,14 @@ pre_computed_hashes = [
|
|||
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
|
||||
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
|
||||
{"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
|
||||
{"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
|
||||
# falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
|
||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
|
||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
|
||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
|
||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
|
||||
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
|
||||
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
|
||||
]
|
||||
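For orientation, the `chkhsh` values above are pre-tokenizer fingerprints computed over `CHK_TXT`. A rough sketch of how such a hash can be reproduced locally; the exact hashing is defined in `convert_hf_to_gguf.py:get_vocab_base_pre()`, so treat this as an assumption and verify against that function:

```python
from hashlib import sha256
from transformers import AutoTokenizer

# "path/to/model" is a placeholder for a locally downloaded tokenizer
tokenizer = AutoTokenizer.from_pretrained("path/to/model")

# assumption: the fingerprint is SHA-256 over the string form of the token IDs for CHK_TXT
token_ids = tokenizer.encode(CHK_TXT)  # CHK_TXT as defined above
chkhsh = sha256(str(token_ids).encode()).hexdigest()
print(chkhsh)
```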
|
||||
|
||||
|
|
@ -220,12 +231,13 @@ if not args.full:
|
|||
all_models = models.copy()
|
||||
models = [model for model in all_models if model["name"] not in existing_models]
|
||||
|
||||
logging.info(f"Downloading {len(models)} models...")
|
||||
for model in models:
|
||||
try:
|
||||
download_model(model)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||
if not args.check_missing:
|
||||
logging.info(f"Downloading {len(models)} models...")
|
||||
for model in models:
|
||||
try:
|
||||
download_model(model)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||
|
||||
|
||||
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
|
||||
|
|
|
|||
|
|
@ -310,5 +310,7 @@ Specifies the memory pool management strategy:
|
|||
|
||||
Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.
|
||||
|
||||
## TODO
|
||||
- Support more models and data types.
|
||||
### GGML_CANN_WEIGHT_NZ
|
||||
|
||||
Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
|
||||
|
||||
|
|
|
|||
|
|
@ -97,6 +97,9 @@ NOTE: some models may require large context window, for example: `-c 8192`
|
|||
# Qwen2-Audio and SeaLLM-Audio
|
||||
# note: no pre-quantized GGUF for this model, as it gives very poor results
|
||||
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
|
||||
|
||||
# Mistral's Voxtral
|
||||
(tool_name) -hf ggml-org/Voxtral-Mini-3B-2507-GGUF
|
||||
```
|
||||
|
||||
**Mixed modalities**:
|
||||
|
|
|
|||
|
|
@ -29,8 +29,8 @@ cmake --build build --config Release
|
|||
Convert the PyTorch model to gguf files (you can also download our pre-converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf)):
|
||||
|
||||
```bash
|
||||
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6
|
||||
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
|
||||
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-2_6
|
||||
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --minicpmv_version 4
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
|
||||
|
||||
# quantize int4 version
|
||||
|
|
|
|||
|
|
@ -0,0 +1,47 @@
|
|||
## MiniCPM-o 4
|
||||
|
||||
### Prepare models and code
|
||||
|
||||
Download the [MiniCPM-o-4](https://huggingface.co/openbmb/MiniCPM-o-4) PyTorch model from Hugging Face into the "MiniCPM-o-4" folder.
|
||||
|
||||
|
||||
### Build llama.cpp
|
||||
Readme modification time: 20250206
|
||||
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
Build llama.cpp using `CMake`:
|
||||
```bash
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
|
||||
### Usage of MiniCPM-o 4
|
||||
|
||||
Convert the PyTorch model to gguf files (you can also download our pre-converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-4-gguf)):
|
||||
|
||||
```bash
|
||||
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-4
|
||||
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-4 --minicpmv-projector ../MiniCPM-o-4/minicpmv.projector --output-dir ../MiniCPM-o-4/ --minicpmv_version 6
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-o-4/model
|
||||
|
||||
# quantize int4 version
|
||||
./build/bin/llama-quantize ../MiniCPM-o-4/model/ggml-model-f16.gguf ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
```
|
||||
|
||||
|
||||
Inference on Linux or Mac
|
||||
```bash
|
||||
# run in single-turn mode
|
||||
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# run in conversation mode
|
||||
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf
|
||||
```
|
||||
|
|
@ -28,8 +28,8 @@ cmake --build build --config Release
|
|||
Convert the PyTorch model to gguf files (you can also download our pre-converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf)):
|
||||
|
||||
```bash
|
||||
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
||||
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
|
||||
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
||||
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --minicpmv_version 2
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
|
||||
|
||||
# quantize int4 version
|
||||
|
|
|
|||
|
|
@ -28,8 +28,8 @@ cmake --build build --config Release
|
|||
Convert the PyTorch model to gguf files (you can also download our pre-converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf)):
|
||||
|
||||
```bash
|
||||
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6
|
||||
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
|
||||
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-2_6
|
||||
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --minicpmv_version 3
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
|
||||
|
||||
# quantize int4 version
|
||||
|
|
|
|||
|
|
@ -0,0 +1,47 @@
|
|||
## MiniCPM-V 4
|
||||
|
||||
### Prepare models and code
|
||||
|
||||
Download the [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model from Hugging Face into the "MiniCPM-V-4" folder.
|
||||
|
||||
|
||||
### Build llama.cpp
|
||||
Readme modification time: 20250206
|
||||
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
Build llama.cpp using `CMake`:
|
||||
```bash
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
|
||||
### Usage of MiniCPM-V 4
|
||||
|
||||
Convert the PyTorch model to gguf files (you can also download our pre-converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-4-gguf)):
|
||||
|
||||
```bash
|
||||
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-4
|
||||
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-4 --minicpmv-projector ../MiniCPM-V-4/minicpmv.projector --output-dir ../MiniCPM-V-4/ --minicpmv_version 5
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-V-4/model
|
||||
|
||||
# quantize int4 version
|
||||
./build/bin/llama-quantize ../MiniCPM-V-4/model/ggml-model-f16.gguf ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
```
|
||||
|
||||
|
||||
Inference on Linux or Mac
|
||||
```bash
|
||||
# run in single-turn mode
|
||||
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# run in conversation mode
|
||||
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf
|
||||
```
|
||||
179
docs/ops.md
|
|
@ -2,94 +2,101 @@
|
|||
|
||||
List of GGML operations and backend support status.
|
||||
|
||||
## How to add a backend to this table:
|
||||
|
||||
1. Run `test-backend-ops support --output csv` with your backend name and redirect output to a csv file in `docs/ops/` (e.g., `docs/ops/CUDA.csv`)
|
||||
2. Regenerate `/docs/ops.md` via `./scripts/create_ops_docs.py`
|
||||
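The two steps above can also be chained from a small script. A sketch under the assumptions that it runs from the llama.cpp repository root, that `test-backend-ops` is already built under `build/bin/`, and that backend selection is handled the way the tool expects (no specific flag is implied here):

```python
import subprocess

def regenerate_ops_docs(backend_name: str) -> None:
    # step 1: dump the support matrix for this backend into docs/ops/<backend>.csv
    with open(f"docs/ops/{backend_name}.csv", "w") as f:
        subprocess.run(["build/bin/test-backend-ops", "support", "--output", "csv"],
                       stdout=f, check=True)
    # step 2: rebuild docs/ops.md from the CSV files in docs/ops/
    subprocess.run(["./scripts/create_ops_docs.py"], check=True)

regenerate_ops_docs("CUDA")
```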
|
||||
Legend:
|
||||
- ✅ Fully supported by this backend
|
||||
- 🟡 Partially supported by this backend
|
||||
- ❌ Not supported by this backend
|
||||
|
||||
| Operation | BLAS | CPU | CUDA | Metal |
|
||||
|-----------|------|------|------|------|
|
||||
| ABS | ❌ | ✅ | 🟡 | ❌ |
|
||||
| ACC | ❌ | ✅ | ✅ | ✅ |
|
||||
| ADD | ❌ | ✅ | ✅ | 🟡 |
|
||||
| ADD1 | ❌ | ✅ | ✅ | ❌ |
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | 🟡 |
|
||||
| CONCAT | ❌ | ✅ | 🟡 | ✅ |
|
||||
| CONT | ❌ | ✅ | 🟡 | ✅ |
|
||||
| CONV_2D_DW | ❌ | ✅ | ✅ | ❌ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ✅ | ✅ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | 🟡 |
|
||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ❌ |
|
||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 |
|
||||
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ❌ |
|
||||
| CROSS_ENTROPY_LOSS_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | 🟡 |
|
||||
| DIV | ❌ | ✅ | ✅ | 🟡 |
|
||||
| DUP | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| ELU | ❌ | ✅ | ❌ | 🟡 |
|
||||
| EXP | ❌ | ✅ | 🟡 | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_ERF | ❌ | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_QUICK | ❌ | ✅ | ✅ | 🟡 |
|
||||
| GELU | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| GELU_ERF | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| GELU_QUICK | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| GET_ROWS | ❌ | ✅ | 🟡 | ✅ |
|
||||
| GET_ROWS_BACK | ❌ | 🟡 | 🟡 | ❌ |
|
||||
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ |
|
||||
| HARDSIGMOID | ❌ | ✅ | 🟡 | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | 🟡 | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | 🟡 |
|
||||
| L2_NORM | ❌ | ✅ | ✅ | ✅ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ |
|
||||
| LOG | ❌ | ✅ | ✅ | ❌ |
|
||||
| MEAN | ❌ | ✅ | ✅ | ✅ |
|
||||
| MUL | ❌ | ✅ | ✅ | 🟡 |
|
||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| MUL_MAT_ID | ❌ | ✅ | ✅ | ✅ |
|
||||
| NEG | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| NORM | ❌ | ✅ | ✅ | 🟡 |
|
||||
| OPT_STEP_ADAMW | ❌ | ✅ | ✅ | ❌ |
|
||||
| OUT_PROD | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| PAD | ❌ | ✅ | ✅ | ✅ |
|
||||
| PAD_REFLECT_1D | ❌ | ✅ | ❌ | ✅ |
|
||||
| POOL_2D | ❌ | ✅ | ✅ | ✅ |
|
||||
| REGLU | ❌ | ✅ | ✅ | 🟡 |
|
||||
| RELU | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| REPEAT | ❌ | ✅ | 🟡 | ✅ |
|
||||
| REPEAT_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| RMS_NORM | ❌ | ✅ | ✅ | 🟡 |
|
||||
| RMS_NORM_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| RMS_NORM_MUL | ❌ | ✅ | ✅ | ✅ |
|
||||
| ROPE | ❌ | ✅ | ✅ | ✅ |
|
||||
| ROPE_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| RWKV_WKV6 | ❌ | ✅ | ✅ | ✅ |
|
||||
| RWKV_WKV7 | ❌ | ✅ | ✅ | ✅ |
|
||||
| SCALE | ❌ | ✅ | ✅ | ✅ |
|
||||
| SET | ❌ | ✅ | ❌ | ✅ |
|
||||
| SET_ROWS | ❌ | 🟡 | ❌ | 🟡 |
|
||||
| SGN | ❌ | ✅ | 🟡 | ❌ |
|
||||
| SIGMOID | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| SILU | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| SILU_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| SIN | ❌ | ✅ | ✅ | 🟡 |
|
||||
| SOFT_MAX | ❌ | ✅ | ✅ | ✅ |
|
||||
| SOFT_MAX_BACK | ❌ | 🟡 | 🟡 | ❌ |
|
||||
| SQR | ❌ | ✅ | ✅ | 🟡 |
|
||||
| SQRT | ❌ | ✅ | ✅ | 🟡 |
|
||||
| SSM_CONV | ❌ | ✅ | ✅ | ✅ |
|
||||
| SSM_SCAN | ❌ | ✅ | ✅ | ✅ |
|
||||
| STEP | ❌ | ✅ | 🟡 | ❌ |
|
||||
| SUB | ❌ | ✅ | ✅ | 🟡 |
|
||||
| SUM | ❌ | ✅ | ✅ | ❌ |
|
||||
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ |
|
||||
| SWIGLU | ❌ | ✅ | ✅ | 🟡 |
|
||||
| TANH | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ |
|
||||
| UPSCALE | ❌ | ✅ | ✅ | 🟡 |
|
||||
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan |
|
||||
|-----------|------|------|------|------|------|------|------|------|
|
||||
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 |
|
||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ |
|
||||
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 |
|
||||
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 |
|
||||
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 |
|
||||
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ |
|
||||
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
|
||||
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
||||
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ |
|
||||
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
|
||||
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
|
||||
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ |
|
||||
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
||||
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ |
|
||||
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ |
|
||||
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
||||
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ |
|
||||
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
|
||||
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ |
|
||||
|
|
|
|||
14667 docs/ops/BLAS.csv (file diff suppressed because it is too large)
13883 docs/ops/CPU.csv (file diff suppressed because it is too large)
13883 docs/ops/CUDA.csv (file diff suppressed because it is too large)
14667 docs/ops/Metal.csv (file diff suppressed because it is too large)
(additional large file diffs suppressed)
|
|
@ -0,0 +1,13 @@
|
|||
# Diffusion Text Generation
|
||||
|
||||
This directory contains implementations for Diffusion LLMs (DLLMs).
|
||||
|
||||
More Info:
|
||||
- https://github.com/ggml-org/llama.cpp/pull/14644
|
||||
- https://github.com/ggml-org/llama.cpp/pull/14771
|
||||
|
||||
|
||||
Example using the Dream architecture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual`
|
||||
|
||||
Example using the LLaDA architecture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual`
|
||||
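For intuition about what `--diffusion-steps` and `--diffusion-eps` control, here is a small sketch of the timestep-based transfer schedule; it mirrors the `calculate_transfer_count()` logic that appears later in this diff, and the printed values are illustrative only:

```python
def transfer_fraction(step: int, total_steps: int, eps: float) -> float:
    # fraction of still-masked tokens finalized at this step (timestep-based schedule)
    t = 1.0 - step / total_steps * (1.0 - eps)
    s = 1.0 - (step + 1) / total_steps * (1.0 - eps)
    return 1.0 if step == total_steps - 1 else 1.0 - s / t

for step in (0, 64, 128, 255):
    print(step, round(transfer_fraction(step, total_steps=256, eps=0.001), 4))
```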
|
||||
|
|
@ -5,344 +5,128 @@
|
|||
#include "log.h"
|
||||
|
||||
#include <limits.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
typedef bool (*diffusion_step_callback_t)(int32_t step,
|
||||
int32_t total_steps,
|
||||
const llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
void * user_data);
|
||||
enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 };
|
||||
|
||||
enum diffusion_alg {
|
||||
DIFFUSION_ALG_ORIGIN = 0,
|
||||
DIFFUSION_ALG_MASKGIT_PLUS = 1,
|
||||
DIFFUSION_ALG_TOPK_MARGIN = 2,
|
||||
DIFFUSION_ALG_ENTROPY = 3,
|
||||
// Unified transfer scheduling methods
|
||||
enum transfer_schedule {
|
||||
TIMESTEP_BASED = 0, // Dream-style: (1.0 - s/t) * remaining
|
||||
BLOCK_BASED = 1, // LLaDA-style: process in blocks with get_num_transfer_tokens
|
||||
};
|
||||
|
||||
typedef bool (*diffusion_step_callback_t)(int32_t step,
|
||||
int32_t total_steps,
|
||||
const llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
void * user_data);
|
||||
|
||||
struct diffusion_params {
|
||||
int32_t steps;
|
||||
float eps;
|
||||
float temperature;
|
||||
float top_p;
|
||||
int32_t top_k;
|
||||
llama_token mask_token_id;
|
||||
enum diffusion_alg algorithm;
|
||||
float alg_temp;
|
||||
diffusion_step_callback_t step_callback;
|
||||
void * step_callback_user_data;
|
||||
int32_t seed;
|
||||
int32_t steps = 0;
|
||||
float temperature = 0;
|
||||
llama_token mask_token_id = LLAMA_TOKEN_NULL;
|
||||
diffusion_step_callback_t step_callback = nullptr;
|
||||
void * step_callback_user_data = nullptr;
|
||||
int32_t seed = 0;
|
||||
bool visual_mode = false;
|
||||
bool shift_logits = false; // Shift logits by -1 after decode
|
||||
|
||||
float top_p = 0.;
|
||||
int32_t top_k = 0;
|
||||
|
||||
diffusion_algorithm algorithm = CONFIDENCE_BASED;
|
||||
transfer_schedule schedule = TIMESTEP_BASED;
|
||||
|
||||
float cfg_scale = 0.; // Config scale for classifier-free guidance
|
||||
float eps = 0.; // Timestep scheduling
|
||||
int32_t block_length = 0; // Block size (for block scheduling)
|
||||
float alg_temp = 0; // algorithm temperature (0.0 = deterministic)
|
||||
bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0
|
||||
|
||||
int32_t max_length = 0; // Maximum sequence length
|
||||
};
|
||||
|
||||
|
||||
static diffusion_params diffusion_default_params() {
|
||||
diffusion_params params = {};
|
||||
params.steps = 64;
|
||||
params.eps = 1e-3f;
|
||||
params.temperature = 0.2f;
|
||||
params.top_p = 0.95f;
|
||||
params.top_k = 0;
|
||||
params.mask_token_id = LLAMA_TOKEN_NULL;
|
||||
params.algorithm = DIFFUSION_ALG_ORIGIN;
|
||||
params.alg_temp = 0.0f;
|
||||
params.step_callback = nullptr;
|
||||
params.step_callback_user_data = nullptr;
|
||||
params.seed = 0;
|
||||
return params;
|
||||
}
|
||||
|
||||
static void diffusion_generate(llama_context * ctx,
|
||||
const llama_token * input_tokens,
|
||||
llama_token * output_tokens,
|
||||
int32_t n_input,
|
||||
int32_t max_length,
|
||||
struct diffusion_params params,
|
||||
int32_t & n_generated) {
|
||||
|
||||
n_generated = 0;
|
||||
if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || max_length <= n_input) {
|
||||
return;
|
||||
}
|
||||
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
|
||||
// Initialize with input and pad with mask tokens
|
||||
std::copy(input_tokens, input_tokens + n_input, output_tokens);
|
||||
std::fill(output_tokens + n_input, output_tokens + max_length, params.mask_token_id);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
|
||||
std::vector<float> timesteps(params.steps + 1);
|
||||
for (int32_t i = 0; i <= params.steps; i++) {
|
||||
timesteps[i] = 1.0f - (float) i / params.steps * (1.0f - params.eps);
|
||||
}
|
||||
|
||||
llama_set_causal_attn(ctx, false);
|
||||
|
||||
int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
|
||||
|
||||
std::vector<llama_token_data> candidates(n_vocab);
|
||||
|
||||
std::vector<llama_token_data> conf_candidates;
|
||||
conf_candidates.reserve(max_length);
|
||||
|
||||
std::vector<int32_t> mask_positions;
|
||||
mask_positions.reserve(max_length);
|
||||
|
||||
struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
|
||||
if (params.top_k > 0) {
|
||||
llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
|
||||
}
|
||||
if (params.top_p < 1.0f) {
|
||||
llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
|
||||
}
|
||||
if (params.temperature > 0.0f) {
|
||||
llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
|
||||
}
|
||||
llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
|
||||
|
||||
struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
|
||||
|
||||
llama_batch batch = llama_batch_init(max_length, 0, 1);
|
||||
batch.n_tokens = max_length;
|
||||
|
||||
int64_t total_sampling_time = 0;
|
||||
int64_t total_time = 0;
|
||||
|
||||
int64_t time_start = ggml_time_us();
|
||||
for (int32_t step = 0; step < params.steps; step++) {
|
||||
if (params.step_callback) {
|
||||
if (!params.step_callback(step, params.steps, output_tokens, max_length, params.step_callback_user_data)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < max_length; i++) {
|
||||
batch.token[i] = output_tokens[i];
|
||||
batch.pos[i] = i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id[i][0] = 0;
|
||||
batch.logits[i] = 1;
|
||||
}
|
||||
|
||||
int ret = llama_decode(ctx, batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, step, ret);
|
||||
break;
|
||||
}
|
||||
|
||||
float * raw_logits = llama_get_logits(ctx);
|
||||
if (!raw_logits) {
|
||||
LOG_ERR("%s: failed to get logits at step %d\n", __func__, step);
|
||||
break;
|
||||
}
|
||||
|
||||
auto get_logits_for_pos = [&](int32_t pos) -> const float * {
|
||||
return pos == 0 ? raw_logits : raw_logits + (pos - 1) * n_vocab;
|
||||
};
|
||||
|
||||
int64_t time_start_sampling = ggml_time_us();
|
||||
|
||||
mask_positions.clear();
|
||||
for (int32_t i = 0; i < max_length; i++) {
|
||||
if (output_tokens[i] == params.mask_token_id) {
|
||||
mask_positions.push_back(i);
|
||||
}
|
||||
}
|
||||
|
||||
if (mask_positions.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
float t = timesteps[step];
|
||||
float s = timesteps[step + 1];
|
||||
|
||||
if (params.algorithm == DIFFUSION_ALG_ORIGIN) {
|
||||
float p_transfer = (step < params.steps - 1) ? (1.0f - s / t) : 1.0f;
|
||||
|
||||
for (int32_t pos : mask_positions) {
|
||||
if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
|
||||
const float * pos_logits = get_logits_for_pos(pos);
|
||||
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates[token_id].id = token_id;
|
||||
candidates[token_id].logit = pos_logits[token_id];
|
||||
candidates[token_id].p = 0.0f;
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = {
|
||||
/* .data = */ candidates.data(),
|
||||
/* .size = */ (size_t) n_vocab, // Reset size to full vocab
|
||||
/* .selected = */ -1,
|
||||
/* .sorted = */ false,
|
||||
};
|
||||
|
||||
llama_sampler_apply(sampler, &cur_p);
|
||||
output_tokens[pos] = cur_p.data[cur_p.selected].id;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::vector<std::pair<float, int32_t>> confidences;
|
||||
std::vector<llama_token> sampled_tokens(mask_positions.size());
|
||||
|
||||
for (size_t i = 0; i < mask_positions.size(); i++) {
|
||||
int32_t pos = mask_positions[i];
|
||||
const float * pos_logits = get_logits_for_pos(pos);
|
||||
|
||||
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates[token_id].logit = pos_logits[token_id];
|
||||
candidates[token_id].p = 0.0f;
|
||||
candidates[token_id].id = token_id;
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = {
|
||||
/* .data = */ candidates.data(),
|
||||
/* .size = */ candidates.size(),
|
||||
/* .selected = */ -1,
|
||||
/* .sorted = */ false,
|
||||
};
|
||||
|
||||
llama_sampler_apply(sampler, &cur_p);
|
||||
|
||||
llama_token sampled_token = cur_p.data[cur_p.selected].id;
|
||||
|
||||
float confidence = 0.0f;
|
||||
if (params.algorithm == DIFFUSION_ALG_ENTROPY) {
|
||||
const float epsilon = 1e-10f;
|
||||
for (size_t j = 0; j < cur_p.size; j++) {
|
||||
float prob = cur_p.data[j].p;
|
||||
confidence += prob * logf(prob + epsilon);
|
||||
}
|
||||
} else if (params.algorithm == DIFFUSION_ALG_TOPK_MARGIN) {
|
||||
confidence = cur_p.data[0].p - cur_p.data[1].p;
|
||||
} else {
|
||||
confidence = cur_p.data[cur_p.selected].p;
|
||||
}
|
||||
|
||||
sampled_tokens[i] = sampled_token;
|
||||
confidences.emplace_back(confidence, i);
|
||||
}
|
||||
|
||||
int32_t num_transfer =
|
||||
(step < params.steps - 1) ? (int32_t) (mask_positions.size() * (1.0f - s / t)) : mask_positions.size();
|
||||
|
||||
if (num_transfer > 0) {
|
||||
if (params.alg_temp == 0.0f) {
|
||||
std::partial_sort(confidences.begin(), confidences.begin() + num_transfer, confidences.end(),
|
||||
[](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
|
||||
if (a.first != b.first) {
|
||||
return a.first > b.first;
|
||||
}
|
||||
return a.second < b.second;
|
||||
});
|
||||
} else {
|
||||
conf_candidates.clear();
|
||||
|
||||
for (int32_t pos = 0; pos < max_length; pos++) {
|
||||
float conf_logit = -std::numeric_limits<float>::infinity();
|
||||
|
||||
auto it = std::find(mask_positions.begin(), mask_positions.end(), pos);
|
||||
if (it != mask_positions.end()) {
|
||||
size_t mask_idx = std::distance(mask_positions.begin(), it);
|
||||
conf_logit = confidences[mask_idx].first / params.alg_temp; // Apply temperature scaling
|
||||
}
|
||||
|
||||
conf_candidates.emplace_back(llama_token_data{ pos, conf_logit, 0.0f });
|
||||
}
|
||||
|
||||
llama_token_data_array conf_array = {
|
||||
/* .data = */ conf_candidates.data(),
|
||||
/* .size = */ conf_candidates.size(),
|
||||
/* .selected = */ -1,
|
||||
/* .sorted = */ false,
|
||||
};
|
||||
|
||||
for (int32_t i = 0; i < num_transfer; i++) {
|
||||
// Apply distribution sampler to get selected index
|
||||
llama_sampler_apply(dist_sampler, &conf_array);
|
||||
int selected_idx = conf_array.selected;
|
||||
confidences[i].second = conf_candidates[selected_idx].id;
|
||||
|
||||
conf_candidates[selected_idx].p = 0.0f;
|
||||
conf_array.selected = -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (params.alg_temp == 0.0f) {
|
||||
// Deterministic - use confidence order
|
||||
for (int32_t i = 0; i < num_transfer; i++) {
|
||||
int32_t mask_idx = confidences[i].second;
|
||||
int32_t pos = mask_positions[mask_idx];
|
||||
llama_token token = sampled_tokens[mask_idx];
|
||||
output_tokens[pos] = token;
|
||||
}
|
||||
} else {
|
||||
for (int32_t i = 0; i < num_transfer; i++) {
|
||||
int32_t pos = confidences[i].second;
|
||||
auto it = std::find(mask_positions.begin(), mask_positions.end(), pos);
|
||||
if (it != mask_positions.end()) {
|
||||
int32_t mask_idx = std::distance(mask_positions.begin(), it);
|
||||
output_tokens[pos] = sampled_tokens[mask_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
int64_t time_end_sampling = ggml_time_us();
|
||||
total_sampling_time += time_end_sampling - time_start_sampling;
|
||||
}
|
||||
int64_t time_end = ggml_time_us();
|
||||
total_time += time_end - time_start;
|
||||
|
||||
LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
|
||||
total_time / 1000.0, total_time / 1000.0 / params.steps, total_sampling_time / 1000.0 / params.steps);
|
||||
|
||||
|
||||
llama_batch_free(batch);
|
||||
llama_sampler_free(sampler);
|
||||
llama_sampler_free(dist_sampler);
|
||||
|
||||
n_generated = max_length;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
|
||||
if (!use_chat_template) {
|
||||
return prompt;
|
||||
}
|
||||
|
||||
auto chat_templates = common_chat_templates_init(model, "");
|
||||
|
||||
common_chat_templates_inputs inputs;
|
||||
common_chat_msg user_msg;
|
||||
user_msg.role = "user";
|
||||
user_msg.content = prompt;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.messages.push_back(user_msg);
|
||||
|
||||
auto result = common_chat_templates_apply(chat_templates.get(), inputs);
|
||||
|
||||
return result.prompt;
|
||||
}
|
||||
|
||||
struct callback_data {
|
||||
const common_params_diffusion * diff_params;
|
||||
const llama_vocab * vocab;
|
||||
int32_t n_input;
|
||||
diffusion_params * diff_params;
|
||||
const llama_vocab * vocab;
|
||||
int32_t n_input;
|
||||
};
|
||||
|
||||
static bool diffusion_step_callback(int32_t step,
|
||||
int32_t total_steps,
|
||||
static float calculate_confidence(const llama_token_data_array & cur_p,
|
||||
diffusion_algorithm algorithm,
|
||||
std::mt19937 & rng) {
|
||||
switch (algorithm) {
|
||||
case CONFIDENCE_BASED:
|
||||
return cur_p.data[cur_p.selected].p; // Selected token probability
|
||||
|
||||
case ENTROPY_BASED:
|
||||
{
|
||||
float entropy = 0.0f;
|
||||
const float epsilon = 1e-10f;
|
||||
for (size_t i = 0; i < cur_p.size; i++) {
|
||||
float prob = cur_p.data[i].p;
|
||||
entropy += prob * logf(prob + epsilon);
|
||||
}
|
||||
return -entropy; // Higher entropy = lower confidence
|
||||
}
|
||||
|
||||
case MARGIN_BASED:
|
||||
return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;
|
||||
|
||||
case RANDOM:
|
||||
{
|
||||
std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
|
||||
return uniform(rng); // Random confidence
|
||||
}
|
||||
|
||||
case ORIGIN:
|
||||
return cur_p.data[cur_p.selected].p;
|
||||
|
||||
default:
|
||||
return 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
// Unified transfer count calculation function
|
||||
static int32_t calculate_transfer_count(int32_t step,
|
||||
int32_t total_steps,
|
||||
int32_t remaining_masked,
|
||||
transfer_schedule schedule,
|
||||
float eps,
|
||||
const std::vector<int32_t> & num_transfer_tokens = {}) {
|
||||
switch (schedule) {
|
||||
case TIMESTEP_BASED:
|
||||
{
|
||||
float t = 1.0f - (float) step / total_steps * (1.0f - eps);
|
||||
float s = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
|
||||
float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
|
||||
return (int32_t) (remaining_masked * p_transfer);
|
||||
}
|
||||
|
||||
case BLOCK_BASED:
|
||||
if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
|
||||
return num_transfer_tokens[step];
|
||||
}
|
||||
return remaining_masked / (total_steps - step); // Fallback
|
||||
|
||||
default:
|
||||
return remaining_masked / (total_steps - step);
|
||||
}
|
||||
}
|
||||
|
||||
static bool diffusion_step_callback(int32_t step,
|
||||
int32_t total_steps,
|
||||
const llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
void * user_data) {
|
||||
(void)user_data;
|
||||
int32_t n_tokens,
|
||||
void * user_data) {
|
||||
(void) user_data;
|
||||
|
||||
callback_data * data = static_cast<callback_data *>(user_data);
|
||||
|
||||
|
|
@ -350,11 +134,11 @@ static bool diffusion_step_callback(int32_t step,
|
|||
int progress_percent = (step * 100) / total_steps;
|
||||
int progress_bars = (step * 50) / total_steps;
|
||||
LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%",
|
||||
step,
|
||||
total_steps,
|
||||
std::string(progress_bars, '=').c_str(),
|
||||
std::string(50 - progress_bars, ' ').c_str(),
|
||||
progress_percent);
|
||||
step,
|
||||
total_steps,
|
||||
std::string(progress_bars, '=').c_str(),
|
||||
std::string(50 - progress_bars, ' ').c_str(),
|
||||
progress_percent);
|
||||
};
|
||||
|
||||
if (data->diff_params->visual_mode) {
|
||||
|
|
@ -391,6 +175,360 @@ static bool diffusion_step_callback(int32_t step,
|
|||
return true;
|
||||
}
|
||||
|
||||
static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
|
||||
if (temperature == 0.0f) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::uniform_real_distribution<double> uniform(0.0, 1.0);
|
||||
for (int32_t i = 0; i < n_vocab; i++) {
|
||||
double noise = uniform(rng);
|
||||
// Prevent log(0)
|
||||
noise = std::max(noise, 1e-20);
|
||||
double gumbel_noise = std::pow(-std::log(noise), temperature);
|
||||
logits[i] = std::exp(logits[i]) / gumbel_noise;
|
||||
}
|
||||
}
|
||||
|
||||
static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
|
||||
std::vector<int32_t> num_transfer_tokens(steps);
|
||||
|
||||
int32_t base = mask_count / steps;
|
||||
int32_t remainder = mask_count % steps;
|
||||
|
||||
for (int32_t i = 0; i < steps; i++) {
|
||||
num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
|
||||
}
|
||||
|
||||
return num_transfer_tokens;
|
||||
}
|
||||
|
||||
static void diffusion_generate(llama_context * ctx,
|
||||
const llama_token * input_tokens,
|
||||
llama_token * output_tokens,
|
||||
int32_t n_input,
|
||||
const diffusion_params & params,
|
||||
int32_t & n_generated) {
|
||||
n_generated = 0;
|
||||
if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
|
||||
return;
|
||||
}
|
||||
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
|
||||
// Initialize with input and pad with mask tokens
|
||||
std::copy(input_tokens, input_tokens + n_input, output_tokens);
|
||||
std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
|
||||
llama_set_causal_attn(ctx, false);
|
||||
|
||||
int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
|
||||
|
||||
std::vector<llama_token_data> candidates(n_vocab);
|
||||
std::vector<llama_token_data> conf_candidates;
|
||||
conf_candidates.reserve(params.max_length);
|
||||
std::vector<int32_t> mask_positions;
|
||||
mask_positions.reserve(params.max_length);
|
||||
|
||||
// Setup sampler chain
|
||||
struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
|
||||
if (params.top_k > 0) {
|
||||
llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
|
||||
}
|
||||
if (params.top_p < 1.0f) {
|
||||
llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
|
||||
}
|
||||
if (params.temperature > 0.0f) {
|
||||
llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
|
||||
}
|
||||
llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
|
||||
|
||||
struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
|
||||
|
||||
llama_batch batch = llama_batch_init(params.max_length, 0, 1);
|
||||
batch.n_tokens = params.max_length;
|
||||
|
||||
// Pre-allocate buffers for CFG if needed
|
||||
int32_t logits_size = n_vocab * params.max_length;
|
||||
std::vector<float> cond_logits_buffer;
|
||||
std::vector<llama_token> un_x_buffer;
|
||||
if (params.cfg_scale > 0.0f) {
|
||||
cond_logits_buffer.resize(logits_size);
|
||||
un_x_buffer.resize(params.max_length);
|
||||
}
|
||||
|
||||
// For block-based processing
|
||||
std::vector<int32_t> num_transfer_tokens;
|
||||
int32_t num_blocks = 1;
|
||||
int32_t steps_per_block = params.steps;
|
||||
|
||||
if (params.schedule == BLOCK_BASED) {
|
||||
GGML_ASSERT(params.max_length % params.block_length == 0);
|
||||
num_blocks = params.max_length / params.block_length;
|
||||
GGML_ASSERT(params.steps % num_blocks == 0);
|
||||
steps_per_block = params.steps / num_blocks;
|
||||
}
|
||||
|
||||
std::vector<float> confidence(params.max_length);
|
||||
|
||||
int64_t total_sampling_time = 0;
|
||||
int64_t total_time = 0;
|
||||
int64_t time_start = ggml_time_us();
|
||||
|
||||
for (int block_num = 0; block_num < num_blocks; block_num++) {
|
||||
int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
|
||||
int32_t block_end = (params.schedule == BLOCK_BASED) ?
|
||||
std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
|
||||
params.max_length;
|
||||
|
||||
// Count masked tokens in current block for block-based processing
|
||||
if (params.schedule == BLOCK_BASED) {
|
||||
int32_t block_mask_count = 0;
|
||||
for (int i = block_start; i < block_end; i++) {
|
||||
if (output_tokens[i] == params.mask_token_id) {
|
||||
block_mask_count++;
|
||||
}
|
||||
}
|
||||
num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
|
||||
}
|
||||
|
||||
for (int32_t step = 0; step < steps_per_block; step++) {
|
||||
int32_t global_step = block_num * steps_per_block + step;
|
||||
|
||||
if (params.step_callback) {
|
||||
if (!params.step_callback(
|
||||
global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Setup batch
|
||||
for (int32_t i = 0; i < params.max_length; i++) {
|
||||
batch.token[i] = output_tokens[i];
|
||||
batch.pos[i] = i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id[i][0] = 0;
|
||||
batch.logits[i] = 1;
|
||||
}
|
||||
|
||||
float * logits = nullptr;
|
||||
|
||||
if (params.cfg_scale > 0.0f) {
|
||||
int ret = llama_decode(ctx, batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("Failed to generate conditional");
|
||||
break;
|
||||
}
|
||||
float * cond_logits_ptr = llama_get_logits(ctx);
|
||||
std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));
|
||||
|
||||
// Unconditional generation (mask input)
|
||||
std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
|
||||
for (int32_t i = 0; i < n_input; i++) {
|
||||
un_x_buffer[i] = params.mask_token_id;
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < params.max_length; i++) {
|
||||
batch.token[i] = un_x_buffer[i];
|
||||
}
|
||||
ret = llama_decode(ctx, batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("Failed to generate unconditional");
|
||||
break;
|
||||
}
|
||||
float * uncond_logits = llama_get_logits(ctx);
|
||||
|
||||
// Apply CFG
|
||||
for (int32_t i = 0; i < logits_size; i++) {
|
||||
cond_logits_buffer[i] =
|
||||
uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
|
||||
}
|
||||
logits = cond_logits_buffer.data();
|
||||
} else {
|
||||
int ret = llama_decode(ctx, batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
|
||||
break;
|
||||
}
|
||||
logits = llama_get_logits(ctx);
|
||||
}
|
||||
|
||||
if (!logits) {
|
||||
LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
|
||||
break;
|
||||
}
|
||||
|
||||
auto get_logits_for_pos = [&](int32_t pos) -> const float * {
|
||||
if (params.shift_logits) {
|
||||
return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
|
||||
}
|
||||
return logits + pos * n_vocab;
|
||||
};
|
||||
|
||||
int64_t time_start_sampling = ggml_time_us();
|
||||
|
||||
mask_positions.clear();
|
||||
for (int32_t i = 0; i < params.max_length; i++) {
|
||||
if (output_tokens[i] == params.mask_token_id) {
|
||||
// For block-based, only consider current block
|
||||
if (params.schedule != BLOCK_BASED || (i >= block_start && i < block_end)) {
|
||||
mask_positions.push_back(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (mask_positions.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (params.add_gumbel_noise && params.temperature > 0.0f) {
|
||||
add_gumbel_noise(logits, n_vocab, params.temperature, rng);
|
||||
}
|
||||
|
||||
if (params.algorithm == ORIGIN) {
|
||||
int32_t transfer_count = calculate_transfer_count(
|
||||
step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
|
||||
float p_transfer = (float) transfer_count / mask_positions.size();
|
||||
|
||||
for (int32_t pos : mask_positions) {
|
||||
if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
|
||||
const float * pos_logits = get_logits_for_pos(pos);
|
||||
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates[token_id].id = token_id;
|
||||
candidates[token_id].logit = pos_logits[token_id];
|
||||
candidates[token_id].p = 0.0f;
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = {
|
||||
candidates.data(),
|
||||
(size_t) n_vocab,
|
||||
-1,
|
||||
false,
|
||||
};
|
||||
|
||||
llama_sampler_apply(sampler, &cur_p);
|
||||
output_tokens[pos] = cur_p.data[cur_p.selected].id;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::vector<std::pair<float, int32_t>> confidences;
|
||||
std::vector<llama_token> sampled_tokens(mask_positions.size());
|
||||
|
||||
for (size_t i = 0; i < mask_positions.size(); i++) {
|
||||
int32_t pos = mask_positions[i];
|
||||
const float * pos_logits = get_logits_for_pos(pos);
|
||||
|
||||
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates[token_id].logit = pos_logits[token_id];
|
||||
candidates[token_id].p = 0.0f;
|
||||
candidates[token_id].id = token_id;
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = {
|
||||
candidates.data(),
|
||||
candidates.size(),
|
||||
-1,
|
||||
false,
|
||||
};
|
||||
|
||||
llama_sampler_apply(sampler, &cur_p);
|
||||
llama_token sampled_token = cur_p.data[cur_p.selected].id;
|
||||
|
||||
float conf = calculate_confidence(cur_p, params.algorithm, rng);
|
||||
|
||||
sampled_tokens[i] = sampled_token;
|
||||
confidences.emplace_back(conf, i);
|
||||
}
|
||||
|
||||
int32_t transfer_count = calculate_transfer_count(
|
||||
step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
|
||||
|
||||
if (transfer_count > 0) {
|
||||
if (params.alg_temp == 0.0f) {
|
||||
std::partial_sort(confidences.begin(),
|
||||
confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
|
||||
confidences.end(),
|
||||
[](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
|
||||
if (a.first != b.first) {
|
||||
return a.first > b.first;
|
||||
}
|
||||
return a.second < b.second;
|
||||
});
|
||||
|
||||
for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
|
||||
int32_t mask_idx = confidences[i].second;
|
||||
int32_t pos = mask_positions[mask_idx];
|
||||
output_tokens[pos] = sampled_tokens[mask_idx];
|
||||
}
|
||||
} else {
|
||||
conf_candidates.clear();
|
||||
for (size_t i = 0; i < confidences.size(); i++) {
|
||||
float conf_logit = confidences[i].first / params.alg_temp;
|
||||
conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
|
||||
}
|
||||
|
||||
llama_token_data_array conf_array = {
|
||||
conf_candidates.data(),
|
||||
conf_candidates.size(),
|
||||
-1,
|
||||
false,
|
||||
};
|
||||
|
||||
for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
|
||||
llama_sampler_apply(dist_sampler, &conf_array);
|
||||
int32_t selected_idx = conf_array.selected;
|
||||
int32_t mask_idx = selected_idx;
|
||||
int32_t pos = mask_positions[mask_idx];
|
||||
output_tokens[pos] = sampled_tokens[mask_idx];
|
||||
|
||||
conf_candidates[selected_idx].p = 0.0f;
|
||||
conf_array.selected = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int64_t time_end_sampling = ggml_time_us();
|
||||
total_sampling_time += time_end_sampling - time_start_sampling;
|
||||
}
|
||||
}
|
||||
|
||||
int64_t time_end = ggml_time_us();
|
||||
total_time += time_end - time_start;
|
||||
|
||||
LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
|
||||
total_time / 1000.0,
|
||||
total_time / 1000.0 / params.steps,
|
||||
total_sampling_time / 1000.0 / params.steps);
|
||||
|
||||
llama_batch_free(batch);
|
||||
llama_sampler_free(sampler);
|
||||
llama_sampler_free(dist_sampler);
|
||||
|
||||
n_generated = params.max_length;
|
||||
}
|
||||
|
||||
static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
|
||||
if (!use_chat_template) {
|
||||
return prompt;
|
||||
}
|
||||
|
||||
auto chat_templates = common_chat_templates_init(model, "");
|
||||
|
||||
common_chat_templates_inputs inputs;
|
||||
common_chat_msg user_msg;
|
||||
user_msg.role = "user";
|
||||
user_msg.content = prompt;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.messages.push_back(user_msg);
|
||||
|
||||
    auto result = common_chat_templates_apply(chat_templates.get(), inputs);

    return result.prompt;
}

int main(int argc, char ** argv) {
    ggml_time_init();

@@ -400,11 +538,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

    const char * alg_names[] = { "ORIGIN", "MASKGIT_PLUS", "TOPK_MARGIN", "ENTROPY" };
    const char * alg_name = (params.diffusion.algorithm >= 0 && params.diffusion.algorithm <= 3) ?
        alg_names[params.diffusion.algorithm] :
        "UNKNOWN";

    common_init();
    llama_backend_init();

@@ -421,6 +554,12 @@ int main(int argc, char ** argv) {
        return 1;
    }

    if (!llama_model_is_diffusion(model)) {
        LOG_ERR("error: unsupported model for diffusion");
        llama_model_free(model);
        return 1;
    }

    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx = params.n_ctx;
    ctx_params.n_batch = params.n_batch;

@@ -442,10 +581,12 @@ int main(int argc, char ** argv) {
    const llama_vocab * vocab = llama_model_get_vocab(model);
    std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);

    std::vector<llama_token> input_tokens = common_tokenize(vocab, formatted_prompt,
    std::vector<llama_token> input_tokens = common_tokenize(vocab,
                                                            formatted_prompt,
                                                            /*add special tokens*/ true,
                                                            /*parse special*/ true);
    int n_input = input_tokens.size();

    int n_input = input_tokens.size();

    if (n_input >= params.n_ctx) {
        LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);

@@ -454,44 +595,79 @@ int main(int argc, char ** argv) {
        return 1;
    }

    struct diffusion_params ldiff_params = diffusion_default_params();
    ldiff_params.steps = params.diffusion.steps;
    ldiff_params.eps = params.diffusion.eps;
    ldiff_params.temperature = params.sampling.temp;
    ldiff_params.top_p = params.sampling.top_p;
    ldiff_params.top_k = params.sampling.top_k;
    ldiff_params.algorithm = static_cast<enum diffusion_alg>(params.diffusion.algorithm);
    ldiff_params.alg_temp = params.diffusion.alg_temp;
    ldiff_params.seed = params.sampling.seed;

    llama_token mask_token_id = llama_vocab_mask(vocab);
    GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);

    LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id);
    LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", params.diffusion.steps);
    LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", params.diffusion.eps);
    LOG_INF("diffusion_params: - %-25s u32 = %d (%s)\n", "algorithm", params.diffusion.algorithm,
            alg_name);
    LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", params.diffusion.alg_temp);

    ldiff_params.mask_token_id = mask_token_id;

    callback_data cb_data = { &params.diffusion, vocab, n_input };

    ldiff_params.step_callback = diffusion_step_callback;
    ldiff_params.step_callback_user_data = &cb_data;

    int32_t n_generated = 0;
    bool visual_mode = params.diffusion.visual_mode;

    int32_t n_generated = 0;
    std::vector<llama_token> output_tokens(params.n_ubatch);
    diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, params.n_ubatch,
                       ldiff_params, n_generated);

    struct diffusion_params diff_params;

    char shift_logits_str[8];
    if (llama_model_meta_val_str(model, "diffusion.shift_logits", shift_logits_str, sizeof(shift_logits_str)) >= 0) {
        diff_params.shift_logits = (strcmp(shift_logits_str, "true") == 0);
    } else {
        diff_params.shift_logits = true;
    }
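    // A minimal sketch of the metadata lookup above as a reusable lambda. This is
    // illustrative only: it relies solely on the llama_model_meta_val_str call already
    // used here; the lambda name and default argument are not part of the change.
    auto read_bool_meta = [](const llama_model * m, const char * key, bool def) {
        char buf[8];
        if (llama_model_meta_val_str(m, key, buf, sizeof(buf)) >= 0) {
            return strcmp(buf, "true") == 0;  // the value is stored as the string "true"/"false"
        }
        return def;                           // key missing -> keep the caller's default
    };
    // e.g. diff_params.shift_logits = read_bool_meta(model, "diffusion.shift_logits", true);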
    //Use either eps or block length, but not both
    GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0));

    if (params.diffusion.eps) {
        diff_params.schedule = TIMESTEP_BASED;
        diff_params.eps = params.diffusion.eps;
    } else if (params.diffusion.block_length) {
        diff_params.schedule = BLOCK_BASED;
        diff_params.block_length = params.diffusion.block_length;
    }

    diff_params.mask_token_id = mask_token_id;
    diff_params.seed = params.sampling.seed;
    diff_params.temperature = params.sampling.temp;
    diff_params.steps = params.diffusion.steps;
    diff_params.algorithm = static_cast<diffusion_algorithm>(params.diffusion.algorithm);
    diff_params.max_length = params.n_ubatch;
    diff_params.top_p = params.sampling.top_p;
    diff_params.top_k = params.sampling.top_k;
    diff_params.visual_mode = params.diffusion.visual_mode;
    diff_params.add_gumbel_noise = params.diffusion.add_gumbel_noise;

    diff_params.step_callback = diffusion_step_callback;
    callback_data cb_data = { &diff_params, vocab, n_input };
    diff_params.step_callback_user_data = &cb_data;

    const char * alg_names[] = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" };
    const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" };
    const char * alg_name =
        (diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN";
    const char * sched_name =
        (diff_params.schedule >= 0 && diff_params.schedule <= 1) ? sched_names[diff_params.schedule] : "UNKNOWN";

    LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id);
    LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", diff_params.steps);
    LOG_INF("diffusion_params: - %-25s u32 = %d\n", "max_length", diff_params.max_length);
    LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name);
    LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "schedule", diff_params.schedule, sched_name);
    LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "temperature", diff_params.temperature);
    if (diff_params.schedule == TIMESTEP_BASED) {
        LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", diff_params.eps);
        LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", diff_params.alg_temp);
    }
    if (diff_params.schedule == BLOCK_BASED) {
        LOG_INF("diffusion_params: - %-25s u32 = %d\n", "block_length", diff_params.block_length);
        LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", diff_params.cfg_scale);
    }

    diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, diff_params, n_generated);

    if (n_generated > 0) {
        if (params.diffusion.visual_mode) {
        if (visual_mode) {
            //clear screen and move cursor to top-left
            LOG_INF("\033[2J\033[H");
        }

        output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input);
        std::string output_data = common_detokenize(vocab, output_tokens, false);
        LOG_INF("\n%s\n", output_data.c_str());
@@ -81,6 +81,14 @@ int main(int argc, char ** argv) {
    params.embedding = true;

    // if the number of prompts that would be encoded is known in advance, it's more efficient to specify the
    // --parallel argument accordingly. for convenience, if not specified, we fallback to unified KV cache
    // in order to support any number of prompts
    if (params.n_parallel == 1) {
        LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__);
        params.kv_unified = true;
    }

    // utilize the full context
    if (params.n_batch < params.n_ctx) {
        LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
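    // Illustrative invocations of the guidance in the comment above; the binary name and
    // flag spellings (llama-embedding, --parallel, -f) follow the common llama.cpp argument
    // parser and are an assumption here, not something this diff introduces:
    //
    //   llama-embedding -m model.gguf --parallel 4 -f prompts.txt   # prompt count known up front
    //   llama-embedding -m model.gguf -f prompts.txt                # n_parallel == 1 -> unified KV cache fallback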
@@ -15,6 +15,12 @@ int main(int argc, char ** argv) {
        return 1;
    }

    if (params.n_parallel == 1) {
        // the example uses 2 sequences, so when n_parallel == 1, we need to enable unified kv cache
        printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__);
        params.kv_unified = true;
    }

    common_init();

    if (params.n_predict < 0) {
@@ -65,7 +65,7 @@ int main(int argc, char ** argv) {
    ctx_dft = llama_init_dft.context.get();

    if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
        return 1;
        LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str());
    }

    // Tokenize the prompt

@@ -130,7 +130,10 @@ int main(int argc, char ** argv) {
    params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft;
    params_spec.p_min = p_min;

    struct common_speculative * spec = common_speculative_init(ctx_dft);
    struct common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft);
    for (auto &pair : params.speculative.replacements) {
        common_speculative_add_replacement_tgt_dft(spec, pair.first.c_str(), pair.second.c_str());
    }

    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
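    // Hypothetical illustration of the replacement hook above for draft/target models with
    // different vocabularies; the concrete strings are made up, real pairs come from
    // params.speculative.replacements:
    //
    //   common_speculative_add_replacement_tgt_dft(spec, "<|user|>", "<|im_start|>user");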
@@ -174,6 +174,7 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
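# Example configure line exercising the new GGML_HIP_MMQ_MFMA option above (illustrative;
# GGML_HIP itself is the pre-existing backend toggle and is assumed here):
#
#   cmake -B build -DGGML_HIP=ON -DGGML_HIP_MMQ_MFMA=ON
#   cmake --build build --config Release -j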
@@ -34,8 +34,8 @@ if (NOT GGML_SHARED_LIB)

    if (GGML_BLAS)
        find_dependency(BLAS)
        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
        list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
        list(APPEND GGML_BLAS_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
        list(APPEND GGML_BLAS_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
    endif()

    if (GGML_CUDA)
@ -102,89 +102,88 @@ set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
|
|||
#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
||||
|
||||
if(NOT TARGET ggml::ggml)
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
find_library(GGML_LIBRARY ggml
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
add_library(ggml::ggml UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::ggml
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${GGML_LIBRARY}")
|
||||
|
||||
find_library(GGML_BASE_LIBRARY ggml-base
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
add_library(ggml::ggml-base UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::ggml-base
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
||||
|
||||
set(_ggml_all_targets "")
|
||||
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
|
||||
string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
|
||||
string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
|
||||
|
||||
find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
|
||||
find_library(GGML_LIBRARY ggml
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
|
||||
|
||||
add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
add_library(ggml::ggml UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::ggml
|
||||
PROPERTIES
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||
IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
|
||||
INTERFACE_COMPILE_FEATURES c_std_90
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
IMPORTED_LOCATION "${GGML_LIBRARY}")
|
||||
|
||||
string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
|
||||
if(is_cpu_variant)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
|
||||
find_library(GGML_BASE_LIBRARY ggml-base
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
if(GGML_CPU_INTERFACE_LINK_OPTIONS)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
|
||||
endif()
|
||||
add_library(ggml::ggml-base UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::ggml-base
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
||||
|
||||
else()
|
||||
list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
|
||||
set(_ggml_all_targets "")
|
||||
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
|
||||
string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
|
||||
string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
|
||||
|
||||
find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
|
||||
|
||||
add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||
IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
|
||||
INTERFACE_COMPILE_FEATURES c_std_90
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
|
||||
string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
|
||||
if(is_cpu_variant)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
if(GGML_CPU_INTERFACE_LINK_OPTIONS)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
|
||||
endif()
|
||||
|
||||
else()
|
||||
list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
|
||||
INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
list(APPEND _ggml_all_targets ggml::${_ggml_backend})
|
||||
endforeach()
|
||||
list(APPEND _ggml_all_targets ggml::${_ggml_backend})
|
||||
endforeach()
|
||||
|
||||
list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
|
||||
set_target_properties(ggml::ggml
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_INTERFACE_LINK_LIBRARIES}")
|
||||
list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
|
||||
set_target_properties(ggml::ggml
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
add_library(ggml::all INTERFACE IMPORTED)
|
||||
set_target_properties(ggml::all
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
|
||||
add_library(ggml::all INTERFACE IMPORTED)
|
||||
set_target_properties(ggml::all
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
|
||||
|
||||
endif() # TARGET ggml::ggml
|
||||
endif()
|
||||
|
||||
check_required_components(ggml)
@@ -68,6 +68,8 @@
#include <aclnnop/aclnn_grouped_matmul_v3.h>
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
#include <aclnnop/aclnn_zero.h>
#include <aclnnop/aclnn_index_copy.h>
#include <aclnnop/aclnn_index_select.h>
#include <float.h>

#include <cmath>
@@ -1614,50 +1616,97 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
}

/**
 * @brief Performs embedding operation on a 4D tensor using the CANN backend.
 * @brief Performs index select operation on a 4D tensor using the CANN backend.
 *
 * This function extracts slices from the source tensor (`src_buffer`),
 * index tensor (`index`), and destination tensor (`dst`), and performs an
 * embedding operation on them. The embedding operation is applied by iterating
 * over the last two dimensions of the source tensor, creating the necessary
 * tensors for the source, index, and output, and executing the embedding operation.
 * This function applies the `IndexSelect` operation along a specific dimension
 * of the source tensor (`src_buffer`) using the indices from the index tensor (`index`).
 * It iterates over the last two dimensions of the source tensor, creates the corresponding
 * CANN tensors for the source, index, and output slices, and executes the `IndexSelect`
 * operation for each slice.
 *
 * @param ctx The context for CANN backend operations.
 * @param src_buffer The source buffer holding the data for the source tensor.
 * @param src_buffer The source buffer containing the 4D input tensor data.
 * @param src_ne The dimensions of the source tensor.
 * @param src_nb The strides (byte offsets) of the source tensor.
 * @param index The index tensor used in the embedding operation.
 * @param dst The destination tensor where the result will be stored.
 * @param dst_buffer The destination buffer where the output tensor data will be written.
 * @param dst_ne The dimensions of the destination tensor.
 * @param dst_nb The strides (byte offsets) of the destination tensor.
 * @param index The index tensor specifying the indices to select from the source tensor.
 * @param type The data type of the source and destination tensors.
 */
static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
                               int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
                               ggml_tensor* dst) {
static void aclnn_index_select_4d(ggml_backend_cann_context& ctx,
                                  void* src_buffer,int64_t* src_ne, size_t* src_nb,
                                  void* dst_buffer, int64_t* dst_ne, size_t* dst_nb,
                                  ggml_tensor* index, ggml_type type) {
    for (int64_t i = 0; i < src_ne[3]; i++) {
        for (int64_t j = 0; j < src_ne[2]; j++) {
            // src
            int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
            size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
            aclTensor* acl_src_tensor = ggml_cann_create_tensor(
                (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
                acl_src_ne, acl_src_nb, 2);
                ggml_cann_type_mapping(type), ggml_type_size(type),
                src_ne, src_nb, 2);

            // index
            int64_t acl_index_ne[1] = {index->ne[0]};
            size_t acl_index_nb[1] = {index->nb[0]};
            aclTensor* acl_index = ggml_cann_create_tensor(
                (char*)index->data + i * index->nb[2] + j * index->nb[1],
                (char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
                ggml_cann_type_mapping(index->type), ggml_element_size(index),
                acl_index_ne, acl_index_nb, 1);
                index->ne, index->nb, 1);

            // out
            int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
            size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
            aclTensor* acl_out = ggml_cann_create_tensor(
                (char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
                acl_out_ne, acl_out_nb, 2);
            GGML_CANN_CALL_ACLNN_OP(ctx, Embedding, acl_src_tensor, acl_index, acl_out);
                (char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
                ggml_cann_type_mapping(type), ggml_type_size(type),
                dst_ne, dst_nb, 2);
            GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor, 0, acl_index, acl_out);
            ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
        }
    }
}
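// Reference semantics of the per-slice IndexSelect call above, written as a plain CPU loop
// for illustration only (float rows, contiguous row-major layout). Treating the op as a
// row gather is an assumption based on how GGML_OP_GET_ROWS uses it below, not a statement
// about the aclnn API itself.
static void index_select_rows_ref(const float * src, int64_t n_cols,
                                  const int32_t * idx, int64_t n_idx, float * out) {
    for (int64_t r = 0; r < n_idx; r++) {
        for (int64_t c = 0; c < n_cols; c++) {
            out[r * n_cols + c] = src[idx[r] * n_cols + c];  // out row r = src row idx[r]
        }
    }
}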
/**
 * @brief Performs inplace index copy operation on a 4D tensor using the CANN backend.
 *
 * This function applies the `IndexCopy` operation along a specific dimension of the
 * destination tensor (`dst_buffer`) by copying elements from the source tensor (`src_buffer`)
 * to positions specified by the index tensor (`index`).
 * It iterates over the last two dimensions of the tensors, creates the corresponding
 * CANN tensors for source, index, and destination slices, and performs the index copy
 * operation for each slice.
 *
 * @param ctx The context for CANN backend operations.
 * @param src_buffer The source buffer containing the 4D input tensor data to be copied.
 * @param src_ne The dimensions of the source tensor.
 * @param src_nb The strides (byte offsets) of the source tensor.
 * @param dst_buffer The destination buffer where values will be copied to.
 * @param dst_ne The dimensions of the destination tensor.
 * @param dst_nb The strides (byte offsets) of the destination tensor.
 * @param index The index tensor specifying target positions in the destination tensor.
 * @param type The data type of the source and destination tensors.
 */
static void aclnn_index_copy_4d(ggml_backend_cann_context& ctx,
                                void* src_buffer,int64_t* src_ne, size_t* src_nb,
                                void* dst_buffer, int64_t* dst_ne, size_t* dst_nb,
                                ggml_tensor* index, ggml_type type) {
    for (int64_t i = 0; i < src_ne[3]; i++) {
        for (int64_t j = 0; j < src_ne[2]; j++) {
            // src
            aclTensor* acl_src_tensor = ggml_cann_create_tensor(
                (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
                ggml_cann_type_mapping(type), ggml_type_size(type),
                src_ne, src_nb, 2);

            // index
            aclTensor* acl_index = ggml_cann_create_tensor(
                (char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
                ggml_cann_type_mapping(index->type), ggml_element_size(index),
                index->ne, index->nb, 1);

            // out
            aclTensor* acl_out = ggml_cann_create_tensor(
                (char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
                ggml_cann_type_mapping(type), ggml_type_size(type),
                dst_ne, dst_nb, 2);
            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out, 0, acl_index, acl_src_tensor);
            ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
        }
    }
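// Reference semantics of the per-slice InplaceIndexCopy call above, again as a plain CPU
// loop for illustration only. Treating the op as a row scatter is an assumption based on
// how GGML_OP_SET_ROWS uses it below, not a statement about the aclnn API itself.
static void index_copy_rows_ref(const float * src, int64_t n_cols,
                                const int64_t * idx, int64_t n_idx, float * dst) {
    for (int64_t r = 0; r < n_idx; r++) {
        for (int64_t c = 0; c < n_cols; c++) {
            dst[idx[r] * n_cols + c] = src[r * n_cols + c];  // dst row idx[r] = src row r
        }
    }
}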
@ -1669,8 +1718,9 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
|
||||
dst);
|
||||
aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
|
||||
dst->data, dst->ne, dst->nb,
|
||||
src1, dst->type);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_F16: {
|
||||
|
|
@ -1687,8 +1737,9 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
|
||||
src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
|
||||
aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
|
||||
src_trans_nb, src1, dst);
|
||||
aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
|
||||
dst->data, dst->ne, dst->nb,
|
||||
src1, dst->type);
|
||||
ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
|
||||
break;
|
||||
}
|
||||
|
|
@ -1748,8 +1799,10 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
|
||||
aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
|
||||
dequant_ne, dequant_nb, src1, dst);
|
||||
aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(),
|
||||
dequant_ne, dequant_nb,
|
||||
dst->data, dst->ne, dst->nb,
|
||||
src1, dst->type);
|
||||
|
||||
ggml_cann_release_resources(ctx, dequant_tensor);
|
||||
break;
|
||||
|
|
@ -1760,6 +1813,43 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
}
|
||||
}
|
||||
|
||||
void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0]; // src
|
||||
ggml_tensor* src1 = dst->src[1]; // index
|
||||
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb,
|
||||
dst->data, dst->ne, dst->nb,
|
||||
src1, dst->type);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_F16: {
|
||||
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = sizeof(uint16_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type),
|
||||
src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
|
||||
aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
|
||||
dst->data, dst->ne, dst->nb,
|
||||
src1, dst->type);
|
||||
ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("Unsupported tensor type for GGML_OP_SET_ROWS");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Repeats elements of a tensor along a specified dimension.
|
||||
*
|
||||
|
|
@ -1823,11 +1913,9 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
|
|||
bcast_weight_nb[4], bcast_weight_nb[5]};
|
||||
aclTensor* acl_weight_tensor;
|
||||
|
||||
bool weightToNZ = false;
|
||||
#ifdef ASCEND_310P
|
||||
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
|
||||
#endif
|
||||
if (weightToNZ && is_matmul_weight(weight)) {
|
||||
// Only check env once.
|
||||
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
|
||||
if (weight_to_nz && is_matmul_weight(weight)) {
|
||||
int64_t acl_stride[2] = {1, transpose_ne[1]};
|
||||
|
||||
// Reverse ne.
|
||||
|
|
|
|||
|
|
@ -424,15 +424,25 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
|||
*
|
||||
* @details This function retrieves rows from a source tensor src0 according to
|
||||
* the indices provided in another tensor src1 and stores the result in
|
||||
* a destination tensor (\p dst). It supports different data types
|
||||
* including F32, F16, Q4_0, and Q8_0.
|
||||
* a destination tensor (\p dst).
|
||||
*
|
||||
* @param ctx The backend CANN context for executing operations.
|
||||
* @param dst The destination tensor where the extracted rows will be stored.
|
||||
* dst->op is `GGML_OP_GET_ROWS`.
|
||||
*/
|
||||
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Writes specific rows into a tensor at positions specified by indices.
|
||||
*
|
||||
* @details This function copies rows from a source tensor into a destination
|
||||
* tensor (\p dst) at the positions indicated by the indices in another
|
||||
* tensor.
|
||||
*
|
||||
* @param ctx The backend CANN context for executing operations.
|
||||
* @param dst The destination tensor where the specified rows will be updated.
|
||||
*/
|
||||
void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
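// Hedged sketch of the graph-level call that produces the GGML_OP_GET_ROWS node handled by
// ggml_cann_get_rows above; ggml_get_rows() is the long-standing public ggml API, while the
// tensor names and shapes here are purely illustrative:
//
//   struct ggml_tensor * table = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_vocab);
//   struct ggml_tensor * ids   = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
//   struct ggml_tensor * rows  = ggml_get_rows(ctx, table, ids); // rows->op == GGML_OP_GET_ROWS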
|
||||
|
||||
/**
|
||||
* @brief Executes matrix multiplication for the given tensor.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -1116,61 +1116,59 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
|
|||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
|
||||
aclDataType dataType, aclTensor **tensor)
|
||||
{
|
||||
uint64_t size = 1;
|
||||
for (auto i : shape) {
|
||||
size *= i;
|
||||
// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
|
||||
namespace {
|
||||
void* g_nz_workspace = nullptr;
|
||||
size_t g_nz_workspace_allocated = 0;
|
||||
|
||||
void release_nz_workspace() {
|
||||
if (g_nz_workspace) {
|
||||
aclrtFree(g_nz_workspace);
|
||||
g_nz_workspace = nullptr;
|
||||
g_nz_workspace_allocated = 0;
|
||||
}
|
||||
}
|
||||
|
||||
const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
|
||||
ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
|
||||
|
||||
size *= sizeof(int16_t);
|
||||
|
||||
ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
|
||||
|
||||
std::vector<int64_t> strides(shape.size(), 1);
|
||||
for (int64_t i = shape.size() - 2; i >= 0; i--) {
|
||||
strides[i] = shape[i + 1] * strides[i + 1];
|
||||
void relloc_nz_workspace(size_t new_size) {
|
||||
if (new_size > g_nz_workspace_allocated) {
|
||||
if (g_nz_workspace) {
|
||||
aclrtFree(g_nz_workspace);
|
||||
g_nz_workspace = nullptr;
|
||||
}
|
||||
ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
g_nz_workspace_allocated = new_size;
|
||||
}
|
||||
}
|
||||
|
||||
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
|
||||
shape.data(), shape.size(), *deviceAddr);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Convert tensor weights to NZ format using Ascend CANN API.
|
||||
*
|
||||
* This function creates a transposed tensor descriptor and performs the
|
||||
* TransMatmulWeight operation. Converting tensor formats can significantly
|
||||
* improve performance on certain hardware.
|
||||
*
|
||||
* @param tensor Pointer to the input ggml_tensor containing the weights.
|
||||
* @param data Pointer to the raw data buffer for the tensor weights.
|
||||
* @param offset Byte offset within the tensor data buffer where weights start.
|
||||
*
|
||||
* @note The workspace buffer used in this function is managed globally and reused
|
||||
* across calls. This reduces overhead from repeated memory allocation and deallocation.
|
||||
*/
|
||||
static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
|
||||
aclrtStream stream;
|
||||
ACL_CHECK(aclrtCreateStream(&stream));
|
||||
|
||||
std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
|
||||
void *weightTransposedDeviceAddr = nullptr;
|
||||
aclTensor *weightTransposed = nullptr;
|
||||
CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
|
||||
ggml_cann_type_mapping(tensor->type), &weightTransposed);
|
||||
|
||||
aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
|
||||
tensor->nb, 2, ACL_FORMAT_ND, offset);
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor *executor;
|
||||
void *workspaceAddr = nullptr;
|
||||
|
||||
// TransMatmulWeight
|
||||
ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
|
||||
std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
|
||||
if (workspaceSize > 0) {
|
||||
ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
workspaceAddrPtrTrans.reset(workspaceAddr);
|
||||
}
|
||||
ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
|
||||
ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
|
||||
&workspaceSize, &executor));
|
||||
// Avoid frequent malloc/free of the workspace.
|
||||
relloc_nz_workspace(workspaceSize);
|
||||
|
||||
size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
|
||||
|
||||
aclrtMemcpy((char *)tensor->data + offset, size,
|
||||
weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
|
||||
ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
|
||||
ACL_CHECK(aclDestroyTensor(weightTransposed));
|
||||
aclrtFree(weightTransposedDeviceAddr);
|
||||
}
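// Illustrative call sequence for the grow-only NZ workspace helpers above (the sizes are
// hypothetical; the real sizes come from aclnnTransMatmulWeightGetWorkspaceSize):
//
//   relloc_nz_workspace(1 << 20);    // first weight: allocate 1 MiB
//   relloc_nz_workspace(256 << 10);  // smaller request: buffer is reused as-is
//   relloc_nz_workspace(4 << 20);    // larger request: free and grow to 4 MiB
//   release_nz_workspace();          // dropped at the start of the next graph compute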
|
||||
|
||||
// TODO: need to handle tensors which have paddings.
|
||||
|
|
@ -1197,14 +1195,14 @@ static void ggml_backend_cann_buffer_set_tensor(
|
|||
// For acl, synchronous functions use this default stream.
|
||||
// Why aclrtSynchronizeDevice?
|
||||
|
||||
bool weightToNZ = false;
|
||||
#ifdef ASCEND_310P
|
||||
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
|
||||
#endif
|
||||
// Only check env once.
|
||||
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
|
||||
if (!need_transform(tensor->type)) {
|
||||
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
|
||||
ACL_MEMCPY_HOST_TO_DEVICE));
|
||||
if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
|
||||
if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
weight_format_to_nz(tensor, data, offset);
|
||||
}
|
||||
} else {
|
||||
|
|
@ -1440,20 +1438,32 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
|
|||
size_t size = ggml_nbytes(tensor);
|
||||
int64_t ne0 = tensor->ne[0];
|
||||
|
||||
// Only check env once.
|
||||
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
|
||||
|
||||
// last line must bigger than 32, because every single op deal at
|
||||
// least 32 bytes.
|
||||
// TODO: quantized type?
|
||||
// int64_t line_size = ne0 * ggml_element_size(tensor);
|
||||
// int64_t line_size_align_32 = (line_size + 31) & ~31;
|
||||
// size += (line_size_align_32 - line_size);
|
||||
|
||||
// TODO: not support quantized yet.
|
||||
// TODO: consider un-continue tensor.
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||
size += ggml_row_size(
|
||||
tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||
}
|
||||
} else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
|
||||
// NZ format weight are not support quantized yet.
|
||||
// If ND tensor transform to NZ, size may changed.
|
||||
int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
|
||||
size_t new_size;
|
||||
ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
|
||||
ggml_cann_type_mapping(tensor->type), &new_size));
|
||||
ACL_CHECK(aclDestroyIntArray(acl_shape));
|
||||
size = std::max(size, new_size);
|
||||
}
|
||||
|
||||
return size;
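    // Worked example for the quantized-padding branch above, assuming MATRIX_ROW_PADDING
    // is 512 (an assumption here) and a Q4_0 tensor with ne0 == 992:
    //   992 % 512 == 480, so the row is padded by 512 - 480 == 32 elements,
    //   and size grows by ggml_row_size(GGML_TYPE_Q4_0, 32) == 18 bytes (one Q4_0 block).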
|
||||
|
|
@ -1659,6 +1669,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|||
case GGML_OP_GET_ROWS:
|
||||
ggml_cann_get_rows(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
ggml_cann_set_rows(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_DUP:
|
||||
ggml_cann_dup(ctx, dst);
|
||||
break;
|
||||
|
|
@ -2003,6 +2016,9 @@ static bool ggml_backend_cann_cpy_tensor_async(
|
|||
(ggml_backend_cann_context*)backend_dst->context;
|
||||
|
||||
size_t copy_size = ggml_nbytes(dst);
|
||||
if (copy_size == 0) {
|
||||
return true;
|
||||
}
|
||||
if (backend_src != backend_dst) {
|
||||
ggml_backend_cann_buffer_context* buf_ctx_src =
|
||||
(ggml_backend_cann_buffer_context*)buf_src->context;
|
||||
|
|
@ -2077,6 +2093,8 @@ static enum ggml_status ggml_backend_cann_graph_compute(
|
|||
(ggml_backend_cann_context*)backend->context;
|
||||
|
||||
ggml_cann_set_device(cann_ctx->device);
|
||||
// release the temp buffer created by set_tensor.
|
||||
release_nz_workspace();
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor* node = cgraph->nodes[i];
|
||||
|
|
@ -2191,13 +2209,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|||
return false;
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
// TODO: add support
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
||||
#pragma message("TODO: implement F32, F16, BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_SET_ROWS: {
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_CPY: {
|
||||
ggml_tensor *src = op->src[0];
|
||||
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
||||
|
|
|
|||
|
|
@ -37,17 +37,21 @@
|
|||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
|
|
@ -72,11 +76,13 @@
|
|||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#elif defined(__loongarch64)
|
||||
// quants.c
|
||||
|
|
@ -92,11 +98,13 @@
|
|||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#elif defined(__riscv)
|
||||
// quants.c
|
||||
|
|
@ -119,10 +127,12 @@
|
|||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#elif defined(__s390x__)
|
||||
// quants.c
|
||||
|
|
@ -147,11 +157,13 @@
|
|||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#elif defined(__wasm__)
|
||||
// quants.c
|
||||
|
|
@ -175,10 +187,12 @@
|
|||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -1236,44 +1236,10 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
int sum = 0;
|
||||
|
||||
for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
|
||||
for (size_t l = 0; l < 5; ++l) {
|
||||
for (size_t m = 0; m < 32; ++m) {
|
||||
uint8_t q = x[i].qs[j + m] * pow3[l];
|
||||
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
||||
sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
|
||||
}
|
||||
}
|
||||
}
|
||||
for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
|
||||
for (size_t l = 0; l < 5; ++l) {
|
||||
for (size_t m = 0; m < 16; ++m) {
|
||||
uint8_t q = x[i].qs[j + m] * pow3[l];
|
||||
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
||||
sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t l = 0; l < 4; ++l) {
|
||||
for (size_t j = 0; j < sizeof(x->qh); ++j) {
|
||||
uint8_t q = x[i].qh[j] * pow3[l];
|
||||
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
||||
sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
|
||||
}
|
||||
}
|
||||
|
||||
sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1381,25 +1347,10 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
int32_t sumi = 0;
|
||||
|
||||
for (size_t j = 0; j < sizeof(x->qs); j += 32) {
|
||||
for (size_t l = 0; l < 4; ++l) {
|
||||
for (size_t k = 0; k < 32; ++k) {
|
||||
sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
sumf += (float) sumi * d;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1729,45 +1680,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sum;
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const uint8_t * q2 = x[i].qs;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * sc = x[i].scales;
|
||||
|
||||
int summs = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
summs += y[i].bsums[j] * (sc[j] >> 4);
|
||||
}
|
||||
|
||||
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
int isum = 0;
|
||||
int is = 0;
|
||||
int d;
|
||||
for (int k = 0; k < QK_K/128; ++k) {
|
||||
int shift = 0;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
d = sc[is++] & 0xF;
|
||||
int isuml = 0;
|
||||
for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
||||
isum += d * isuml;
|
||||
d = sc[is++] & 0xF;
|
||||
isuml = 0;
|
||||
for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
||||
isum += d * isuml;
|
||||
shift += 2;
|
||||
q8 += 32;
|
||||
}
|
||||
q2 += 32;
|
||||
}
|
||||
sumf += dall * isum - dmin * summs;
|
||||
}
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -2057,68 +1973,12 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sum;
|
||||
|
||||
#else
|
||||
// scalar version
|
||||
// This function is written like this so the compiler can manage to vectorize most of it
|
||||
// Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
|
||||
// manually vectorized version above. Every other version I tried would run at least 4 times slower.
|
||||
// The ideal situation would be if we could just write the code once, and the compiler would
|
||||
// automatically produce the best possible set of machine instructions, instead of us having to manually
|
||||
// write vectorized versions for AVX, ARM_NEON, etc.
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
uint32_t auxs[4];
|
||||
const int8_t * scales = (const int8_t*)auxs;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
q3 += 32;
|
||||
}
|
||||
a = aux8;
|
||||
|
||||
memcpy(auxs, x[i].scales, 12);
|
||||
uint32_t tmp = auxs[2];
|
||||
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
||||
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
||||
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
||||
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
|
@ -2431,61 +2291,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
a += 32;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
a += 32; q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -2578,66 +2391,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -3093,47 +2854,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
}
|
||||
*s = sum;
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
||||
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
||||
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
||||
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
||||
}
|
||||
a += 128;
|
||||
q4 += 64;
|
||||
qh += 32;
|
||||
}
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
int scale = x[i].scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -3229,34 +2953,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
*s = 0.25f * sumf;
|
||||
|
||||
#else
|
||||
|
||||
uint32_t aux32[2];
|
||||
const uint8_t * aux8 = (const uint8_t *)aux32;
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
memcpy(aux32, q2, 2*sizeof(uint32_t));
|
||||
q2 += 4;
|
||||
const uint32_t ls = 2*(aux32[1] >> 28) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += sumi * ls;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = 0.125f * sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -3327,42 +3027,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = 0.125f * sumf;

#else

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const uint8_t  * GGML_RESTRICT sc = x[i].scales;
        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
            const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls2;
            q2 += 4;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

@@ -3455,45 +3123,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = 0.125f * sumf;

#else

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const int8_t  * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * qh = x[i].qh;
        const uint8_t * signs = qs + QK_K/8;

        int bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
            int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
            int sumi1 = 0, sumi2 = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += ls1 * sumi1 + ls2 * sumi2;
            qs += 4;
            signs += 4;
        }

        sumf += d * bsum;
    }

    *s = 0.125f * sumf;

    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif

}

@@ -3553,36 +3186,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
    *s = 0.5f * sumf;

#else

    uint32_t aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
            const uint32_t ls = 2*(aux32 >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
                const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            q3 += 8;
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    *s = 0.25f * sumf;
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

@@ -3689,48 +3296,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const uint8_t * GGML_RESTRICT signs = x[i].signs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
|
||||
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
qs += 8;
|
||||
signs += 4;
|
||||
bsum += sumi * ls1;
|
||||
sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
qs += 8;
|
||||
signs += 4;
|
||||
bsum += sumi * ls2;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -3793,36 +3362,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint16_t * qh = x[i].qh;
|
||||
|
||||
int sumi = 0, sumi1 = 0;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
const int ls = 2*((qh[ib] >> 12) & 7) + 1;
|
||||
const int delta = qh[ib] & 0x8000 ? -1 : 1;
|
||||
int lsum = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
lsum += q8[j] * grid[j];
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
sumi += ls * lsum;
|
||||
sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
|
||||
qs += 4;
|
||||
}
|
||||
|
||||
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -3912,52 +3455,11 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
int sum1[2], sum2[2], delta[4];
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint8_t * qh = x[i].qh;
|
||||
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
||||
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
delta[0] = qh[0] & 0x08 ? -1 : 1;
|
||||
delta[1] = qh[0] & 0x80 ? -1 : 1;
|
||||
delta[2] = qh[1] & 0x08 ? -1 : 1;
|
||||
delta[3] = qh[1] & 0x80 ? -1 : 1;
|
||||
sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
|
||||
int lsum1 = 0, lsum2 = 0;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
lsum1 += q8[j] * grid[j];
|
||||
lsum2 += q8[j];
|
||||
}
|
||||
q8 += 8;
|
||||
sum1[l/2] += lsum1;
|
||||
sum2[l/2] += lsum2*delta[l];
|
||||
}
|
||||
|
||||
const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
|
||||
const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
|
||||
|
||||
sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
|
||||
sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
|
||||
qs += 4;
|
||||
qh += 2;
|
||||
}
|
||||
|
||||
sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(scale);
|
||||
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -4078,37 +3580,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
float sumf = 0;
|
||||
for (int ibl = 0; ibl < nb; ++ibl) {
|
||||
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
|
||||
uint16_t h = x[ibl].scales_h;
|
||||
const uint8_t * qs = x[ibl].qs;
|
||||
const int8_t * q8 = y[ibl].qs;
|
||||
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
||||
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
||||
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
||||
h >>= 4;
|
||||
const float d1 = d4d8*(ls1 - 32);
|
||||
const float d2 = d4d8*(ls2 - 32);
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
||||
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
||||
}
|
||||
sumf += d1 * (sumi1 + sumi2);
|
||||
qs += 16;
|
||||
q8 += 32;
|
||||
sumi1 = sumi2 = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
||||
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
||||
}
|
||||
sumf += d2 * (sumi1 + sumi2);
|
||||
qs += 16;
|
||||
q8 += 32;
|
||||
}
|
||||
}
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
@@ -86,35 +86,9 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
|
|||
}
|
||||
}
|
||||
#else
|
||||
// scalar
|
||||
const int blck_size_interleave = 4;
|
||||
float srcv[4][QK8_0];
|
||||
float id[4];
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int row_iter = 0; row_iter < 4; row_iter++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
|
||||
amax = MAX(amax, fabsf(srcv[row_iter][j]));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
id[row_iter] = d ? 1.0f / d : 0.0f;
|
||||
|
||||
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
|
||||
}
|
||||
|
||||
for (int j = 0; j < QK8_0 * 4; j++) {
|
||||
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
|
||||
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
|
||||
src_offset += (j % blck_size_interleave);
|
||||
|
||||
float x0 = srcv[src_id][src_offset] * id[src_id];
|
||||
y[i].qs[j] = roundf(x0);
|
||||
}
|
||||
}
|
||||
UNUSED(nb);
|
||||
UNUSED(y);
|
||||
ggml_quantize_mat_q8_0_4x4_generic(x, vy, k);
|
||||
#endif
|
||||
}
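As a note on the index arithmetic above: with blck_size_interleave == 4, output position j of the interleaved block takes element src_offset of source row src_id, so the four rows are woven together four values at a time. A small self-contained check of that mapping (illustrative program, mirrors the arithmetic in the scalar path) is:

    #include <stdio.h>

    // Prints which (row, element) each interleaved output position comes from
    // when 4 rows are interleaved in groups of 4 values.
    int main(void) {
        const int blck = 4;
        for (int j = 0; j < 32; ++j) {
            const int src_offset = (j / (4 * blck)) * blck + (j % blck);
            const int src_id     = (j % (4 * blck)) / blck;
            printf("y.qs[%2d] <- row %d, element %d\n", j, src_id, src_offset);
        }
        return 0;
    }
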
@@ -205,35 +179,9 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
|
|||
}
|
||||
|
||||
#else
|
||||
// scalar
|
||||
const int blck_size_interleave = 8;
|
||||
float srcv[4][QK8_0];
|
||||
float id[4];
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int row_iter = 0; row_iter < 4; row_iter++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
|
||||
amax = MAX(amax, fabsf(srcv[row_iter][j]));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
id[row_iter] = d ? 1.0f / d : 0.0f;
|
||||
|
||||
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
|
||||
}
|
||||
|
||||
for (int j = 0; j < QK8_0 * 4; j++) {
|
||||
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
|
||||
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
|
||||
src_offset += (j % blck_size_interleave);
|
||||
|
||||
float x0 = srcv[src_id][src_offset] * id[src_id];
|
||||
y[i].qs[j] = roundf(x0);
|
||||
}
|
||||
}
|
||||
UNUSED(nb);
|
||||
UNUSED(y);
|
||||
ggml_quantize_mat_q8_0_4x8_generic(x, vy, k);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -295,29 +243,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
}
|
||||
return;
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
float sumf[4];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
ggml_gemv_q4_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@@ -383,29 +309,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
}
|
||||
return;
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
|
||||
float sumf[4];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
ggml_gemv_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@@ -497,31 +401,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
#endif // #if defined(__ARM_FEATURE_SVE)
|
||||
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
|
||||
{
|
||||
float sumf[8];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
}
|
||||
ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@@ -591,31 +471,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
}
|
||||
return;
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
|
||||
{
|
||||
float sumf[4];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
||||
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
}
|
||||
ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@@ -1096,40 +952,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
);
|
||||
return;
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
|
||||
{
|
||||
float sumf[4][4];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
||||
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++)
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ggml_gemm_q4_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@@ -1550,38 +1373,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
);
|
||||
return;
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
float sumf[4][4];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
||||
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++)
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
ggml_gemm_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@@ -2019,38 +1811,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
#endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
|
||||
float sumf[4][8];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
||||
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++)
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@@ -2126,38 +1887,5 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
}
|
||||
return;
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
|
||||
{
|
||||
float sumf[4][4];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
||||
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
||||
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
||||
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++)
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
@@ -821,24 +821,15 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
sumf = hsum_float_8(acc) + summs;
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int v0 = (x[ib].qs[j] & 0x0F);
|
||||
const int v1 = (x[ib].qs[j] >> 4);
|
||||
|
||||
sumi0 += (v0 * y[ib].qs[j]);
|
||||
sumi1 += (v1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@@ -883,30 +874,15 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
sumf = hsum_float_8(acc);
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
||||
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
||||
|
||||
const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
|
||||
const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
|
||||
|
||||
sumi0 += (x0 * y[ib].qs[j]);
|
||||
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@@ -954,30 +930,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
sumf = hsum_float_8(acc) + summs;
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
|
||||
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
|
||||
|
||||
sumi0 += (x0 * y[ib].qs[j]);
|
||||
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@@ -1016,18 +977,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = hsum_float_8(acc);

#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1103,45 +1061,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = hsum_float_8(acc);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const uint8_t * q2 = x[i].qs;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * sc = x[i].scales;
|
||||
|
||||
int summs = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
summs += y[i].bsums[j] * (sc[j] >> 4);
|
||||
}
|
||||
|
||||
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
int isum = 0;
|
||||
int is = 0;
|
||||
int d;
|
||||
for (int k = 0; k < QK_K/128; ++k) {
|
||||
int shift = 0;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
d = sc[is++] & 0xF;
|
||||
int isuml = 0;
|
||||
for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
||||
isum += d * isuml;
|
||||
d = sc[is++] & 0xF;
|
||||
isuml = 0;
|
||||
for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
||||
isum += d * isuml;
|
||||
shift += 2;
|
||||
q8 += 32;
|
||||
}
|
||||
q2 += 32;
|
||||
}
|
||||
sumf += dall * isum - dmin * summs;
|
||||
}
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -1239,70 +1162,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = hsum_float_8(acc);
|
||||
|
||||
#else
|
||||
// scalar version
|
||||
// This function is written like this so the compiler can manage to vectorize most of it
|
||||
// Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
|
||||
// manually vectorized version above. Every other version I tried would run at least 4 times slower.
|
||||
// The ideal situation would be if we could just write the code once, and the compiler would
|
||||
// automatically produce the best possible set of machine instructions, instead of us having to manually
|
||||
// write vectorized versions for AVX, ARM_NEON, etc.
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
uint32_t auxs[4];
|
||||
const int8_t * scales = (const int8_t*)auxs;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
q3 += 32;
|
||||
}
|
||||
a = aux8;
|
||||
|
||||
memcpy(auxs, x[i].scales, 12);
|
||||
uint32_t tmp = auxs[2];
|
||||
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
||||
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
||||
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
||||
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
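The comment at the top of the q3_K scalar fallback above explains the design: keep the temporaries at a fixed width of 8 lanes (aux16, aux32, sums) and keep the inner loops trivial, so that GCC and clang can auto-vectorize them. A stripped-down illustration of that structure (illustrative names and block size, not the ggml code) is:

    #include <stdint.h>

    // Accumulate a blockwise dot product through fixed 8-lane arrays so the
    // compiler can vectorize the inner loops, as in the scalar *_K fallbacks.
    static float dot_blocks(const int8_t * a, const int8_t * b, int n_blocks, const float * d) {
        float   sums[8] = { 0 };
        int32_t aux32[8];
        for (int i = 0; i < n_blocks; ++i) {
            for (int l = 0; l < 8; ++l) aux32[l] = 0;
            for (int j = 0; j < 32; j += 8) {               // 32 values per block
                for (int l = 0; l < 8; ++l) aux32[l] += a[j + l] * b[j + l];
            }
            for (int l = 0; l < 8; ++l) sums[l] += d[i] * aux32[l];
            a += 32; b += 32;
        }
        float sumf = 0.0f;
        for (int l = 0; l < 8; ++l) sumf += sums[l];
        return sumf;
    }
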
@@ -1391,61 +1257,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
|
||||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
a += 32;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
a += 32; q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -1541,66 +1360,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
|
||||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -1678,47 +1445,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = hsum_float_8(acc);
|
||||
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
||||
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
||||
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
||||
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
||||
}
|
||||
a += 128;
|
||||
q4 += 64;
|
||||
qh += 32;
|
||||
}
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
int scale = x[i].scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -1815,34 +1545,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
*s = 0.125f * hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
uint32_t aux32[2];
|
||||
const uint8_t * aux8 = (const uint8_t *)aux32;
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
memcpy(aux32, q2, 2*sizeof(uint32_t));
|
||||
q2 += 4;
|
||||
const uint32_t ls = 2*(aux32[1] >> 28) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += sumi * ls;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = 0.125f * sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -1978,42 +1684,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
*s = 0.125f * hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT sc = x[i].scales;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
|
||||
const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += sumi * ls1;
|
||||
sumi = 0;
|
||||
for (int l = 2; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += sumi * ls2;
|
||||
q2 += 4;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = 0.125f * sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -2105,47 +1779,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = 0.125f * hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint8_t * qh = x[i].qh;
|
||||
const uint8_t * signs = qs + QK_K/8;
|
||||
|
||||
int bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
||||
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
for (int l = 2; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += ls1 * sumi1 + ls2 * sumi2;
|
||||
qs += 4;
|
||||
signs += 4;
|
||||
}
|
||||
|
||||
sumf += d * bsum;
|
||||
}
|
||||
|
||||
*s = 0.125f * sumf;
|
||||
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@@ -2209,36 +1847,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
*s = 0.25f * hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
uint32_t aux32;
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
|
||||
const uint32_t ls = 2*(aux32 >> 28) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
q3 += 8;
|
||||
bsum += sumi * ls;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = 0.25f * sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -2338,48 +1950,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const uint8_t * GGML_RESTRICT signs = x[i].signs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
|
||||
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
qs += 8;
|
||||
signs += 4;
|
||||
bsum += sumi * ls1;
|
||||
sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
qs += 8;
|
||||
signs += 4;
|
||||
bsum += sumi * ls2;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -2460,36 +2034,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint16_t * qh = x[i].qh;
|
||||
|
||||
int sumi = 0, sumi1 = 0;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
const int ls = 2*((qh[ib] >> 12) & 7) + 1;
|
||||
const int delta = qh[ib] & 0x8000 ? -1 : 1;
|
||||
int lsum = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
lsum += q8[j] * grid[j];
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
sumi += ls * lsum;
|
||||
sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
|
||||
qs += 4;
|
||||
}
|
||||
|
||||
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@@ -2603,37 +2151,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
*s = hsum_float_8(accum);
|
||||
|
||||
#else
|
||||
float sumf = 0;
|
||||
for (int ibl = 0; ibl < nb; ++ibl) {
|
||||
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
|
||||
uint16_t h = x[ibl].scales_h;
|
||||
const uint8_t * qs = x[ibl].qs;
|
||||
const int8_t * q8 = y[ibl].qs;
|
||||
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
||||
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
||||
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
||||
h >>= 4;
|
||||
const float d1 = d4d8*(ls1 - 32);
|
||||
const float d2 = d4d8*(ls2 - 32);
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
||||
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
||||
}
|
||||
sumf += d1 * (sumi1 + sumi2);
|
||||
qs += 16;
|
||||
q8 += 32;
|
||||
sumi1 = sumi2 = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
||||
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
||||
}
|
||||
sumf += d2 * (sumi1 + sumi2);
|
||||
qs += 16;
|
||||
q8 += 32;
|
||||
}
|
||||
}
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
@@ -201,24 +201,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
sumf = vec_extract(vsumf0, 0);
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int v0 = (x[ib].qs[j] & 0x0F) - 8;
|
||||
const int v1 = (x[ib].qs[j] >> 4) - 8;
|
||||
|
||||
sumi0 += (v0 * y[ib].qs[j]);
|
||||
sumi1 += (v1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@@ -278,24 +268,14 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
sumf = vec_extract(vsumf0, 0);
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int v0 = (x[ib].qs[j] & 0x0F);
|
||||
const int v1 = (x[ib].qs[j] >> 4);
|
||||
|
||||
sumi0 += (v0 * y[ib].qs[j]);
|
||||
sumi1 += (v1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@@ -360,30 +340,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
sumf = vec_extract(vsumf0, 0);
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
||||
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
||||
|
||||
const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
|
||||
const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
|
||||
|
||||
sumi0 += (x0 * y[ib].qs[j]);
|
||||
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@@ -451,30 +415,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
sumf = vec_extract(vsumf0, 0);
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
|
||||
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
|
||||
|
||||
sumi0 += (x0 * y[ib].qs[j]);
|
||||
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {

@ -535,18 +484,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = vec_extract(vsumf0, 0);

#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {

@ -695,45 +641,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

    *s = vec_extract(vsumf0, 0);

#else

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {

        const uint8_t * q2 = x[i].qs;
        const int8_t  * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        int isum = 0;
        int is = 0;
        int d;
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                d = sc[is++] & 0xF;
                int isuml = 0;
                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                d = sc[is++] & 0xF;
                isuml = 0;
                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * isum - dmin * summs;
    }
    *s = sumf;
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
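
The q2_K fallback splits each scale byte into two 4-bit fields: the low nibble scales the 2-bit weights (the dall*isum term) and the high nibble is a per-sub-block minimum weighted by the q8 block sums (the dmin*summs term). A trivial sketch of that split (illustrative helper, not from the patch):

#include <stdint.h>

static void q2_K_split_scale_sketch(uint8_t sc_byte, int * scale, int * min) {
    *scale = sc_byte & 0xF;   // multiplies the 2-bit weights
    *min   = sc_byte >> 4;    // weighted by y[i].bsums and subtracted at the end
}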
|
||||
|
||||
|
|
@ -907,70 +818,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = vec_extract(vsumf0, 0);
|
||||
|
||||
#else
|
||||
// scalar version
|
||||
// This function is written like this so the compiler can manage to vectorize most of it
|
||||
// Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
|
||||
// manually vectorized version above. Every other version I tried would run at least 4 times slower.
|
||||
// The ideal situation would be if we could just write the code once, and the compiler would
|
||||
// automatically produce the best possible set of machine instructions, instead of us having to manually
|
||||
// write vectorized versions for AVX, ARM_NEON, etc.
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
uint32_t auxs[4];
|
||||
const int8_t * scales = (const int8_t*)auxs;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
q3 += 32;
|
||||
}
|
||||
a = aux8;
|
||||
|
||||
memcpy(auxs, x[i].scales, 12);
|
||||
uint32_t tmp = auxs[2];
|
||||
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
||||
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
||||
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
||||
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
|
||||
}
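
The q3_K scalar path decodes each weight from two bits of qs plus one bit of hmask: when the hmask bit is clear, 4 is subtracted, giving a signed value in [-4, 3]. A one-line sketch of that decode (hypothetical helper mirroring the loop above):

#include <stdint.h>

static int q3_K_decode_one_sketch(uint8_t q_byte, int shift, uint8_t hmask_byte, uint8_t mbit) {
    int v = (q_byte >> shift) & 3;     // 2-bit magnitude
    if (!(hmask_byte & mbit)) v -= 4;  // high bit clear -> shift into the negative range
    return v;                          // value in [-4, 3]
}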
|
||||
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -1130,61 +984,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = vec_extract(vsumf0, 0);
|
||||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
a += 32;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
a += 32; q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
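
The utmp/kmask shuffle above unpacks the 12 scale bytes of a q4_K block into eight 6-bit scales and eight 6-bit mins. A generic sketch of pulling 6-bit fields out of a packed little-endian bit stream (illustrative packing only, NOT the exact q4_K layout, which is what the kmask constants encode):

#include <stdint.h>

// Read 6-bit field k from a packed bit stream; the caller must guarantee that
// data[] has at least one byte beyond the last field.
static int get_6bit_field_sketch(const uint8_t * data, int k) {
    const int bit  = 6 * k;
    const int byte = bit >> 3;
    const int off  = bit & 7;
    const int v    = data[byte] | (data[byte + 1] << 8);  // off + 6 <= 13, so two bytes suffice
    return (v >> off) & 0x3F;
}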
|
||||
|
||||
|
|
@ -1342,66 +1149,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = vec_extract(vsumf0, 0);
|
||||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1556,47 +1311,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = vec_extract(vsumf0, 0);
|
||||
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
||||
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
||||
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
||||
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
||||
}
|
||||
a += 128;
|
||||
q4 += 64;
|
||||
qh += 32;
|
||||
}
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
int scale = x[i].scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
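
Each q6_K weight is assembled from a 4-bit low part in ql and a 2-bit high part in qh, then re-centered by subtracting 32, exactly as the aux8 fill above does. A minimal sketch of that step (hypothetical helper):

#include <stdint.h>

static int8_t q6_K_decode_one_sketch(uint8_t ql_nibble, uint8_t qh_2bits) {
    return (int8_t)(((ql_nibble & 0xF) | ((qh_2bits & 3) << 4)) - 32);  // value in [-32, 31]
}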
|
||||
|
||||
|
|
@ -1737,34 +1455,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
*s = 0.125f * vec_extract(vsumf0, 0);
|
||||
|
||||
#else
|
||||
|
||||
uint32_t aux32[2];
|
||||
const uint8_t * aux8 = (const uint8_t *)aux32;
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
memcpy(aux32, q2, 2*sizeof(uint32_t));
|
||||
q2 += 4;
|
||||
const uint32_t ls = 2*(aux32[1] >> 28) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += sumi * ls;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = 0.125f * sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
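
The iq2 fallbacks apply signs from a small table: ksigns_iq2xs maps a 7-bit index to a byte whose bit j (tested through kmask_iq2xs[j]) negates grid element j. A generic sketch of that sign application with a plain sign byte (illustrative; the real tables live in ggml's quantization tables):

#include <stdint.h>

static int signed_dot8_sketch(const uint8_t * grid, const int8_t * q8, uint8_t signs) {
    int sum = 0;
    for (int j = 0; j < 8; ++j) {
        const int sgn = (signs >> j) & 1 ? -1 : 1;  // bit j set -> negate element j
        sum += sgn * grid[j] * q8[j];
    }
    return sum;
}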
|
||||
|
||||
|
|
@ -1869,42 +1563,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
*s = 0.125f * vec_extract(vsumf0, 0);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT sc = x[i].scales;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
|
||||
const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += sumi * ls1;
|
||||
sumi = 0;
|
||||
for (int l = 2; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += sumi * ls2;
|
||||
q2 += 4;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = 0.125f * sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -2030,47 +1692,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = 0.125f * vec_extract(vsumf0, 0);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint8_t * qh = x[i].qh;
|
||||
const uint8_t * signs = qs + QK_K/8;
|
||||
|
||||
int bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
||||
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
for (int l = 2; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += ls1 * sumi1 + ls2 * sumi2;
|
||||
qs += 4;
|
||||
signs += 4;
|
||||
}
|
||||
|
||||
sumf += d * bsum;
|
||||
}
|
||||
|
||||
*s = 0.125f * sumf;
|
||||
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -2172,36 +1798,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
*s = 0.25f * vec_extract(vsumf0, 0);
|
||||
|
||||
#else
|
||||
|
||||
uint32_t aux32;
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
|
||||
const uint32_t ls = 2*(aux32 >> 28) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
q3 += 8;
|
||||
bsum += sumi * ls;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = 0.25f * sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -2327,48 +1927,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = vec_extract(vsumf0, 0);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const uint8_t * GGML_RESTRICT signs = x[i].signs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
|
||||
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
qs += 8;
|
||||
signs += 4;
|
||||
bsum += sumi * ls1;
|
||||
sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
qs += 8;
|
||||
signs += 4;
|
||||
bsum += sumi * ls2;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -2481,36 +2043,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = vec_extract(vsumf0, 0);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint16_t * qh = x[i].qh;
|
||||
|
||||
int sumi = 0, sumi1 = 0;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
const int ls = 2*((qh[ib] >> 12) & 7) + 1;
|
||||
const int delta = qh[ib] & 0x8000 ? -1 : 1;
|
||||
int lsum = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
lsum += q8[j] * grid[j];
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
sumi += ls * lsum;
|
||||
sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
|
||||
qs += 4;
|
||||
}
|
||||
|
||||
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -2581,17 +2117,15 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v

    sumf = vec_extract(vsumf0, 0);

#endif
    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
        int sumi1 = 0, sumi2 = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
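
Unlike the linear formats, iq4_nl maps each nibble through a 16-entry signed codebook (kvalues_iq4nl) before multiplying, which is all the inner loop above does. A sketch with a stand-in table (the values here are only plausible placeholders; the real codebook is defined in ggml's quantization tables):

#include <stdint.h>

static const int8_t codebook_sketch[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113
};

// Accumulate the contribution of one packed byte against two activations.
static void iq4nl_pair_dot_sketch(uint8_t packed, int8_t y_lo, int8_t y_hi, int * acc) {
    *acc += y_lo * codebook_sketch[packed & 0xF];  // low nibble indexes the codebook
    *acc += y_hi * codebook_sketch[packed >> 4];   // high nibble indexes the codebook
}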
|
||||
|
||||
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -2696,37 +2230,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
*s = vec_extract(vsumf0, 0);
|
||||
|
||||
#else
|
||||
float sumf = 0;
|
||||
for (int ibl = 0; ibl < nb; ++ibl) {
|
||||
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
|
||||
uint16_t h = x[ibl].scales_h;
|
||||
const uint8_t * qs = x[ibl].qs;
|
||||
const int8_t * q8 = y[ibl].qs;
|
||||
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
||||
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
||||
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
||||
h >>= 4;
|
||||
const float d1 = d4d8*(ls1 - 32);
|
||||
const float d2 = d4d8*(ls2 - 32);
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
||||
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
||||
}
|
||||
sumf += d1 * (sumi1 + sumi2);
|
||||
qs += 16;
|
||||
q8 += 32;
|
||||
sumi1 = sumi2 = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
||||
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
||||
}
|
||||
sumf += d2 * (sumi1 + sumi2);
|
||||
qs += 16;
|
||||
q8 += 32;
|
||||
}
|
||||
}
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -116,6 +116,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i

//===================================== Dot products =================================

void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined(__riscv_v)
    const int qk = QK8_0;
    const int nb = n / qk;

@ -132,7 +133,6 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    int ib = 0;
    float sumf = 0;

#if defined(__riscv_v)
    size_t vl = qk / 2;

    for (; ib < nb; ++ib) {

@ -164,27 +164,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
    }

#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
            const int v1 = (x[ib].qs[j] >> 4) - 8;

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
    }

    *s = sumf;
#else
    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
|
||||
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined(__riscv_v)
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -201,7 +188,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
int ib = 0;
|
||||
float sumf = 0;
|
||||
|
||||
#if defined(__riscv_v)
|
||||
size_t vl = qk / 2;
|
||||
|
||||
for (; ib < nb; ++ib) {
|
||||
|
|
@ -229,27 +215,14 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int v0 = (x[ib].qs[j] & 0x0F);
|
||||
const int v1 = (x[ib].qs[j] >> 4);
|
||||
|
||||
sumi0 += (v0 * y[ib].qs[j]);
|
||||
sumi1 += (v1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined(__riscv_v)
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -267,7 +240,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
const block_q5_0 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
#if defined(__riscv_v)
|
||||
size_t vl;
|
||||
size_t vlenb = __riscv_vlenb();
|
||||
|
||||
|
|
@ -297,33 +269,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
||||
}
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
||||
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
||||
|
||||
const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
|
||||
const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
|
||||
|
||||
sumi0 += (x0 * y[ib].qs[j]);
|
||||
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
#if defined(__riscv_v)
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -341,7 +294,6 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
const block_q5_1 * GGML_RESTRICT x = vx;
|
||||
const block_q8_1 * GGML_RESTRICT y = vy;
|
||||
|
||||
#if defined(__riscv_v)
|
||||
size_t vl;
|
||||
size_t vlenb = __riscv_vlenb();
|
||||
|
||||
|
|
@ -370,30 +322,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
|
||||
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
|
||||
|
||||
sumi0 += (x0 * y[ib].qs[j]);
|
||||
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -431,18 +363,17 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
}
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
int sumi = 0;
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
sumi += x[ib].qs[j]*y[ib].qs[j];
|
||||
}
|
||||
|
||||
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
|
||||
UNUSED(nb);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
|
||||
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -738,44 +669,11 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const uint8_t * q2 = x[i].qs;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * sc = x[i].scales;
|
||||
|
||||
int summs = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
summs += y[i].bsums[j] * (sc[j] >> 4);
|
||||
}
|
||||
|
||||
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
int isum = 0;
|
||||
int is = 0;
|
||||
int d;
|
||||
for (int k = 0; k < QK_K/128; ++k) {
|
||||
int shift = 0;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
d = sc[is++] & 0xF;
|
||||
int isuml = 0;
|
||||
for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
||||
isum += d * isuml;
|
||||
d = sc[is++] & 0xF;
|
||||
isuml = 0;
|
||||
for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
||||
isum += d * isuml;
|
||||
shift += 2;
|
||||
q8 += 32;
|
||||
}
|
||||
q2 += 32;
|
||||
}
|
||||
sumf += dall * isum - dmin * summs;
|
||||
}
|
||||
*s = sumf;
|
||||
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1147,68 +1045,14 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
// scalar version
|
||||
// This function is written like this so the compiler can manage to vectorize most of it
|
||||
// Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
|
||||
// manually vectorized version above. Every other version I tried would run at least 4 times slower.
|
||||
// The ideal situation would be if we could just write the code once, and the compiler would
|
||||
// automatically produce the best possible set of machine instructions, instead of us having to manually
|
||||
// write vectorized versions for AVX, ARM_NEON, etc.
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
uint32_t auxs[4];
|
||||
const int8_t * scales = (const int8_t*)auxs;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
q3 += 32;
|
||||
}
|
||||
a = aux8;
|
||||
|
||||
memcpy(auxs, x[i].scales, 12);
|
||||
uint32_t tmp = auxs[2];
|
||||
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
||||
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
||||
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
||||
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
|
||||
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
|
@ -1534,60 +1378,15 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(nb);
|
||||
UNUSED(utmp);
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
a += 32;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
a += 32; q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1698,65 +1497,15 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(nb);
|
||||
UNUSED(utmp);
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -2024,46 +1773,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
||||
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
||||
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
||||
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
||||
}
|
||||
a += 128;
|
||||
q4 += 64;
|
||||
qh += 32;
|
||||
}
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
int scale = x[i].scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -112,31 +112,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
}
|
||||
|
||||
#endif
|
||||
{
|
||||
float sumf[8];
|
||||
int sumi;
|
||||
|
||||
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
||||
}
|
||||
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||
}
|
||||
}
|
||||
ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
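
The interleaved gemv/gemm fallbacks avoid an explicit unpack of each nibble: casting (byte << 4) to int8_t sign-extends the low nibble scaled by 16, masking with 0xF0 does the same for the high nibble, and an arithmetic >> 4 at the end removes the shared factor of 16. A standalone sketch of that trick (hypothetical helper, mirroring the inner loop above):

#include <stdint.h>

static int nibble_pair_dot_sketch(uint8_t packed, int8_t a_lo, int8_t a_hi) {
    const int v0 = (int8_t)(packed << 4);    // low nibble, sign-extended, times 16
    const int v1 = (int8_t)(packed & 0xF0);  // high nibble, sign-extended, times 16
    return (v0 * a_lo + v1 * a_hi) >> 4;     // drop the common factor of 16
}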
|
||||
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
|
|
@ -361,37 +337,6 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
return;
|
||||
}
|
||||
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
|
||||
float sumf[4][8];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
||||
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++)
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -172,24 +172,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
sumf = acc[0] + acc[1] + acc[2] + acc[3];
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int v0 = (x[ib].qs[j] & 0x0F) - 8;
|
||||
const int v1 = (x[ib].qs[j] >> 4) - 8;
|
||||
|
||||
sumi0 += (v0 * y[ib].qs[j]);
|
||||
sumi1 += (v1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -239,24 +230,15 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int v0 = (x[ib].qs[j] & 0x0F);
|
||||
const int v1 = (x[ib].qs[j] >> 4);
|
||||
|
||||
sumi0 += (v0 * y[ib].qs[j]);
|
||||
sumi1 += (v1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -298,18 +280,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
sumf = acc[0] + acc[1] + acc[2] + acc[3];
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
int sumi = 0;
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
sumi += x[ib].qs[j]*y[ib].qs[j];
|
||||
}
|
||||
|
||||
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -442,70 +421,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sum;
|
||||
|
||||
#else
|
||||
// scalar version
|
||||
// This function is written like this so the compiler can manage to vectorize most of it
|
||||
// Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
|
||||
// manually vectorized version above. Every other version I tried would run at least 4 times slower.
|
||||
// The ideal situation would be if we could just write the code once, and the compiler would
|
||||
// automatically produce the best possible set of machine instructions, instead of us having to manually
|
||||
// write vectorized versions for AVX, ARM_NEON, etc.
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
uint32_t auxs[4];
|
||||
const int8_t * scales = (const int8_t*)auxs;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
q3 += 32;
|
||||
}
|
||||
a = aux8;
|
||||
|
||||
memcpy(auxs, x[i].scales, 12);
|
||||
uint32_t tmp = auxs[2];
|
||||
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
||||
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
||||
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
||||
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -600,61 +522,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
a += 32;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
a += 32; q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -767,66 +642,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -969,47 +792,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sum;
|
||||
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
||||
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
||||
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
||||
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
||||
}
|
||||
a += 128;
|
||||
q4 += 64;
|
||||
qh += 32;
|
||||
}
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
int scale = x[i].scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
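// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the diff): the q6_K scalar path above
// rebuilds each signed 6-bit weight from a low nibble in ql and two high bits
// in qh, then recenters by 32. For an index l in [0, 32) of a 128-weight
// chunk, the four weights at offsets 0/32/64/96 come out as below; the helper
// name is hypothetical.
// ---------------------------------------------------------------------------
static inline void q6k_decode_at(const uint8_t * ql, const uint8_t * qh, int l, int8_t w[4]) {
    w[0] = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
    w[1] = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
    w[2] = (int8_t)((ql[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
    w[3] = (int8_t)((ql[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
}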
@@ -1186,17 +972,15 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
        sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
    }

#endif
    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
        int sumi1 = 0, sumi2 = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {

@ -1264,37 +1048,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
float sumf = 0;
|
||||
for (int ibl = 0; ibl < nb; ++ibl) {
|
||||
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
|
||||
uint16_t h = x[ibl].scales_h;
|
||||
const uint8_t * qs = x[ibl].qs;
|
||||
const int8_t * q8 = y[ibl].qs;
|
||||
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
||||
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
||||
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
||||
h >>= 4;
|
||||
const float d1 = d4d8*(ls1 - 32);
|
||||
const float d2 = d4d8*(ls2 - 32);
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
||||
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
||||
}
|
||||
sumf += d1 * (sumi1 + sumi2);
|
||||
qs += 16;
|
||||
q8 += 32;
|
||||
sumi1 = sumi2 = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
||||
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
||||
}
|
||||
sumf += d2 * (sumi1 + sumi2);
|
||||
qs += 16;
|
||||
q8 += 32;
|
||||
}
|
||||
}
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
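// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the diff): in the iq4_xs fallback above,
// each pair of 32-element sub-blocks gets a 6-bit scale built from a shared
// scales_l nibble plus two bits peeled off scales_h, recentered by 32 before
// it multiplies the kvalues_iq4nl dot products. Names below are hypothetical.
// ---------------------------------------------------------------------------
static inline void iq4xs_pair_scales(uint8_t scales_l_byte, uint16_t scales_h, int pair, int * ls1, int * ls2) {
    const int hi = (scales_h >> (4 * pair)) & 0xF;  // two high bits per scale in this pair
    *ls1 = (int)((scales_l_byte & 0xF) | ((hi & 3) << 4)) - 32;
    *ls2 = (int)((scales_l_byte >>  4) | ((hi >> 2) << 4)) - 32;
}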
@ -435,30 +435,15 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
||||
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
||||
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
||||
|
||||
const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
|
||||
const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
|
||||
|
||||
sumi0 += (x0 * y[ib].qs[j]);
|
||||
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -545,30 +530,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
||||
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
|
||||
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
|
||||
|
||||
sumi0 += (x0 * y[ib].qs[j]);
|
||||
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -628,18 +598,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
||||
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
int sumi = 0;
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
sumi += x[ib].qs[j]*y[ib].qs[j];
|
||||
}
|
||||
|
||||
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -755,45 +722,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const uint8_t * q2 = x[i].qs;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * sc = x[i].scales;
|
||||
|
||||
int summs = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
summs += y[i].bsums[j] * (sc[j] >> 4);
|
||||
}
|
||||
|
||||
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
int isum = 0;
|
||||
int is = 0;
|
||||
int d;
|
||||
for (int k = 0; k < QK_K/128; ++k) {
|
||||
int shift = 0;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
d = sc[is++] & 0xF;
|
||||
int isuml = 0;
|
||||
for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
||||
isum += d * isuml;
|
||||
d = sc[is++] & 0xF;
|
||||
isuml = 0;
|
||||
for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
||||
isum += d * isuml;
|
||||
shift += 2;
|
||||
q8 += 32;
|
||||
}
|
||||
q2 += 32;
|
||||
}
|
||||
sumf += dall * isum - dmin * summs;
|
||||
}
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -902,68 +834,12 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
// scalar version
|
||||
// This function is written like this so the compiler can manage to vectorize most of it
|
||||
// Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
|
||||
// manually vectorized version above. Every other version I tried would run at least 4 times slower.
|
||||
// The ideal situation would be if we could just write the code once, and the compiler would
|
||||
// automatically produce the best possible set of machine instructions, instead of us having to manually
|
||||
// write vectorized versions for AVX, ARM_NEON, etc.
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
uint32_t auxs[4];
|
||||
const int8_t * scales = (const int8_t*)auxs;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
q3 += 32;
|
||||
}
|
||||
a = aux8;
|
||||
|
||||
memcpy(auxs, x[i].scales, 12);
|
||||
uint32_t tmp = auxs[2];
|
||||
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
||||
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
||||
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
||||
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
|
@ -1089,61 +965,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
a += 32;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
a += 32; q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1279,66 +1108,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1435,47 +1212,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
||||
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
||||
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
||||
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
||||
}
|
||||
a += 128;
|
||||
q4 += 64;
|
||||
qh += 32;
|
||||
}
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
int scale = x[i].scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -702,7 +702,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
const block_q8_1 * GGML_RESTRICT y = vy;
|
||||
|
||||
int ib = 0;
|
||||
float sumf = 0;
|
||||
|
||||
#if defined(__AVX2__) || defined(__AVX__)
|
||||
// Initialize accumulator with zeros
|
||||
|
|
@ -737,26 +736,14 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
#endif
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc) + summs;
|
||||
|
||||
*s = hsum_float_8(acc) + summs;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int v0 = (x[ib].qs[j] & 0x0F);
|
||||
const int v1 = (x[ib].qs[j] >> 4);
|
||||
|
||||
sumi0 += (v0 * y[ib].qs[j]);
|
||||
sumi1 += (v1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -764,7 +751,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
const int nb = n / qk;
|
||||
|
||||
int ib = 0;
|
||||
float sumf = 0;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(qk == QK5_0);
|
||||
|
|
@ -799,7 +785,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
acc = _mm256_fmadd_ps(d, q, acc);
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc);
|
||||
*s = hsum_float_8(acc);
|
||||
#elif defined(__AVX__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
|
@ -830,32 +816,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc);
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(ib);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
||||
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
||||
|
||||
const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
|
||||
const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
|
||||
|
||||
sumi0 += (x0 * y[ib].qs[j]);
|
||||
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -863,7 +831,6 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
const int nb = n / qk;
|
||||
|
||||
int ib = 0;
|
||||
float sumf = 0;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(qk == QK5_1);
|
||||
|
|
@ -901,7 +868,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc) + summs;
|
||||
*s = hsum_float_8(acc) + summs;
|
||||
#elif defined(__AVX__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
|
@ -935,32 +902,14 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc) + summs;
|
||||
|
||||
*s = hsum_float_8(acc) + summs;
|
||||
#else
|
||||
UNUSED(nb);
|
||||
UNUSED(ib);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
||||
int sumi0 = 0;
|
||||
int sumi1 = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
|
||||
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
|
||||
|
||||
sumi0 += (x0 * y[ib].qs[j]);
|
||||
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -1017,7 +966,6 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
}
|
||||
|
||||
sumf = hsum_float_8(accum);
|
||||
|
||||
#endif
|
||||
for (; ib < nb; ++ib) {
|
||||
int sumi = 0;
|
||||
|
|
@ -1157,44 +1105,10 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = hsum_float_8(sumf);
|
||||
|
||||
#else
|
||||
const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
int sum = 0;
|
||||
|
||||
for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
|
||||
for (size_t l = 0; l < 5; ++l) {
|
||||
for (size_t m = 0; m < 32; ++m) {
|
||||
uint8_t q = x[i].qs[j + m] * pow3[l];
|
||||
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
||||
sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
|
||||
}
|
||||
}
|
||||
}
|
||||
for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
|
||||
for (size_t l = 0; l < 5; ++l) {
|
||||
for (size_t m = 0; m < 16; ++m) {
|
||||
uint8_t q = x[i].qs[j + m] * pow3[l];
|
||||
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
||||
sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t l = 0; l < 4; ++l) {
|
||||
for (size_t j = 0; j < sizeof(x->qh); ++j) {
|
||||
uint8_t q = x[i].qh[j] * pow3[l];
|
||||
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
||||
sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
|
||||
}
|
||||
}
|
||||
|
||||
sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1257,25 +1171,10 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = hsum_float_8(sumf);
|
||||
|
||||
#else
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
int32_t sumi = 0;
|
||||
|
||||
for (size_t j = 0; j < sizeof(x->qs); j += 32) {
|
||||
for (size_t l = 0; l < 4; ++l) {
|
||||
for (size_t k = 0; k < 32; ++k) {
|
||||
sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
sumf += (float) sumi * d;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1464,45 +1363,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = hsum_float_8(acc);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const uint8_t * q2 = x[i].qs;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * sc = x[i].scales;
|
||||
|
||||
int summs = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
summs += y[i].bsums[j] * (sc[j] >> 4);
|
||||
}
|
||||
|
||||
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
int isum = 0;
|
||||
int is = 0;
|
||||
int d;
|
||||
for (int k = 0; k < QK_K/128; ++k) {
|
||||
int shift = 0;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
d = sc[is++] & 0xF;
|
||||
int isuml = 0;
|
||||
for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
||||
isum += d * isuml;
|
||||
d = sc[is++] & 0xF;
|
||||
isuml = 0;
|
||||
for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
||||
isum += d * isuml;
|
||||
shift += 2;
|
||||
q8 += 32;
|
||||
}
|
||||
q2 += 32;
|
||||
}
|
||||
sumf += dall * isum - dmin * summs;
|
||||
}
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1769,70 +1633,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = hsum_float_8(acc);
|
||||
|
||||
#else
|
||||
// scalar version
|
||||
// This function is written like this so the compiler can manage to vectorize most of it
|
||||
// Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
|
||||
// manually vectorized version above. Every other version I tried would run at least 4 times slower.
|
||||
// The ideal situation would be if we could just write the code once, and the compiler would
|
||||
// automatically produce the best possible set of machine instructions, instead of us having to manually
|
||||
// write vectorized versions for AVX, ARM_NEON, etc.
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
uint32_t auxs[4];
|
||||
const int8_t * scales = (const int8_t*)auxs;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
||||
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
||||
a += 32; m <<= 1;
|
||||
q3 += 32;
|
||||
}
|
||||
a = aux8;
|
||||
|
||||
memcpy(auxs, x[i].scales, 12);
|
||||
uint32_t tmp = auxs[2];
|
||||
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
||||
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
||||
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
||||
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -2002,61 +1809,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
|
||||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
a += 32;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
a += 32; q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -2259,66 +2019,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = hsum_float_8(acc) + summs;
|
||||
|
||||
#else
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
uint8_t m = 1;
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||
a += 32; m <<= 1;
|
||||
q4 += 32;
|
||||
}
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
const uint32_t uaux = utmp[1] & kmask1;
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[2] = uaux;
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
int32_t scale = scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(kmask1);
|
||||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -2520,47 +2228,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = hsum_float_8(acc);
|
||||
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
int16_t aux16[8];
|
||||
float sums [8];
|
||||
int32_t aux32[8];
|
||||
memset(sums, 0, 8*sizeof(float));
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
memset(aux32, 0, 8*sizeof(int32_t));
|
||||
int8_t * GGML_RESTRICT a = aux8;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
||||
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
||||
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
||||
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
||||
}
|
||||
a += 128;
|
||||
q4 += 64;
|
||||
qh += 32;
|
||||
}
|
||||
a = aux8;
|
||||
int is = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
int scale = x[i].scales[is++];
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -2712,34 +2383,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
*s = 0.125f * hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
uint32_t aux32[2];
|
||||
const uint8_t * aux8 = (const uint8_t *)aux32;
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
memcpy(aux32, q2, 2*sizeof(uint32_t));
|
||||
q2 += 4;
|
||||
const uint32_t ls = 2*(aux32[1] >> 28) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += sumi * ls;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = 0.125f * sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -3033,42 +2680,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
*s = 0.125f * hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT sc = x[i].scales;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
|
||||
const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += sumi * ls1;
|
||||
sumi = 0;
|
||||
for (int l = 2; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += sumi * ls2;
|
||||
q2 += 4;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = 0.125f * sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -3250,47 +2865,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = 0.125f * hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint8_t * qh = x[i].qh;
|
||||
const uint8_t * signs = qs + QK_K/8;
|
||||
|
||||
int bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
||||
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
for (int l = 2; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
bsum += ls1 * sumi1 + ls2 * sumi2;
|
||||
qs += 4;
|
||||
signs += 4;
|
||||
}
|
||||
|
||||
sumf += d * bsum;
|
||||
}
|
||||
|
||||
*s = 0.125f * sumf;
|
||||
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
|
|
@ -3410,36 +2989,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
*s = 0.25f * hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
uint32_t aux32;
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
|
||||
const uint32_t ls = 2*(aux32 >> 28) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
q3 += 8;
|
||||
bsum += sumi * ls;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = 0.25f * sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -3646,48 +3199,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const uint8_t * GGML_RESTRICT signs = x[i].signs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
|
||||
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
qs += 8;
|
||||
signs += 4;
|
||||
bsum += sumi * ls1;
|
||||
sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
qs += 8;
|
||||
signs += 4;
|
||||
bsum += sumi * ls2;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -3811,36 +3326,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint16_t * qh = x[i].qh;
|
||||
|
||||
int sumi = 0, sumi1 = 0;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
const int ls = 2*((qh[ib] >> 12) & 7) + 1;
|
||||
const int delta = qh[ib] & 0x8000 ? -1 : 1;
|
||||
int lsum = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
lsum += q8[j] * grid[j];
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
sumi += ls * lsum;
|
||||
sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
|
||||
qs += 4;
|
||||
}
|
||||
|
||||
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -4043,52 +3532,11 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
|
||||
|
||||
#else
|
||||
|
||||
int sum1[2], sum2[2], delta[4];
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint8_t * qh = x[i].qh;
|
||||
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
||||
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
delta[0] = qh[0] & 0x08 ? -1 : 1;
|
||||
delta[1] = qh[0] & 0x80 ? -1 : 1;
|
||||
delta[2] = qh[1] & 0x08 ? -1 : 1;
|
||||
delta[3] = qh[1] & 0x80 ? -1 : 1;
|
||||
sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
|
||||
int lsum1 = 0, lsum2 = 0;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
lsum1 += q8[j] * grid[j];
|
||||
lsum2 += q8[j];
|
||||
}
|
||||
q8 += 8;
|
||||
sum1[l/2] += lsum1;
|
||||
sum2[l/2] += lsum2*delta[l];
|
||||
}
|
||||
|
||||
const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
|
||||
const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
|
||||
|
||||
sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
|
||||
sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
|
||||
qs += 4;
|
||||
qh += 2;
|
||||
}
|
||||
|
||||
sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(scale);
|
||||
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -4275,37 +3723,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
*s = hsum_float_8(accum);
|
||||
|
||||
#else
|
||||
float sumf = 0;
|
||||
for (int ibl = 0; ibl < nb; ++ibl) {
|
||||
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
|
||||
uint16_t h = x[ibl].scales_h;
|
||||
const uint8_t * qs = x[ibl].qs;
|
||||
const int8_t * q8 = y[ibl].qs;
|
||||
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
||||
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
||||
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
||||
h >>= 4;
|
||||
const float d1 = d4d8*(ls1 - 32);
|
||||
const float d2 = d4d8*(ls2 - 32);
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
||||
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
||||
}
|
||||
sumf += d1 * (sumi1 + sumi2);
|
||||
qs += 16;
|
||||
q8 += 32;
|
||||
sumi1 = sumi2 = 0;
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
||||
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
||||
}
|
||||
sumf += d2 * (sumi1 + sumi2);
|
||||
qs += 16;
|
||||
q8 += 32;
|
||||
}
|
||||
}
|
||||
*s = sumf;
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large

@@ -412,6 +412,82 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
    }
}

void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[8];
    float sum_minf[8];
    int sumi1,sumi2,sumi3,sumi4;
    int sumi;

    const block_q8_K * a_ptr = (const block_q8_K *)vy;
    for(int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
        for (int j = 0; j < ncols_interleaved; j++) {
            sumf[j] = 0.0;
            sum_minf[j] = 0.0;
        }
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (4 * blocklen)); k++) {
                const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
                const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
                const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
                const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi1 = 0;
                    sumi2 = 0;
                    sumi3 = 0;
                    sumi4 = 0;
                    sumi = 0;
                    int offset = ((k / 2) % 2) + j * 2;
                    for (int i = 0; i < blocklen; ++i){
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
                        const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
                        const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
                        const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
                        sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
                        sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
                        sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
                        sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);

                        sumi1 = sumi1 * (scales_0[offset] & 0xF);
                        sumi2 = sumi2 * (scales_1[offset] & 0xF);
                        sumi3 = sumi3 * (scales_2[offset] & 0xF);
                        sumi4 = sumi4 * (scales_3[offset] & 0xF);
                        sumi += sumi1 + sumi2 + sumi3 + sumi4;
                    }
                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
                }
            }
            for(int sb = 0; sb < 8; sb++) {
                const uint8_t *mins = b_ptr[l].scales + sb * 16;
                for(int j = 0; j < ncols_interleaved; j++){
                    sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) {
            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
        }
    }
}
|
||||
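// Illustrative sketch (not part of the patch): how ggml_gemv_q2_K_8x8_q8_K_generic
// above unpacks one interleaved byte. Each byte of b_ptr[l].qs carries four 2-bit
// weights for the same output column, and the four values pair with q8 activations
// that sit 32 entries apart, i.e. four different 32-wide sub-blocks. Standalone C++,
// hypothetical helper name.
#include <cstdint>
static inline void unpack_q2_byte(uint8_t byte, int v[4]) {
    v[0] = (byte >> 0) & 3; // pairs with q8[... + i]
    v[1] = (byte >> 2) & 3; // pairs with q8[... + i + 32]
    v[2] = (byte >> 4) & 3; // pairs with q8[... + i + 64]
    v[3] = (byte >> 6) & 3; // pairs with q8[... + i + 96]
}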
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -711,6 +787,97 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
        }
    }

void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4][8];
    float sum_minf[4][8];
    int sumi1, sumi2, sumi3, sumi4;
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumf[m][j] = 0.0;
                    sum_minf[m][j] = 0.0;
                }
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (4 * blocklen)); k++) {

                    const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64;
                    const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
                    const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
                    const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi1 = 0;
                            sumi2 = 0;
                            sumi3 = 0;
                            sumi4 = 0;
                            sumi = 0;
                            int offset = ((k / 2) % 2) + j * 2;
                            for (int i = 0; i < blocklen; ++i){
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
                                const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
                                const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
                                const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
                                sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
                                sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
                                sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
                                sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
                                sumi1 = sumi1 * (scales_0[offset] & 0xF);
                                sumi2 = sumi2 * (scales_1[offset] & 0xF);
                                sumi3 = sumi3 * (scales_2[offset] & 0xF);
                                sumi4 = sumi4 * (scales_3[offset] & 0xF);
                                sumi += sumi1 + sumi2 + sumi3 + sumi4;
                            }
                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                        }
                    }
                }
                for (int sb = 0; sb < 8; sb++) {
                    const uint8_t *mins = b_ptr[l].scales + sb * 16;
                    for (int m = 0; m < 4; m++) {
                        const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                        for (int j = 0; j < ncols_interleaved; j++) {
                            int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2) + 1] >> 4) * bsums[1]);
                            sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                        }
                    }
                }
            }

            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
                }
            }
        }
    }
}
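// Illustrative sketch (not part of the patch): where a 4x8 tile computed by
// ggml_gemm_q2_K_8x8_q8_K_generic above lands in the output. bs is the row stride of
// s in floats; y/x index the 4-row and 8-column groups, m/j the position inside the
// tile. Standalone C++, hypothetical helper name.
#include <cstddef>
static inline size_t q2k_gemm_out_index(int y, int m, int x, int j, size_t bs) {
    // mirrors s[(y * 4 + m) * bs + x * ncols_interleaved + j] with ncols_interleaved == 8
    return (size_t)(y * 4 + m) * bs + (size_t)x * 8 + (size_t)j;
}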
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -914,6 +1081,50 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
    return out;
}

static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
    block_q2_Kx8 out;

    // Delta (scale) and dmin values of the eight Q2_K structures are copied into the output interleaved structure
    for (int i = 0; i < 8; i++) {
        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
    }

    for (int i = 0; i < 8; i++) {
        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
    }

    const int end = QK_K * 2 / blck_size_interleave;

    // Interleave Q2_K quants by taking 8 bytes at a time
    for (int i = 0; i < end; ++i) {
        int src_id = i % 8;
        int src_offset = (i / 8) * blck_size_interleave;
        int dst_offset = i * blck_size_interleave;

        uint64_t elems;
        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
    }

    // The logic below unpacks and rearranges the scale and min values of Q2_K.
    // The Q2_K structure packs 16 scales and 16 mins into 16 bytes (4 bits per value).
    // The output Q2_Kx8 structure has 128 bytes for storing scales and mins.
    // Each group of 16 output bytes holds the scale/min bytes of the corresponding sub-blocks of the eight Q2_K structures.
    // For example, the first 16 bytes contain the scales and mins of the first and second sub-blocks of all eight Q2_K structures.

    for (int i = 0; i < 128; i++) {

        // Index for selecting which Q2_K super-block
        int src1 = (i % 16) / 2;
        // Index for selecting the scale/min byte within it
        int src2 = ((i / 16) * 2) + (i % 2);

        out.scales[i] = in[src1].scales[src2];
    }
    return out;
}
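// Illustrative sketch (not part of the patch): the scale/min shuffle performed by the
// final loop of make_block_q2_Kx8 above, printed for the first few output bytes. It
// only reproduces the index arithmetic; standalone C++.
#include <cstdio>
int main() {
    for (int i = 0; i < 8; i++) {
        const int src1 = (i % 16) / 2;             // which of the eight Q2_K blocks
        const int src2 = ((i / 16) * 2) + (i % 2); // which scale/min byte within it
        // prints: out.scales[0] = in[0].scales[0], out.scales[1] = in[0].scales[1],
        //         out.scales[2] = in[1].scales[0], out.scales[3] = in[1].scales[1], ...
        std::printf("out.scales[%d] = in[%d].scales[%d]\n", i, src1, src2);
    }
    return 0;
}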
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
@@ -975,6 +1186,37 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
    GGML_UNUSED(data_size);
}

static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
    GGML_ASSERT(interleave_block == 8);
    constexpr int nrows_interleaved = 8;

    block_q2_Kx8 * dst = (block_q2_Kx8 *) t->data;
    const block_q2_K * src = (const block_q2_K *) data;
    block_q2_K dst_tmp[8];
    int nrow = ggml_nrows(t);
    int nblocks = t->ne[0] / QK_K;

    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));

    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
        return -1;
    }

    for (int b = 0; b < nrow; b += nrows_interleaved) {
        for (int64_t x = 0; x < nblocks; x++) {
            for (int i = 0; i < nrows_interleaved; i++) {
                dst_tmp[i] = src[x + i * nblocks];
            }
            *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
        }
        src += nrows_interleaved * nblocks;
    }
    return 0;

    GGML_UNUSED(data_size);
}
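// Illustrative sketch (not part of the patch): the number of interleaved blocks
// written by repack_q2_K_to_q2_K_8_bl above. Rows are consumed eight at a time and
// every 8-row group emits one block_q2_Kx8 per QK_K-wide slice of the row.
// Standalone C++ apart from QK_K, which comes from the surrounding ggml headers;
// the helper name is hypothetical.
static inline long q2_Kx8_block_count(long ne0, long nrows) {
    const long nblocks_per_row = ne0 / QK_K; // t->ne[0] / QK_K in the code above
    return (nrows / 8) * nblocks_per_row;    // one output block per 8-row group and slice
}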
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
    GGML_ASSERT(interleave_block == 8);
@@ -1095,6 +1337,10 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
    return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
}

template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
}

template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
}

@@ -1124,6 +1370,10 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
    ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

@@ -1148,6 +1398,10 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
    ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

@@ -1421,6 +1675,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;

    // instance for Q2
    static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;

    // instance for IQ4
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;

@@ -1446,6 +1703,12 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                return &q4_K_8x8_q8_K;
            }
        }
    } else if (cur->type == GGML_TYPE_Q2_K) {
        if (ggml_cpu_has_avx512()) {
            if (cur->ne[1] % 8 == 0) {
                return &q2_K_8x8_q8_K;
            }
        }
    } else if (cur->type == GGML_TYPE_IQ4_NL) {
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
            if (cur->ne[1] % 4 == 0) {
@@ -44,7 +44,14 @@ struct block_q4_Kx8 {
};

static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
struct block_q2_Kx8 {
    ggml_half d[8];      // super-block scale for quantized scales
    ggml_half dmin[8];   // super-block scale for quantized mins
    uint8_t scales[128]; // scales and mins, quantized with 4 bits
    uint8_t qs[512];     // 2-bit quants
};

static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
struct block_q8_Kx4 {
    float d[4];          // delta
    int8_t qs[QK_K * 4]; // quants
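// Illustrative sketch (not part of the patch): the arithmetic behind the q2_K
// static_assert above, assuming QK_K == 256 and a 2-byte ggml_half. One block_q2_Kx8
// is 16*2 (d, dmin) + 128 (scales/mins) + 512 (2-bit quants) = 672 bytes, which is
// exactly eight times the 84 bytes (2 + 2 + 16 + 64) of a plain block_q2_K, so the
// repacked layout adds no memory overhead.
static_assert(16 * 2 + 128 + 512 == 672, "interleaved q2_K block is 672 bytes");
static_assert(8 * (2 + 2 + 16 + 64) == 672, "equals eight plain q2_K blocks");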
@@ -71,11 +78,13 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

// Native implementations

@@ -86,11 +95,13 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

#if defined(__cplusplus)
@ -56,7 +56,7 @@
|
|||
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
|
||||
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
|
||||
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
|
||||
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
|
||||
#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
|
||||
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum for acc register renaming
|
||||
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
|
||||
|
||||
|
|
@ -72,8 +72,9 @@
|
|||
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
|
||||
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
|
||||
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
|
||||
|
||||
// Moore Threads
|
||||
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
|
||||
|
|
@ -175,7 +176,7 @@ static const char * cu_get_error_str(CUresult err) {
|
|||
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
|
||||
#endif
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
|
||||
do { \
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
|
||||
|
|
@ -190,7 +191,7 @@ static const char * cu_get_error_str(CUresult err) {
|
|||
do { \
|
||||
GGML_UNUSED(nbytes); \
|
||||
} while (0)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
|
||||
#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
|
||||
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
||||
|
|
@ -210,9 +211,9 @@ typedef float2 dfloat2;
|
|||
#define GGML_USE_VMM
|
||||
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
|
||||
|
||||
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
#define FP16_AVAILABLE
|
||||
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
|
||||
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
||||
#define FAST_FP16_AVAILABLE
|
||||
|
|
@ -226,13 +227,17 @@ typedef float2 dfloat2;
|
|||
#define FP16_MMA_AVAILABLE
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
#define NEW_MMA_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
#if defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
|
||||
#define AMD_MFMA_AVAILABLE
|
||||
#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
#define NEW_MMA_AVAILABLE
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#define CP_ASYNC_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
|
||||
#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
|
||||
#define FLASH_ATTN_AVAILABLE
|
||||
|
|
@ -254,7 +259,7 @@ static bool fast_fp16_hardware_available(const int cc) {
|
|||
|
||||
// Any FP16 tensor core instructions are available for ggml code.
|
||||
static bool fp16_mma_available(const int cc) {
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
#if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
return false;
|
||||
#else
|
||||
if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
|
||||
|
|
@ -270,7 +275,7 @@ static bool fp16_mma_available(const int cc) {
|
|||
} else {
|
||||
return false;
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
#endif // defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
}
|
||||
|
||||
// To be used for feature selection of external libraries, e.g. cuBLAS.
|
||||
|
|
@ -288,6 +293,14 @@ static bool fp32_mma_hardware_available(const int cc) {
|
|||
return GGML_CUDA_CC_IS_CDNA(cc);
|
||||
}
|
||||
|
||||
static bool amd_mfma_available(const int cc) {
|
||||
#if !defined(GGML_HIP_NO_MMQ_MFMA)
|
||||
return GGML_CUDA_CC_IS_CDNA(cc);
|
||||
#else
|
||||
return false;
|
||||
#endif //!defined(GGML_HIP_NO_MMQ_MFMA)
|
||||
}
|
||||
|
||||
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
||||
static bool new_mma_available(const int cc) {
|
||||
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
|
||||
|
|
@ -298,25 +311,25 @@ static bool cp_async_available(const int cc) {
|
|||
}
|
||||
|
||||
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
return 64;
|
||||
#else
|
||||
return 32;
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
}
|
||||
|
||||
[[noreturn]]
|
||||
static __device__ void no_device_code(
|
||||
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
|
||||
file_name, line, function_name, arch);
|
||||
GGML_UNUSED(arch_list);
|
||||
#else
|
||||
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
|
||||
file_name, line, function_name, arch, arch_list);
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
__trap();
|
||||
|
||||
GGML_UNUSED(no_device_code); // suppress unused function warning
|
||||
|
|
@ -353,7 +366,7 @@ struct ggml_cuda_unroll<1> {
|
|||
|
||||
template<int width = WARP_SIZE>
|
||||
static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
return __reduce_add_sync(0xffffffff, x);
|
||||
#else
|
||||
#pragma unroll
|
||||
|
|
@ -361,7 +374,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
|||
x += __shfl_xor_sync(0xffffffff, x, offset, width);
|
||||
}
|
||||
return x;
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
}
|
||||
|
||||
template<int width = WARP_SIZE>
|
||||
|
|
@@ -418,6 +431,20 @@ static __global__ void reduce_rows_f32(const float * x, float * dst, const int n
    dst[row] = norm ? sum / ncols : sum;
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_all(int x) {
#ifdef GGML_USE_HIP
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = x && __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
#else
    static_assert(width == WARP_SIZE, "width != WARP_SIZE not implemented");
    return __all_sync(0xffffffff, x);
#endif // GGML_USE_HIP
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
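// Illustrative sketch (not part of the patch): how the new warp_reduce_all() above is
// meant to be used. Every lane contributes a predicate and every lane receives the
// logical AND of all predicates in the warp. The kernel below is a hypothetical
// standalone example (assuming a single warp-sized launch), not code from ggml.
static __global__ void all_values_finite(const float * x, int * out) {
    const float v       = x[threadIdx.x];
    const int   lane_ok = isfinite(v) ? 1 : 0;       // per-lane predicate
    const int   all_ok  = warp_reduce_all(lane_ok);  // 1 iff every lane's value is finite
    if (threadIdx.x == 0) {
        *out = all_ok;
    }
}
// Hypothetical launch: all_values_finite<<<1, WARP_SIZE>>>(d_x, d_out);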
@ -430,11 +457,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
|
|||
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
|
||||
#ifdef FP16_AVAILABLE
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
||||
#if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
|
||||
return __float2half(fmaxf(__half2float(a), __half2float(b)));
|
||||
#else
|
||||
return __hmax(a, b);
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
||||
#endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
|
||||
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
|
|
@ -462,7 +489,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
|
|||
|
||||
template<int width = WARP_SIZE>
|
||||
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
|
||||
#pragma unroll
|
||||
for (int offset = width/2; offset > 0; offset >>= 1) {
|
||||
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
|
||||
|
|
@ -471,7 +498,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
|||
#else
|
||||
GGML_UNUSED(x);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
|
||||
}
|
||||
|
||||
#if CUDART_VERSION < CUDART_HMASK
|
||||
|
|
@ -483,7 +510,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
|
|||
#endif // CUDART_VERSION < CUDART_HMASK
|
||||
|
||||
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
|
||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||
#elif defined(RDNA3) || defined(RDNA4)
|
||||
|
|
@ -509,7 +536,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
|||
#endif
|
||||
return c;
|
||||
|
||||
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#else // defined(GGML_USE_HIP)
|
||||
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
|
||||
return __dp4a(a, b, c);
|
||||
|
|
@ -519,7 +546,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
|||
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
|
||||
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
}
|
||||
|
||||
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
@ -15,6 +15,7 @@ typedef void (* fattn_kernel_t)(
|
|||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
|
|
@ -500,6 +501,55 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
|
|||
nullptr;
|
||||
}
|
||||
|
||||
template <int ncols1>
|
||||
__launch_bounds__(FATTN_KQ_STRIDE/2, 1)
|
||||
static __global__ void flash_attn_mask_to_KV_max(
|
||||
const half2 * __restrict__ mask, int * __restrict__ KV_max, const int ne30, const int s31, const int s33) {
|
||||
const int ne31 = gridDim.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int sequence = blockIdx.y;
|
||||
const int jt = blockIdx.x;
|
||||
|
||||
mask += sequence*s33 + jt*ncols1*s31;
|
||||
|
||||
__shared__ int buf_iw[WARP_SIZE];
|
||||
if (tid < WARP_SIZE) {
|
||||
buf_iw[tid] = 1;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
int KV_max_sj = (ne30 - 1) * FATTN_KQ_STRIDE;
|
||||
for (; KV_max_sj >= 0; KV_max_sj -= FATTN_KQ_STRIDE) {
|
||||
int all_inf = 1;
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols1; ++j) {
|
||||
const float2 tmp = __half22float2(mask[j*s31 + KV_max_sj/2 + tid]);
|
||||
all_inf = all_inf && int(isinf(tmp.x)) && int(isinf(tmp.y));
|
||||
}
|
||||
|
||||
all_inf = warp_reduce_all(all_inf);
|
||||
if (tid % WARP_SIZE == 0) {
|
||||
buf_iw[tid / WARP_SIZE] = all_inf;
|
||||
}
|
||||
__syncthreads();
|
||||
all_inf = buf_iw[tid % WARP_SIZE];
|
||||
__syncthreads();
|
||||
all_inf = warp_reduce_all(all_inf);
|
||||
|
||||
if (!all_inf) {
|
||||
KV_max_sj += FATTN_KQ_STRIDE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
KV_max[sequence*ne31 + jt] = KV_max_sj;
|
||||
}
|
||||
|
||||
template<int D, int ncols1, int ncols2> // D == head size
|
||||
__launch_bounds__(D, 1)
|
||||
static __global__ void flash_attn_stream_k_fixup(
|
||||
|
|
@ -592,9 +642,9 @@ static __global__ void flash_attn_stream_k_fixup(
|
|||
}
|
||||
|
||||
template<int D> // D == head size
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !defined(GGML_USE_HIP)
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
static __global__ void flash_attn_combine_results(
|
||||
const float * __restrict__ VKQ_parts,
|
||||
const float2 * __restrict__ VKQ_meta,
|
||||
|
|
@ -711,6 +761,7 @@ void launch_fattn(
|
|||
|
||||
ggml_cuda_pool_alloc<half> K_f16(pool);
|
||||
ggml_cuda_pool_alloc<half> V_f16(pool);
|
||||
ggml_cuda_pool_alloc<int> KV_max(pool);
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool);
|
||||
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
|
||||
|
||||
|
|
@ -779,11 +830,30 @@ void launch_fattn(
|
|||
V_data = (char *) V_f16.ptr;
|
||||
}
|
||||
|
||||
int parallel_blocks = 1;
|
||||
|
||||
const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1);
|
||||
const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3];
|
||||
|
||||
    // Optional optimization where the mask is scanned to determine whether part of the calculation can be skipped.
    // Only worth the overhead if there is at least one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
    // multiple sequences of possibly different lengths.
|
||||
if (mask && (Q->ne[1] >= 1024 || Q->ne[3] > 1)) {
|
||||
const int s31 = mask->nb[1] / sizeof(half2);
|
||||
const int s33 = mask->nb[3] / sizeof(half2);
|
||||
|
||||
const dim3 blocks_num_KV_max(ntiles_x, Q->ne[3], 1);
|
||||
const dim3 block_dim_KV_max(FATTN_KQ_STRIDE/2, 1, 1);
|
||||
|
||||
const int ne_KV_max = blocks_num_KV_max.x*blocks_num_KV_max.y;
|
||||
const int iter_k = K->ne[1] / FATTN_KQ_STRIDE;
|
||||
|
||||
KV_max.alloc(ne_KV_max);
|
||||
flash_attn_mask_to_KV_max<ncols1><<<blocks_num_KV_max, block_dim_KV_max, 0, main_stream>>>
|
||||
((const half2 *) mask->data, KV_max.ptr, iter_k, s31, s33);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
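// Illustrative sketch (not part of the patch): the same back-to-front scan that
// flash_attn_mask_to_KV_max performs on the GPU, written as plain host code for a
// single mask row. The function name and the tile parameter (which plays the role of
// FATTN_KQ_STRIDE) are hypothetical; only the scan logic mirrors the kernel above.
#include <cmath>
static int kv_max_for_row(const float * mask_row, int n_kv, int tile) {
    // Walk tiles from the end of the KV dimension towards the front and stop at the
    // first tile that contains at least one unmasked (finite) entry.
    for (int start = ((n_kv / tile) - 1) * tile; start >= 0; start -= tile) {
        bool all_masked = true;
        for (int i = 0; i < tile; ++i) {
            all_masked = all_masked && std::isinf(mask_row[start + i]);
        }
        if (!all_masked) {
            return start + tile; // KV positions in [0, start + tile) must still be processed
        }
    }
    return 0; // the whole row is masked out, nothing to compute
}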
|
||||
|
||||
int parallel_blocks = 1;
|
||||
|
||||
const dim3 block_dim(warp_size, nwarps, 1);
|
||||
int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
|
||||
CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
|
||||
|
|
@ -870,6 +940,7 @@ void launch_fattn(
|
|||
K_data,
|
||||
V_data,
|
||||
mask ? ((const char *) mask->data) : nullptr,
|
||||
KV_max.ptr,
|
||||
!stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
|
||||
scale, max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], Q->nb[1], Q->nb[2], Q->nb[3],
@ -392,7 +392,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
|
|||
}
|
||||
}
|
||||
|
||||
template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap, bool mla, bool needs_fixup, bool is_fixup, bool last_iter>
|
||||
template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles,
|
||||
bool use_logit_softcap, bool mla, bool needs_fixup, bool is_fixup, bool last_iter>
|
||||
static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
const float2 * const __restrict__ Q_f2,
|
||||
const half2 * const __restrict__ K_h2,
|
||||
|
|
@ -922,7 +923,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
|||
}
|
||||
|
||||
// Iterate over ne11 == previous tokens:
|
||||
for (int kb0 = kb0_start; kb0 < kb0_stop-1; ++kb0) {
|
||||
int kb0 = kb0_start;
|
||||
for (; kb0 < kb0_stop-1; ++kb0) {
|
||||
constexpr bool last_iter = false;
|
||||
flash_attn_ext_f16_iter<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter>
|
||||
(Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap,
|
||||
|
|
@ -932,7 +934,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
|||
constexpr bool last_iter = true;
|
||||
flash_attn_ext_f16_iter<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter>
|
||||
(Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap,
|
||||
ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0_stop-1);
|
||||
ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0);
|
||||
}
|
||||
|
||||
// With multi-stage loading there is no __syncthreads at the end of the iter,
|
||||
|
|
@ -1204,6 +1206,7 @@ static __global__ void flash_attn_ext_f16(
|
|||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
|
|
@ -1280,7 +1283,11 @@ static __global__ void flash_attn_ext_f16(
|
|||
const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f;
|
||||
|
||||
const int kb0_start_kernel = kb0_start * kb_niter;
|
||||
const int kb0_stop_kernel = kb0_stop * kb_niter;
|
||||
int kb0_stop_kernel = kb0_stop * kb_niter;
|
||||
|
||||
if (KV_max) {
|
||||
kb0_stop_kernel = min(kb0_stop_kernel, KV_max[sequence*iter_j + jt] / c::nbatch_fa);
|
||||
}
|
||||
|
||||
constexpr bool is_fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
|
||||
if (kb0_start == 0) {
|
||||
|
|
@ -1321,7 +1328,11 @@ static __global__ void flash_attn_ext_f16(
|
|||
const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f;
|
||||
|
||||
const int kb0_start_kernel = kb0_start * kb_niter;
|
||||
const int kb0_stop_kernel = kb0_stop * kb_niter;
|
||||
int kb0_stop_kernel = kb0_stop * kb_niter;
|
||||
|
||||
if (KV_max) {
|
||||
kb0_stop_kernel = min(kb0_stop_kernel, KV_max[sequence*iter_j + jt] / c::nbatch_fa);
|
||||
}
|
||||
|
||||
constexpr bool is_fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
|
||||
constexpr bool needs_fixup = false;
|
||||
|
|
@ -1391,24 +1402,24 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
|
|||
constexpr bool use_logit_softcap = false;
|
||||
fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shared_memory_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
|
||||
shared_memory_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shared_memory_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
|
||||
shared_memory_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
}
|
||||
|
||||
launch_fattn<DV, ncols1, ncols2>
|
||||
|
|
|
|||
|
|
@ -5,14 +5,15 @@
|
|||
#define FATTN_KQ_STRIDE_TILE_F16 64
|
||||
|
||||
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !defined(GGML_USE_HIP)
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 2)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
static __global__ void flash_attn_tile_ext_f16(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
|
|
@ -90,7 +91,8 @@ static __global__ void flash_attn_tile_ext_f16(
|
|||
|
||||
__syncthreads();
|
||||
|
||||
for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F16; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F16) {
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F16; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F16) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
half kqmax_new[ncols/nwarps];
|
||||
|
|
|
|||
|
|
@ -5,14 +5,15 @@
|
|||
#define FATTN_KQ_STRIDE_TILE_F32 32
|
||||
|
||||
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !defined(GGML_USE_HIP)
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 2)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
static __global__ void flash_attn_tile_ext_f32(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
|
|
@ -99,7 +100,8 @@ static __global__ void flash_attn_tile_ext_f32(
|
|||
|
||||
__syncthreads();
|
||||
|
||||
for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F32; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F32) {
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F32; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F32) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
float kqmax_new[ncols/nwarps];
|
||||
|
|
|
|||
|
|
@ -1,6 +1,12 @@
|
|||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
|
||||
// Currently llvm with the amdgcn target does not support unrolling loops
// that contain a break that cannot be resolved at compile time.
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||
#endif // __clang__
|
||||
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||
#ifndef GGML_USE_HIP
|
||||
__launch_bounds__(D, 1)
|
||||
|
|
@ -10,6 +16,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
|||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
|
|
@ -171,10 +178,14 @@ static __global__ void flash_attn_vec_ext_f16(
|
|||
|
||||
half2 VKQ[ncols] = {{0.0f, 0.0f}};
|
||||
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
K += blockIdx.y*D * nb11;
|
||||
V += blockIdx.y*D * nb21;
|
||||
maskh += blockIdx.y*D;
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*D,
|
||||
// Increment pointers after each loop:
|
||||
K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
|
||||
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
if (mask) {
|
||||
|
|
@ -182,29 +193,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
|||
for (int j = 0; j < ncols; ++j) {
|
||||
maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + tid];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
|
||||
// In such cases, skip the KV slice.
|
||||
// On AMD __all_sync would not work correctly because it assumes a warp size of 64.
|
||||
#ifndef GGML_USE_HIP
|
||||
bool skip = true;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float2 tmp = __half22float2(((const half2 *) maskh_shared)[j*(D/2) + i]);
|
||||
skip = skip && isinf(tmp.x) && isinf(tmp.y);
|
||||
}
|
||||
}
|
||||
if (__all_sync(0xFFFFFFFF, skip)) {
|
||||
__syncthreads();
|
||||
continue;
|
||||
}
|
||||
#endif // GGML_USE_HIP
|
||||
}
|
||||
|
||||
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
|
||||
|
|
@ -291,10 +280,6 @@ static __global__ void flash_attn_vec_ext_f16(
|
|||
}
|
||||
}
|
||||
|
||||
K += gridDim.y*D * nb11;
|
||||
V += gridDim.y*D * nb21;
|
||||
maskh += gridDim.y*D;
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
|
@ -342,6 +327,9 @@ static __global__ void flash_attn_vec_ext_f16(
|
|||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
}
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic pop
|
||||
#endif // __clang__
|
||||
|
||||
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
|
||||
void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
@ -1,6 +1,12 @@
|
|||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
|
||||
// Currently llvm with the amdgcn target does not support unrolling loops
// that contain a break that cannot be resolved at compile time.
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||
#endif // __clang__
|
||||
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||
#ifndef GGML_USE_HIP
|
||||
__launch_bounds__(D, 1)
|
||||
|
|
@ -10,6 +16,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
|||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
|
|
@ -177,10 +184,14 @@ static __global__ void flash_attn_vec_ext_f32(
|
|||
|
||||
float VKQ[ncols] = {0.0f};
|
||||
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
K += blockIdx.y*D * nb11;
|
||||
V += blockIdx.y*D * nb21;
|
||||
maskh += blockIdx.y*D;
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*D,
|
||||
// Increment pointers after each loop:
|
||||
K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
|
||||
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
if (mask) {
|
||||
|
|
@ -188,28 +199,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
|||
for (int j = 0; j < ncols; ++j) {
|
||||
maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + tid]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
|
||||
// In such cases, skip the KV slice.
|
||||
// On AMD __all_sync would not work correctly because it assumes a warp size of 64.
|
||||
#ifndef GGML_USE_HIP
|
||||
bool skip = true;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
skip = skip && isinf(maskf_shared[j*D + i]);
|
||||
}
|
||||
}
|
||||
if (__all_sync(0xFFFFFFFF, skip)) {
|
||||
__syncthreads();
|
||||
continue;
|
||||
}
|
||||
#endif // GGML_USE_HIP
|
||||
}
|
||||
|
||||
float kqmax_new_arr[ncols];
|
||||
|
|
@ -286,10 +276,6 @@ static __global__ void flash_attn_vec_ext_f32(
|
|||
}
|
||||
}
|
||||
|
||||
K += gridDim.y*D * nb11;
|
||||
V += gridDim.y*D * nb21;
|
||||
maskh += gridDim.y*D;
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
|
@ -337,6 +323,9 @@ static __global__ void flash_attn_vec_ext_f32(
|
|||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic pop
|
||||
#endif // __clang__
|
||||
|
||||
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
|
||||
void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@
|
|||
#include "fattn-wmma-f16.cuh"
|
||||
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !defined(GGML_USE_HIP)
|
||||
#include <mma.h>
|
||||
#ifdef GGML_USE_MUSA
|
||||
namespace wmma = mtmusa::wmma;
|
||||
|
|
@ -18,7 +18,7 @@ namespace wmma = nvcuda::wmma;
|
|||
#undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
|
||||
#include <rocwmma/rocwmma.hpp>
|
||||
namespace wmma = rocwmma;
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
|
||||
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
||||
|
|
@ -29,6 +29,7 @@ static __global__ void flash_attn_ext_f16(
|
|||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
const int * __restrict__ KV_max,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
|
|
@ -165,7 +166,8 @@ static __global__ void flash_attn_ext_f16(
|
|||
__syncthreads();
|
||||
|
||||
// Iterate over ne11 == previous tokens:
|
||||
for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE) {
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
|
||||
for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE) {
|
||||
// Calculate tile of KQ:
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE; i_KQ_0 += KQ_stride_tc) {
|
||||
|
|
@ -546,7 +548,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
|
|||
return;
|
||||
}
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !defined(GGML_USE_HIP)
|
||||
if (Q->ne[1] <= 8 && Q->ne[0] % warp_size == 0) {
|
||||
constexpr int cols_per_block = 8;
|
||||
switch (Q->ne[0]) {
|
||||
|
|
@ -568,7 +570,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
|
|||
}
|
||||
return;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
|
||||
if (Q->ne[1] <= 32) {
|
||||
constexpr int cols_per_block = 16;
|
||||
|
|
|
|||
|
|
@ -315,7 +315,9 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
|||
|
||||
const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
|
||||
const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
|
||||
const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion;
|
||||
const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (Q->ne[2] > 4*K->ne[2] && K->ne[1] >= 8192);
|
||||
const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && !mma_needs_data_conversion &&
|
||||
(cc < GGML_CUDA_CC_ADA_LOVELACE || mma_faster_for_rtx4000);
|
||||
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
|
||||
if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
|
||||
if (prec == GGML_PREC_DEFAULT) {
|
||||
|
|
|
|||
|
|
@ -31,7 +31,9 @@
|
|||
#include "ggml-cuda/pool2d.cuh"
|
||||
#include "ggml-cuda/quantize.cuh"
|
||||
#include "ggml-cuda/rope.cuh"
|
||||
#include "ggml-cuda/roll.cuh"
|
||||
#include "ggml-cuda/scale.cuh"
|
||||
#include "ggml-cuda/softcap.cuh"
|
||||
#include "ggml-cuda/softmax.cuh"
|
||||
#include "ggml-cuda/ssm-conv.cuh"
|
||||
#include "ggml-cuda/ssm-scan.cuh"
|
||||
|
|
@ -126,7 +128,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
|||
return err;
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
static int ggml_cuda_parse_id(char devName[]) {
|
||||
// A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
|
||||
// these values are not stable so this is susceptible to breakage
|
||||
|
|
@ -173,10 +175,10 @@ static int ggml_cuda_parse_id(char devName[]) {
|
|||
archNum += archMinor;
|
||||
return archNum;
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
#if defined(GGML_USE_HIP)
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
|
||||
{
|
||||
|
|
@ -249,7 +251,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
|||
info.devices[id].nsm = prop.multiProcessorCount;
|
||||
info.devices[id].smpb = prop.sharedMemPerBlock;
|
||||
info.devices[id].warp_size = prop.warpSize;
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
||||
|
||||
info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
|
||||
|
|
@ -279,7 +281,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
|||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
|
||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
}
|
||||
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
|
|
@ -1850,6 +1852,9 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
|
|||
ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool());
|
||||
ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool());
|
||||
|
||||
bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
|
||||
bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
|
||||
|
||||
// Handle src0
|
||||
src0_ptr = (const cuda_t *) src0->data;
|
||||
|
||||
|
|
@ -1868,6 +1873,8 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
|
|||
s11 = ne10;
|
||||
s12 = ne11*s11;
|
||||
s13 = ne12*s12;
|
||||
|
||||
is_src1_cont_2 = true;
|
||||
}
|
||||
|
||||
// Setup destination buffer
|
||||
|
|
@ -1916,15 +1923,19 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
|
|||
const int64_t r2 = ne12/ne02;
|
||||
const int64_t r3 = ne13/ne03;
|
||||
|
||||
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
|
||||
if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
|
||||
// with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
|
||||
const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
|
||||
const int64_t smb = ne12 == 1 ? s13 : s12;
|
||||
|
||||
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
||||
// use cublasGemmStridedBatchedEx
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
alpha, src0_ptr, cu_data_type_a, nb01/nb00, nb02/nb00, // strideA
|
||||
src1_ptr, cu_data_type_b, s11, s12, // strideB
|
||||
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
|
||||
alpha, src0_ptr, cu_data_type_a, nb01/nb00, sma, // strideA
|
||||
src1_ptr, cu_data_type_b, s11, smb, // strideB
|
||||
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
|
||||
ne12*ne13,
|
||||
cu_compute_type,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
|
|
@@ -2419,6 +2430,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ROPE_BACK:
             ggml_cuda_op_rope_back(ctx, dst);
             break;
+        case GGML_OP_ROLL:
+            ggml_cuda_op_roll(ctx, dst);
+            break;
         case GGML_OP_IM2COL:
             ggml_cuda_op_im2col(ctx, dst);
             break;
@@ -2766,7 +2780,12 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 }
 #endif
 
-static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops) {
+static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops, std::initializer_list<enum ggml_unary_op> unary_ops) {
+#ifndef NDEBUG
+    const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
+    GGML_ASSERT(unary_ops.size() == num_unary);
+#endif
+
     if (!ggml_can_fuse(cgraph, node_idx, ops)) {
         return false;
     }
@@ -2794,9 +2813,32 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
             return false;
         }
 
         return true;
     }
 
-    return true;
+    if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
+        && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
+        const ggml_tensor *scale = cgraph->nodes[node_idx];
+        const ggml_tensor *tanh = cgraph->nodes[node_idx+1];
+        const ggml_tensor *scale2 = cgraph->nodes[node_idx+2];
+
+        GGML_ASSERT(scale->src[0]->type == GGML_TYPE_F32);
+        GGML_ASSERT(scale->type == GGML_TYPE_F32);
+
+        if (ggml_get_unary_op(tanh) != GGML_UNARY_OP_TANH) {
+            return false;
+        }
+
+        // Check for bias
+        if (ggml_get_op_params_f32(scale, 1) != 0.0f || ggml_get_op_params_f32(scale2, 1) != 0.0f) {
+            return false;
+        }
+
+        return true;
+    }
+
+    return false;
 }
 
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
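// Note (not part of the patch): a rough standalone sketch of the shape of the check above --
// look at three consecutive graph nodes and require the op sequence SCALE, TANH, SCALE before
// fusing. The toy_op enum and the toy graph vector are illustrative assumptions only.
#include <cstddef>
#include <vector>

enum toy_op { TOY_SCALE, TOY_TANH, TOY_OTHER };

static bool can_fuse_softcap(const std::vector<toy_op> & nodes, std::size_t i) {
    if (i + 2 >= nodes.size()) {
        return false;
    }
    // same ordering requirement as the ops/unary_ops check in ggml_cuda_can_fuse
    return nodes[i] == TOY_SCALE && nodes[i + 1] == TOY_TANH && nodes[i + 2] == TOY_SCALE;
}

int main() {
    const std::vector<toy_op> graph = {TOY_OTHER, TOY_SCALE, TOY_TANH, TOY_SCALE};
    return can_fuse_softcap(graph, 1) ? 0 : 1;
}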
@@ -2817,10 +2859,18 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
         }
 
         static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
-        if (!disable_fusion && ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
-            ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
-            i++;
-            continue;
+        if (!disable_fusion) {
+            if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
+                ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
+                i++;
+                continue;
+            }
+
+            if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
+                i += 2;
+                ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
+                continue;
+            }
         }
 #ifndef NDEBUG
         assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
@@ -3411,6 +3461,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
             return max_bias == 0.0f;
         }
+        case GGML_OP_ROLL:
+            if(op->src[0]->type == GGML_TYPE_F32) {
+                return true;
+            }
+            return false;
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK: {
             return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
@@ -1,65 +1,75 @@
 #include "im2col.cuh"
 
+#define MIN(a, b) (a) < (b) ? (a) : (b)
+
+#define MAX_GRIDDIM_Z 65535
+
 template <typename T>
 static __global__ void im2col_kernel(
-        const float * x, T * dst, int64_t batch_offset,
-        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
+        const float * x, T * dst,
+        int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH,
+        int64_t IC_IH_IW, int64_t IH_IW, int64_t N_OH, int64_t KH_KW, int64_t IC_KH_KW,
         int s0, int s1, int p0, int p1, int d0, int d1) {
     const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
-    if (i >= pelements) {
+    if (i >= IC_KH_KW) {
        return;
     }
 
-    const int64_t ksize = OW * KH;
-    const int64_t kx = i / ksize;
-    const int64_t kd = kx * ksize;
-    const int64_t ky = (i - kd) / OW;
-    const int64_t ix = i % OW;
+    const int64_t iic = i / (KH_KW);
+    const int64_t rem = i - iic * KH_KW;
+    const int64_t ikh = rem / KW;
+    const int64_t ikw = rem - ikh * KW;
 
-    const int64_t oh = blockIdx.y;
-    const int64_t batch = blockIdx.z / IC;
-    const int64_t ic = blockIdx.z % IC;
+    const int64_t iow = blockIdx.y;
+    for (int64_t iz = blockIdx.z; iz < N_OH; iz+=MAX_GRIDDIM_Z) {
+        const int64_t in = iz / OH;
+        const int64_t ioh = iz - in * OH;
 
-    const int64_t iiw = ix * s0 + kx * d0 - p0;
-    const int64_t iih = oh * s1 + ky * d1 - p1;
+        const int64_t iiw = iow * s0 + ikw * d0 - p0;
+        const int64_t iih = ioh * s1 + ikh * d1 - p1;
 
-    const int64_t offset_dst =
-        ((batch * OH + oh) * OW + ix) * CHW +
-        (ic * (KW * KH) + ky * KW + kx);
+        const int64_t offset_dst =
+            ((in * OH + ioh) * OW + iow) * IC_KH_KW + iic * KH_KW + ikh * KW + ikw;
 
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] = 0.0f;
-    } else {
-        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
-        dst[offset_dst] = x[offset_src + iih * IW + iiw];
+        if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+            dst[offset_dst] = 0.0f;
+        } else {
+            const int64_t offset_src = iic * IC_IH_IW + in * IH_IW;
+            dst[offset_dst] = x[offset_src + iih * IW + iiw];
+        }
     }
 }
 
 // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
 template <typename T>
 static void im2col_cuda(const float * x, T* dst,
     int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
     int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
-    const int parallel_elements = OW * KW * KH;
-    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
-    dim3 block_nums(num_blocks, OH, batch * IC);
-    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+    const int64_t IC_KH_KW = IC * KH * KW;
+    const int64_t num_blocks = (IC_KH_KW + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+    const int64_t N_OH = N * OH;
+    const int64_t KH_KW = KW*KH;
+    dim3 block_nums(num_blocks, OW, MIN(N_OH, MAX_GRIDDIM_Z));
+    im2col_kernel<<<block_nums, MIN(IC_KH_KW, CUDA_IM2COL_BLOCK_SIZE) , 0, stream>>>(x, dst, IC, IW, IH, OH, OW, KW, KH,
+            IC_IH_IW, IH_IW, N_OH, KH_KW, IC_KH_KW,
+            s0, s1, p0, p1, d0, d1);
 }
 
 static void im2col_cuda_f16(const float * x, half * dst,
     int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
     int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
 
-    im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
+    im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
 }
 
 static void im2col_cuda_f32(const float * x, float * dst,
     int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
 
-    im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
+    im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
 }
 
 void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
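// Note (not part of the patch): a small host-side check of the index decomposition used by the
// rewritten kernel -- a flat thread index over IC*KH*KW is split into (iic, ikh, ikw) and must
// recombine to the same flat index used for the destination offset. Toy sizes only.
#include <cassert>
#include <cstdint>

int main() {
    const int64_t IC = 3, KH = 2, KW = 5;
    const int64_t KH_KW = KH * KW;
    for (int64_t i = 0; i < IC * KH_KW; ++i) {
        const int64_t iic = i / KH_KW;
        const int64_t rem = i - iic * KH_KW;
        const int64_t ikh = rem / KW;
        const int64_t ikw = rem - ikh * KW;
        assert(iic * KH_KW + ikh * KW + ikw == i);   // mirrors the offset_dst computation
    }
    return 0;
}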
@@ -91,13 +101,13 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t OH = is_2D ? dst->ne[2] : 1;
     const int64_t OW = dst->ne[1];
 
-    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
-    const int64_t batch = src1->ne[is_2D ? 3 : 2];
-    const size_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+    const int64_t IC_IH_IW = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const int64_t N = src1->ne[is_2D ? 3 : 2];
+    const int64_t IH_IW = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
 
     if(dst->type == GGML_TYPE_F16) {
-        im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
+        im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
     } else {
-        im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
+        im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
     }
 }
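// Note (not part of the patch): a tiny illustration of the "nb is byte offset" comments above.
// ggml strides nb[] are in bytes; since src1 is f32 here they are divided by 4 to obtain the
// element strides that im2col_cuda receives. The toy shape below is an assumption.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne[4] = {8, 6, 3, 2};          // e.g. {IW, IH, IC, N} for a contiguous f32 tensor
    int64_t nb[4] = {sizeof(float), 0, 0, 0};    // byte strides, as in ggml_tensor::nb
    for (int d = 1; d < 4; ++d) {
        nb[d] = nb[d - 1] * ne[d - 1];
    }
    for (int d = 0; d < 4; ++d) {
        printf("nb[%d] = %lld bytes = %lld elements\n", d, (long long) nb[d], (long long) (nb[d] / 4));
    }
    return 0;
}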
@@ -12,7 +12,8 @@
 // The methods get_i and get_j can be used to get the physical 32 bit index of the lth element of a thread within a tile.
 // All matrix tiles have ne physical 32 bit elements per warp.
 //
-// As described in the documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
+// As described in the PTX documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
+// The API in this file also assumes that the pointers for load_generic are aligned to 16 bytes, unaligned pointers are considered undefined behavior.
 
 #include "common.cuh"
 
@@ -66,7 +67,44 @@ namespace ggml_cuda_mma {
     struct tile {
         static constexpr int I = I_;
         static constexpr int J = J_;
-        static constexpr int ne = I * J / WARP_SIZE;
+
+#if defined(GGML_USE_HIP)
+        static constexpr int ne = I * J / 64;
+        T x[ne] = {0};
+
+        static __device__ __forceinline__ int get_i(const int l) {
+            if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+                return threadIdx.x % 16;
+            } else if constexpr (I == 16 && J == 8) {
+                return threadIdx.x % 16;
+            } else if constexpr (I == 32 && J == 4) {
+                return threadIdx.x % 32;
+            } else if constexpr (I == 16 && J == 16) {
+                return 4 * (threadIdx.x / 16) + l;
+            } else if constexpr (I == 32 && J == 32) {
+                return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4);
+            } else {
+                static_assert(I == -1 && J == -1, "template specialization not implemented");
+            }
+        }
+
+        static __device__ __forceinline__ int get_j(const int l) {
+            if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+                return (2 * ((threadIdx.x / 16) % 2) + l);
+            } else if constexpr (I == 16 && J == 8) {
+                return 2 * (threadIdx.x / 16) + l;
+            } else if constexpr (I == 32 && J == 4) {
+                return 2 * (threadIdx.x / 32) + l;
+            } else if constexpr (I == 16 && J == 16) {
+                return threadIdx.x % 16;
+            } else if constexpr (I == 32 && J == 32) {
+                return threadIdx.x % 32;
+            } else {
+                static_assert(I == -1 && J == -1, "template specialization not implemented");
+            }
+        }
+#else
+        static constexpr int ne = I * J / 32;
         T x[ne] = {0};
 
         static __device__ __forceinline__ int get_i(const int l) {
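// Note (not part of the patch): a host-side sanity sketch of the 16x16 mapping above. With a
// 64-lane wavefront and ne = 16*16/64 = 4 elements per lane, get_i/get_j should cover every
// (i, j) cell of the tile exactly once.
#include <cassert>

int main() {
    bool seen[16][16] = {};
    for (int lane = 0; lane < 64; ++lane) {
        for (int l = 0; l < 4; ++l) {
            const int i = 4 * (lane / 16) + l;   // same expression as get_i for I == 16, J == 16
            const int j = lane % 16;             // same expression as get_j for I == 16, J == 16
            assert(!seen[i][j]);
            seen[i][j] = true;
        }
    }
    return 0;
}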
@@ -94,6 +132,7 @@ namespace ggml_cuda_mma {
                 static_assert(I == -1 && J == -1, "template specialization not implemented");
             }
         }
+#endif // defined(GGML_USE_HIP)
     };
 
     template <int I_, int J_>
@@ -148,10 +187,23 @@ namespace ggml_cuda_mma {
 
     template <int I, int J, typename T>
     static __device__ __forceinline__ void load_generic(tile<I, J, T> & t, const T * __restrict__ xs0, const int stride) {
+#if defined(AMD_MFMA_AVAILABLE)
+        if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+#pragma unroll
+            for (int l = 0; l < t.ne; ++l) {
+                t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
+            }
+        } else {
+            int64_t * xi = (int64_t *) t.x;
+            const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
+            xi[0] = xs[0];
+        }
+#else
 #pragma unroll
         for (int l = 0; l < t.ne; ++l) {
             t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
         }
+#endif // defined(AMD_MFMA_AVAILABLE)
     }
 
     template <typename T>
@@ -186,7 +238,7 @@ namespace ggml_cuda_mma {
     template <typename T>
     static __device__ __forceinline__ void load_ldmatrix(
             tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
-#ifdef NEW_MMA_AVAILABLE
+#if defined(NEW_MMA_AVAILABLE)
         int * xi = (int * ) t.x;
         const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
         asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
@ -393,4 +445,60 @@ namespace ggml_cuda_mma {
|
|||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * acc = (int32x4_t *) D.x;
|
||||
#if defined(CDNA3)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
|
||||
((int64_t *) B.x)[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
|
||||
B.x[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1],
|
||||
B.x[1],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#endif // defined(CDNA3)
|
||||
#else
|
||||
GGML_UNUSED(D);
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // AMD_MFMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<32, 32, int> & D, const tile<32, 4, int> & A, const tile<32, 4, int> & B) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
|
||||
int32x16_t * acc = (int32x16_t *) D.x;
|
||||
#if defined(CDNA3)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
|
||||
((int64_t *) B.x)[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
|
||||
B.x[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1],
|
||||
B.x[1],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#endif // defined(CDNA3)
|
||||
#else
|
||||
GGML_UNUSED(D);
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // AMD_MFMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -109,7 +109,8 @@ void ggml_cuda_mul_mat_q(
     const int64_t s03 = src0->nb[3] / ts_src0;
     const int64_t s3 = dst->nb[3] / ts_dst;
 
-    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
+    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+        || GGML_CUDA_CC_IS_CDNA(cc);
 
     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
@@ -250,8 +251,9 @@ void ggml_cuda_op_mul_mat_q(
     // The stream-k decomposition is only faster for recent NVIDIA GPUs.
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
-        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
+    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+        || GGML_CUDA_CC_IS_CDNA(cc))
+        && src1_ncols == ne11;
     const mmq_args args = {
         src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
         ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
@@ -320,5 +322,21 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }
 
+    if (amd_mfma_available(cc)) {
+        // As of ROCM 7.0 rocblas/tensile performs very poorly on CDNA3 and hipblaslt (via ROCBLAS_USE_HIPBLASLT)
+        // performs better but is currently suffering from a crash on this architecture.
+        // TODO: Revisit when hipblaslt is fixed on CDNA3
+        if (GGML_CUDA_CC_IS_CDNA3(cc)) {
+            return true;
+        }
+        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+            return true;
+        }
+        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
+            return true;
+        }
+        return false;
+    }
+
     return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
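// Note (not part of the patch): a condensed host-side restatement of the MFMA branch of
// ggml_cuda_should_use_mmq() above, with the ggml types reduced to a toy enum for illustration.
enum toy_type { Q4_0, Q4_1, Q5_0, Q5_1, Q4_K, Q5_K, Q6_K, OTHER };

static bool use_mmq_on_mfma(bool is_cdna3, toy_type type, long ne11) {
    if (is_cdna3) {
        return true;    // BLAS is currently not the better option on CDNA3 (see the comment above)
    }
    if (ne11 <= 128 || type == Q4_0 || type == Q4_1 || type == Q5_0 || type == Q5_1) {
        return true;    // small batches and the legacy quants favor MMQ
    }
    if (ne11 <= 256 && (type == Q4_K || type == Q5_K)) {
        return true;
    }
    return false;
}

int main() {
    return use_mmq_on_mfma(false, Q6_K, 512) ? 1 : 0;   // a large Q6_K batch falls back to BLAS
}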
File diff suppressed because it is too large
@ -0,0 +1,67 @@
|
|||
#include "ggml-cuda/common.cuh"
|
||||
#include "roll.cuh"
|
||||
|
||||
static __forceinline__ __device__ int64_t wrap_index(const int64_t idx, const int64_t ne) {
|
||||
if (idx < 0) {
|
||||
return idx + ne;
|
||||
}
|
||||
if (idx >= ne) {
|
||||
return idx - ne;
|
||||
}
|
||||
return idx;
|
||||
}
|
||||
|
||||
static __global__ void roll_f32_cuda(const float * __restrict__ src,
|
||||
float * __restrict__ dst,
|
||||
const int64_t ne00,
|
||||
const int64_t ne01,
|
||||
const int64_t ne02,
|
||||
const int64_t ne03,
|
||||
const int s0,
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3) {
|
||||
const int64_t idx = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
|
||||
const int64_t n_elements = ne00 * ne01 * ne02 * ne03;
|
||||
|
||||
if (idx >= n_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i0 = idx % ne00;
|
||||
const int64_t i1 = (idx / ne00) % ne01;
|
||||
const int64_t i2 = (idx / (ne00 * ne01)) % ne02;
|
||||
const int64_t i3 = (idx / (ne00 * ne01 * ne02)) % ne03;
|
||||
|
||||
const int64_t d0 = wrap_index(i0 - s0, ne00);
|
||||
const int64_t d1 = wrap_index(i1 - s1, ne01);
|
||||
const int64_t d2 = wrap_index(i2 - s2, ne02);
|
||||
const int64_t d3 = wrap_index(i3 - s3, ne03);
|
||||
|
||||
dst[i3 * (ne00 * ne01 * ne02) + i2 * (ne01 * ne00) + i1 * ne00 + i0] =
|
||||
src[d3 * (ne00 * ne01 * ne02) + d2 * (ne01 * ne00) + d1 * ne00 + d0];
|
||||
}
|
||||
|
||||
void ggml_cuda_op_roll(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
int s0 = dst->op_params[0];
|
||||
int s1 = dst->op_params[1];
|
||||
int s2 = dst->op_params[2];
|
||||
int s3 = dst->op_params[3];
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *) dst->src[0]->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS;
|
||||
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_are_same_shape(dst->src[0], dst));
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
int64_t sz = (ne00 * ne01 * ne02 * ne03);
|
||||
int64_t num_blocks = (sz + CUDA_ROLL_BLOCK_SIZE - 1) / CUDA_ROLL_BLOCK_SIZE;
|
||||
|
||||
roll_f32_cuda<<<num_blocks, CUDA_ROLL_BLOCK_SIZE, 0, stream>>>(
|
||||
src0_d, dst_d, ne00, ne01, ne02, ne03, s0, s1, s2, s3);
|
||||
}
|
||||
|
|
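// Note (not part of the patch): a CPU reference for the roll operation implemented above, for a
// single dimension and assuming |shift| <= ne (wrap_index only wraps once, as in the kernel).
#include <cstdio>
#include <vector>

static std::vector<float> roll_1d(const std::vector<float> & src, int shift) {
    const int ne = (int) src.size();
    std::vector<float> dst(ne);
    for (int i = 0; i < ne; ++i) {
        int j = i - shift;            // dst[i] = src[wrap_index(i - shift, ne)]
        if (j < 0)   j += ne;
        if (j >= ne) j -= ne;
        dst[i] = src[j];
    }
    return dst;
}

int main() {
    const std::vector<float> v = {0, 1, 2, 3, 4};
    for (float x : roll_1d(v, 2)) printf("%g ", x);   // prints: 3 4 0 1 2
    printf("\n");
    return 0;
}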
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_ROLL_BLOCK_SIZE 256
+
+void ggml_cuda_op_roll(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -0,0 +1,34 @@
+#include "softcap.cuh"
+
+static __global__ void softcap_f32(const float * x, float * dst, const float scale, const float softcap, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = tanhf(scale * x[i]) * softcap;
+}
+
+static void softcap_f32_cuda(const float * x, float * dst, const float scale, const float softcap, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SOFTCAP_BLOCK_SIZE - 1) / CUDA_SOFTCAP_BLOCK_SIZE;
+    softcap_f32<<<num_blocks, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream>>>(x, dst, scale, softcap, k);
+}
+
+// fused GGML_OP_SCALE + GGML_UNARY_OP_TANH + GGML_OP_SCALE
+void ggml_cuda_op_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * src) {
+    const ggml_tensor * src0 = src->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float scale;
+    float softcap;
+    memcpy(&scale, (float *) src->op_params + 0, sizeof(float));
+    memcpy(&softcap, (float *) dst->op_params + 0, sizeof(float));
+
+    softcap_f32_cuda(src0_d, dst_d, scale, softcap, ggml_nelements(src0), stream);
+}
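// Note (not part of the patch): a minimal CPU reference of what the fused softcap op computes,
// i.e. the SCALE -> TANH -> SCALE chain with zero bias: y = softcap * tanh(scale * x).
#include <cmath>
#include <cstdio>

static void softcap_ref(const float * x, float * y, int n, float scale, float softcap) {
    for (int i = 0; i < n; ++i) {
        y[i] = tanhf(scale * x[i]) * softcap;   // same formula as softcap_f32 above
    }
}

int main() {
    const float x[4] = {-100.0f, -1.0f, 1.0f, 100.0f};
    float y[4];
    softcap_ref(x, y, 4, 1.0f/30.0f, 30.0f);    // outputs are capped to the open interval (-30, 30)
    for (float v : y) printf("%.3f\n", v);
    return 0;
}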
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_SOFTCAP_BLOCK_SIZE 256
+
+void ggml_cuda_op_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * src);
@@ -5,10 +5,8 @@
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
 #include <hip/hip_bfloat16.h>
-#ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
 
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -139,7 +137,7 @@
 #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
 
-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 70000000
+#if HIP_VERSION >= 70000000
 #define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
@@ -151,7 +149,11 @@
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define cublasComputeType_t hipblasDatatype_t
 #define cudaDataType_t hipblasDatatype_t
-#endif
+#endif // HIP_VERSION >= 7000000
+
+#if !defined(__HIP_PLATFORM_AMD__)
+#error "The HIP backend supports only AMD targets"
+#endif // !defined(__HIP_PLATFORM_AMD__)
 
 #define __CUDA_ARCH__ 1300
 
@@ -160,7 +162,19 @@
 #endif
 
 #if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
-#define CDNA
+#define CDNA // For the entire family
+#endif
+
+#if defined(__gfx942__)
+#define CDNA3
+#endif
+
+#if defined(__gfx90a__)
+#define CDNA2
+#endif
+
+#if defined(__gfx908__)
+#define CDNA1
 #endif
 
 #if defined(__GFX12__)
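// Note (not part of the patch): an illustrative, hypothetical use of the per-generation macros
// defined above; the real consumers are the MFMA paths in mma.cuh, which pick a builtin per arch.
static inline int mfma_i8_k_per_instruction() {
#if defined(CDNA3)
    return 32;   // e.g. __builtin_amdgcn_mfma_i32_16x16x32_i8
#elif defined(CDNA2) || defined(CDNA)
    return 16;   // e.g. __builtin_amdgcn_mfma_i32_16x16x16i8
#else
    return 0;    // no MFMA path available
#endif
}

int main() {
    return mfma_i8_k_per_instruction() >= 0 ? 0 : 1;
}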
@@ -237,7 +251,7 @@ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigne
     return c;
 }
 
-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
+#if HIP_VERSION < 50600000
 // __shfl_xor() for half2 was added in ROCm 5.6
 static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
     typedef union half2_b32 {
@@ -249,4 +263,4 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
     tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
     return tmp.val;
 }
-#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
+#endif // HIP_VERSION < 50600000
@@ -113,6 +113,10 @@ if (GGML_HIP_ROCWMMA_FATTN)
    add_compile_definitions(GGML_HIP_ROCWMMA_FATTN)
 endif()
 
+if (NOT GGML_HIP_MMQ_MFMA)
+    add_compile_definitions(GGML_HIP_NO_MMQ_MFMA)
+endif()
+
 if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0)
     add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
 endif()
@@ -82,6 +82,8 @@ set(GGML_OPENCL_KERNELS
     mul_mv_q4_0_f32_1d_16x_flat
     mul_mv_q6_k
     mul_mv_id_q4_0_f32_8x_flat
+    mul_mm_f32_f32_l4_lm
+    mul_mm_f16_f32_l4_lm
     mul
     norm
     relu
@ -33,6 +33,7 @@
|
|||
#undef MAX
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
|
||||
|
|
@ -396,11 +397,13 @@ struct ggml_backend_opencl_context {
|
|||
cl_program program_conv_2d_f16_f32;
|
||||
cl_program program_tsembd;
|
||||
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
|
||||
cl_program program_mul_mm_f32_f32_l4_lm;
|
||||
cl_program program_mul_mm_f16_f32_l4_lm;
|
||||
|
||||
cl_kernel kernel_add, kernel_add_row;
|
||||
cl_kernel kernel_mul, kernel_mul_row;
|
||||
cl_kernel kernel_div, kernel_div_row;
|
||||
cl_kernel kernel_sub, kernel_sub_row;
|
||||
cl_kernel kernel_add, kernel_add_row, kernel_add_f16, kernel_add_row_f16;
|
||||
cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16;
|
||||
cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
|
||||
cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
|
||||
cl_kernel kernel_scale;
|
||||
cl_kernel kernel_silu, kernel_silu_4;
|
||||
cl_kernel kernel_gelu, kernel_gelu_4;
|
||||
|
|
@ -450,6 +453,8 @@ struct ggml_backend_opencl_context {
|
|||
cl_kernel kernel_conv_2d_f16_f32;
|
||||
cl_kernel kernel_timestep_embedding;
|
||||
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||
cl_kernel kernel_mul_mm_f32_f32_l4_lm;
|
||||
cl_kernel kernel_mul_mm_f16_f32_l4_lm;
|
||||
|
||||
std::vector<ProfilingInfo> profiling_info;
|
||||
|
||||
|
|
@ -669,8 +674,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
backend_ctx->program_add =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_add_f16 = clCreateKernel(backend_ctx->program_add, "kernel_add_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_add_row_f16 = clCreateKernel(backend_ctx->program_add, "kernel_add_row_f16", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
|
|
@ -1040,6 +1047,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mm_f32_f32_l4_lm
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mm_f32_f32_l4_lm.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mm_f32_f32_l4_lm.cl");
|
||||
#endif
|
||||
backend_ctx->program_mul_mm_f32_f32_l4_lm =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mm_f32_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_f32_f32_l4_lm, "kernel_mul_mm_f32_f32_l4_lm", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mm_f16_f32_l4_lm
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mm_f16_f32_l4_lm.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mm_f16_f32_l4_lm.cl");
|
||||
#endif
|
||||
backend_ctx->program_mul_mm_f16_f32_l4_lm =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_l4_lm, "kernel_mul_mm_f16_f32_l4_lm", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
|
|
@ -1052,8 +1091,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
backend_ctx->program_mul =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_mul_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_mul_row_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row_f16", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
|
|
@ -1251,11 +1292,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
#else
|
||||
const std::string kernel_src = read_file("div.cl");
|
||||
#endif
|
||||
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable -cl-finite-math-only ";
|
||||
|
||||
backend_ctx->program_div =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_div_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_div_row_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_row_f16", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
|
|
@ -1271,8 +1317,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|||
backend_ctx->program_sub =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sub_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sub_row_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row_f16", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
|
|
@@ -1998,8 +2046,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
 
     backend_ctx->adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
     backend_ctx->has_vector_subgroup_broadcast =
-        backend_ctx->adreno_cl_compiler_version.major >= 47 ||
-        backend_ctx->adreno_cl_compiler_version.major == 17;
+        (backend_ctx->adreno_cl_compiler_version.type == E031 && backend_ctx->adreno_cl_compiler_version.major >= 47) ||
+        (backend_ctx->adreno_cl_compiler_version.type == DX && backend_ctx->adreno_cl_compiler_version.major >= 17);
     GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
         backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");
 
@@ -2410,12 +2458,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 default:
                     return false;
             }
-        case GGML_OP_ADD:
         case GGML_OP_SCALE:
             return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
+        case GGML_OP_ADD:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
        case GGML_OP_SUB:
-            return op->src[0]->type == GGML_TYPE_F32;
+            return (op->src[0]->type == op->src[1]->type) &&
+                   (op->src[0]->type == op->type) &&
+                   (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_GELU:
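// Note (not part of the patch): a minimal restatement of the relaxed binary-op rule above --
// src0, src1 and dst must share one type, and that type must be F32 or F16. Toy enum only.
enum toy_dtype { TOY_F32, TOY_F16, TOY_Q8_0 };

static bool binary_op_supported(toy_dtype src0, toy_dtype src1, toy_dtype dst) {
    return src0 == src1 && src0 == dst && (src0 == TOY_F32 || src0 == TOY_F16);
}

int main() {
    const bool ok  = binary_op_supported(TOY_F16, TOY_F16, TOY_F16);   // now supported
    const bool bad = binary_op_supported(TOY_F32, TOY_F16, TOY_F32);   // mixed types rejected
    return (ok && !bad) ? 0 : 1;
}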
@ -3643,35 +3694,39 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const int ne00 = src0 ? src0->ne[0] : 0;
|
||||
const int ne01 = src0 ? src0->ne[1] : 0;
|
||||
const int ne02 = src0 ? src0->ne[2] : 0;
|
||||
const int ne03 = src0 ? src0->ne[3] : 0;
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
|
||||
const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
|
||||
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
||||
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
||||
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const int ne10 = src1 ? src1->ne[0] : 0;
|
||||
const int ne11 = src1 ? src1->ne[1] : 0;
|
||||
const int ne12 = src1 ? src1->ne[2] : 0;
|
||||
const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
|
||||
const cl_ulong nb00 = src0->nb[0];
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
|
||||
const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
|
||||
const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
|
||||
const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3]; UNUSED(ne13);
|
||||
|
||||
const int ne0 = dst ? dst->ne[0] : 0;
|
||||
const int ne1 = dst ? dst->ne[1] : 0;
|
||||
const int ne2 = dst ? dst->ne[2] : 0;
|
||||
const int ne3 = dst ? dst->ne[3] : 0;
|
||||
const cl_ulong nb10 = src1->nb[0];
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
|
||||
|
||||
const cl_ulong nb0 = dst ? dst->nb[0] : 0;
|
||||
const cl_ulong nb1 = dst ? dst->nb[1] : 0;
|
||||
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
|
||||
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
||||
const int ne0 = dst->ne[0];
|
||||
const int ne1 = dst->ne[1];
|
||||
const int ne2 = dst->ne[2];
|
||||
const int ne3 = dst->ne[3];
|
||||
|
||||
const cl_ulong nb0 = dst->nb[0];
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
|
|
@ -3694,7 +3749,12 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
|
||||
bcast_row = true;
|
||||
int ne = ne00 / 4;
|
||||
kernel = backend_ctx->kernel_add_row;
|
||||
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_add_row;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_add_row_f16;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
|
|
@ -3704,7 +3764,11 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_add;
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_add;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_add_f16;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
|
|
@ -3766,35 +3830,39 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const int ne00 = src0 ? src0->ne[0] : 0;
|
||||
const int ne01 = src0 ? src0->ne[1] : 0;
|
||||
const int ne02 = src0 ? src0->ne[2] : 0;
|
||||
const int ne03 = src0 ? src0->ne[3] : 0;
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
|
||||
const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
|
||||
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
||||
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
||||
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const int ne10 = src1 ? src1->ne[0] : 0;
|
||||
const int ne11 = src1 ? src1->ne[1] : 0;
|
||||
const int ne12 = src1 ? src1->ne[2] : 0;
|
||||
const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
|
||||
const cl_ulong nb00 = src0->nb[0];
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
|
||||
const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
|
||||
const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
|
||||
const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3]; UNUSED(ne13);
|
||||
|
||||
const int ne0 = dst ? dst->ne[0] : 0;
|
||||
const int ne1 = dst ? dst->ne[1] : 0;
|
||||
const int ne2 = dst ? dst->ne[2] : 0;
|
||||
const int ne3 = dst ? dst->ne[3] : 0;
|
||||
const cl_ulong nb10 = src1->nb[0];
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
|
||||
|
||||
const cl_ulong nb0 = dst ? dst->nb[0] : 0;
|
||||
const cl_ulong nb1 = dst ? dst->nb[1] : 0;
|
||||
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
|
||||
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
||||
const int ne0 = dst->ne[0];
|
||||
const int ne1 = dst->ne[1];
|
||||
const int ne2 = dst->ne[2];
|
||||
const int ne3 = dst->ne[3];
|
||||
|
||||
const cl_ulong nb0 = dst->nb[0];
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
|
|
@ -3817,7 +3885,12 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
|
||||
bcast_row = true;
|
||||
int ne = ne00 / 4;
|
||||
kernel = backend_ctx->kernel_mul_row;
|
||||
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_mul_row;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_mul_row_f16;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
|
|
@ -3827,7 +3900,11 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_mul;
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_mul;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_mul_f16;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
|
|
@ -3889,6 +3966,10 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
|
|
@ -3937,7 +4018,12 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
|
||||
bcast_row = true;
|
||||
int ne = ne00 / 4;
|
||||
kernel = backend_ctx->kernel_div_row;
|
||||
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_div_row;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_div_row_f16;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
|
|
@ -3947,7 +4033,11 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_div;
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_div;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_div_f16;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
|
|
@ -3997,6 +4087,10 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
|
|
@ -4045,7 +4139,12 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
|
||||
bcast_row = true;
|
||||
int ne = ne00 / 4;
|
||||
kernel = backend_ctx->kernel_sub_row;
|
||||
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_sub_row;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_sub_row_f16;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
|
|
@ -4055,7 +4154,11 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_sub;
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_sub;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_sub_f16;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
|
|
@ -5297,18 +5400,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
|
||||
src0->ne[1] > 32 && // M > 32
|
||||
src1->ne[1] > 32 && // N > 32
|
||||
src0->ne[0] > 32 && // K > 32
|
||||
src0->ne[2] == 1 && src0->ne[3] == 1 &&
|
||||
src1->ne[2] == 1 && src1->ne[3] == 1 &&
|
||||
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
|
||||
backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
|
||||
ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
|
@ -5655,6 +5746,101 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
} // if (ne01 && ne1)
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
// GEMM using local memory
|
||||
// Current BK = 16, so ne00 % 16 == 0
|
||||
if (ggml_is_contiguous(src0) &&
|
||||
ggml_is_contiguous(src1) &&
|
||||
src1t == GGML_TYPE_F32 &&
|
||||
ne00 % 16 == 0 &&
|
||||
ne11 > 1) {
|
||||
switch(src0t) {
|
||||
case GGML_TYPE_F32: {
|
||||
kernel = backend_ctx->kernel_mul_mm_f32_f32_l4_lm;
|
||||
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
|
||||
|
||||
int batch_stride_a = ne00*ne01;
|
||||
int batch_stride_b = ne10*ne11;
|
||||
int batch_stride_d = ne0*ne1;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
|
||||
|
||||
// 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
|
||||
size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
|
||||
size_t local_work_size[] = {(size_t)nth0, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
return;
|
||||
}
|
||||
case GGML_TYPE_F16: {
|
||||
kernel = backend_ctx->kernel_mul_mm_f16_f32_l4_lm;
|
||||
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
|
||||
|
||||
int batch_stride_a = ne00*ne01;
|
||||
int batch_stride_b = ne10*ne11;
|
||||
int batch_stride_d = ne0*ne1;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
|
||||
|
||||
// 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
|
||||
size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
|
||||
size_t local_work_size[] = {(size_t)nth0, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
return;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
|
||||
src0->ne[1] > 32 && // M > 32
|
||||
src1->ne[1] > 32 && // N > 32
|
||||
src0->ne[0] > 32 && // K > 32
|
||||
src0->ne[2] == 1 && src0->ne[3] == 1 &&
|
||||
src1->ne[2] == 1 && src1->ne[3] == 1 &&
|
||||
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
|
||||
backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
|
||||
ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!ggml_is_transposed(src0) &&
|
||||
!ggml_is_transposed(src1) &&
|
||||
src1t == GGML_TYPE_F32 &&
|
||||
|
|
|
|||
|
|
@ -81,3 +81,76 @@ kernel void kernel_add_row(
|
|||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||
dst[gid] = src0[gid] + src1[idx1];
|
||||
}
|
||||
|
||||
kernel void kernel_add_f16(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
int i13 = i03 % ne13;
|
||||
int i12 = i02 % ne12;
|
||||
int i11 = i01 % ne11;
|
||||
|
||||
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
|
||||
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
|
||||
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const int i10 = i0 % ne10;
|
||||
*((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) + *((global half *)(src1_ptr + i10*nb10));
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_add_row_f16(
|
||||
global half4 * src0,
|
||||
ulong offset0,
|
||||
global half4 * src1,
|
||||
ulong offset1,
|
||||
global half4 * dst,
|
||||
ulong offsetd,
|
||||
int ne
|
||||
) {
|
||||
src0 = (global half4*)((global char*)src0 + offset0);
|
||||
src1 = (global half4*)((global char*)src1 + offset1);
|
||||
dst = (global half4*)((global char*)dst + offsetd);
|
||||
|
||||
// This performs better than using %.
|
||||
uint gid = get_global_id(0);
|
||||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||
dst[gid] = src0[gid] + src1[idx1];
|
||||
}
|
||||
|
|
|
|||
|
|
@ -70,3 +70,69 @@ kernel void kernel_div_row(
|
|||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||
dst[gid] = src0[gid] / src1[idx1];
|
||||
}
|
||||
|
||||
kernel void kernel_div_f16(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
int i13 = i03 % ne13;
|
||||
int i12 = i02 % ne12;
|
||||
int i11 = i01 % ne11;
|
||||
|
||||
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
|
||||
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
|
||||
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const int i10 = i0 % ne10;
|
||||
*((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) / *((global half *)(src1_ptr + i10*nb10));
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_div_row_f16(
|
||||
global half4 * src0,
|
||||
ulong offset0,
|
||||
global half4 * src1,
|
||||
ulong offset1,
|
||||
global half4 * dst,
|
||||
ulong offsetd,
|
||||
int ne
|
||||
) {
|
||||
src0 = (global half4*)((global char*)src0 + offset0);
|
||||
src1 = (global half4*)((global char*)src1 + offset1);
|
||||
dst = (global half4*)((global char*)dst + offsetd);
|
||||
|
||||
// This performs better than using %.
|
||||
uint gid = get_global_id(0);
|
||||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||
dst[gid] = src0[gid] / src1[idx1];
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -77,3 +77,76 @@ kernel void kernel_mul_row(
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] * src1[idx1];
}

kernel void kernel_mul_f16(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne03,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne10,
        int ne11,
        int ne12,
        int ne13,
        ulong nb10,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        int ne1,
        int ne2,
        int ne3,
        ulong nb0,
        ulong nb1,
        ulong nb2,
        ulong nb3
) {
    src0 = src0 + offset0;
    src1 = src1 + offset1;
    dst  = dst  + offsetd;

    int i03 = get_group_id(2);
    int i02 = get_group_id(1);
    int i01 = get_group_id(0);

    int i13 = i03 % ne13;
    int i12 = i02 % ne12;
    int i11 = i01 % ne11;

    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10;
        *((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) * *((global half *)(src1_ptr + i10*nb10));
    }
}

kernel void kernel_mul_row_f16(
        global half4 * src0,
        ulong offset0,
        global half4 * src1,
        ulong offset1,
        global half4 * dst,
        ulong offsetd,
        int ne
) {
    src0 = (global half4*)((global char*)src0 + offset0);
    src1 = (global half4*)((global char*)src1 + offset1);
    dst  = (global half4*)((global char*)dst  + offsetd);

    // This performs better than using %.
    uint gid = get_global_id(0);
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] * src1[idx1];
}
@ -0,0 +1,132 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#define LOAD_VEC_A 4
|
||||
#define LOAD_VEC_B 4
|
||||
|
||||
#define BM 64
|
||||
#define BN 64
|
||||
#define BK 16
|
||||
#define TM 4
|
||||
#define TN 8
|
||||
|
||||
kernel void kernel_mul_mm_f16_f32_l4_lm(
|
||||
global half4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne11,
|
||||
int ne12,
|
||||
|
||||
int stride_a,
|
||||
int stride_b,
|
||||
int stride_d,
|
||||
|
||||
int batch_stride_a,
|
||||
int batch_stride_b,
|
||||
int batch_stride_d,
|
||||
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src0 = (global half4*)((global char*)src0 + offset0);
|
||||
src1 = (global float4*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
local half buf_a[BM * BK];
|
||||
local float buf_b[BN * BK];
|
||||
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
const int i13 = batch_idx / ne12;
|
||||
const int i12 = batch_idx % ne12;
|
||||
|
||||
const int i03 = i13 / r3;
|
||||
const int i02 = i12 / r2;
|
||||
|
||||
const int batch_idx_a = i03 * ne02 + i02;
|
||||
|
||||
const int ir = get_group_id(0);
|
||||
const int ic = get_group_id(1);
|
||||
|
||||
const int tid = get_local_id(0);
|
||||
const int th_r = tid % (BM / TM);
|
||||
const int th_c = tid / (BM / TM);
|
||||
|
||||
const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
|
||||
const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
|
||||
const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
|
||||
const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
|
||||
|
||||
const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
|
||||
const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
|
||||
|
||||
int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
|
||||
int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
|
||||
|
||||
float sums[TM * TN];
|
||||
half cache_a[TM];
|
||||
float cache_b[TN];
|
||||
|
||||
for (int i = 0; i < TM * TN; i++) {
|
||||
sums[i] = 0.0f;
|
||||
}
|
||||
|
||||
for (int block = 0; block < ne00; block += BK) {
|
||||
for (int l = 0; l < BM; l += loadstride_a) {
|
||||
const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = src0[idx].s3;
|
||||
}
|
||||
|
||||
for (int l = 0; l < BN; l += loadstride_b) {
|
||||
const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
pos_a += BK / LOAD_VEC_A;
|
||||
pos_b += BK / LOAD_VEC_B;
|
||||
|
||||
for (int i = 0; i < BK; i++) {
|
||||
for (int j = 0; j < TM; j++) {
|
||||
cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
|
||||
}
|
||||
for (int j = 0; j < TN; j++) {
|
||||
cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
|
||||
}
|
||||
|
||||
for (int cc = 0; cc < TN; cc++) {
|
||||
for (int cr = 0; cr < TM; cr++) {
|
||||
const int sums_idx = cc*TM + cr;
|
||||
sums[sums_idx] = mad(convert_float(cache_a[cr]), cache_b[cc], sums[sums_idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int dr = ir * BM + th_r * TM;
|
||||
const int dc = ic * BN + th_c * TN;
|
||||
|
||||
const int offsets = batch_idx * batch_stride_d;
|
||||
|
||||
for (int cc = 0; cc < TN; cc++) {
|
||||
for (int cr = 0; cr < TM; cr++) {
|
||||
if (dr + cr < ne01 && dc + cc < ne11) {
|
||||
dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
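// The kernel above is a classic local-memory tiled GEMM: each workgroup owns a
// BM x BN tile of the output, loops over K in BK-wide slices staged in local
// memory (buf_a, buf_b), and each work item accumulates a TM x TN register
// sub-tile with mad(). A single-threaded C++ sketch of the same blocking
// (illustrative only; it drops the vectorized loads, barriers and bounds checks,
// and assumes M and N are multiples of BM and BN):
#include <vector>

constexpr int BM = 64, BN = 64, BK = 16, TM = 4, TN = 8;

// C[M x N] += A[M x K] * B[K x N], row-major, for the BM x BN tile at (ir, ic).
static void gemm_tile(const std::vector<float> & A, const std::vector<float> & B,
                      std::vector<float> & C, int M, int N, int K, int ir, int ic) {
    for (int tr = 0; tr < BM; tr += TM) {       // plays the role of th_r
        for (int tc = 0; tc < BN; tc += TN) {   // plays the role of th_c
            float sums[TM * TN] = {};
            for (int k = 0; k < K; ++k) {       // block loop and inner i loop fused
                float cache_a[TM], cache_b[TN];
                for (int j = 0; j < TM; ++j) cache_a[j] = A[(ir*BM + tr + j)*K + k];
                for (int j = 0; j < TN; ++j) cache_b[j] = B[k*N + ic*BN + tc + j];
                for (int cc = 0; cc < TN; ++cc)
                    for (int cr = 0; cr < TM; ++cr)
                        sums[cc*TM + cr] += cache_a[cr] * cache_b[cc];
            }
            for (int cc = 0; cc < TN; ++cc)
                for (int cr = 0; cr < TM; ++cr)
                    C[(ir*BM + tr + cr)*N + ic*BN + tc + cc] += sums[cc*TM + cr];
        }
    }
}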
|
||||
|
|
@ -0,0 +1,133 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#define LOAD_VEC_A 4
|
||||
#define LOAD_VEC_B 4
|
||||
|
||||
#define BM 64
|
||||
#define BN 64
|
||||
#define BK 16
|
||||
#define TM 4
|
||||
#define TN 8
|
||||
|
||||
kernel void kernel_mul_mm_f32_f32_l4_lm(
|
||||
global float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne11,
|
||||
int ne12,
|
||||
|
||||
int stride_a,
|
||||
int stride_b,
|
||||
int stride_d,
|
||||
|
||||
int batch_stride_a,
|
||||
int batch_stride_b,
|
||||
int batch_stride_d,
|
||||
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
src1 = (global float4*)((global char*)src1 + offset1);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
local float buf_a[BM * BK];
|
||||
local float buf_b[BN * BK];
|
||||
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
const int i13 = batch_idx / ne12;
|
||||
const int i12 = batch_idx % ne12;
|
||||
|
||||
const int i03 = i13 / r3;
|
||||
const int i02 = i12 / r2;
|
||||
|
||||
const int batch_idx_a = i03 * ne02 + i02;
|
||||
|
||||
const int ir = get_group_id(0);
|
||||
const int ic = get_group_id(1);
|
||||
|
||||
const int tid = get_local_id(0);
|
||||
const int th_r = tid % (BM / TM);
|
||||
const int th_c = tid / (BM / TM);
|
||||
|
||||
const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
|
||||
const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
|
||||
const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
|
||||
const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);
|
||||
|
||||
const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
|
||||
const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;
|
||||
|
||||
int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
|
||||
int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;
|
||||
|
||||
float sums[TM * TN];
|
||||
float cache_a[TM];
|
||||
float cache_b[TN];
|
||||
|
||||
for (int i = 0; i < TM * TN; i++) {
|
||||
sums[i] = 0.0f;
|
||||
}
|
||||
|
||||
for (int block = 0; block < ne00; block += BK) {
|
||||
for (int l = 0; l < BM; l += loadstride_a) {
|
||||
const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2;
|
||||
buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = src0[idx].s3;
|
||||
}
|
||||
|
||||
for (int l = 0; l < BN; l += loadstride_b) {
|
||||
const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
|
||||
buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
pos_a += BK / LOAD_VEC_A;
|
||||
pos_b += BK / LOAD_VEC_B;
|
||||
|
||||
for (int i = 0; i < BK; i++) {
|
||||
for (int j = 0; j < TM; j++) {
|
||||
cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
|
||||
}
|
||||
|
||||
for (int j = 0; j < TN; j++) {
|
||||
cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
|
||||
}
|
||||
|
||||
for (int cc = 0; cc < TN; cc++) {
|
||||
for (int cr = 0; cr < TM; cr++) {
|
||||
const int sums_idx = cc*TM + cr;
|
||||
sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int dr = ir * BM + th_r * TM;
|
||||
const int dc = ic * BN + th_c * TN;
|
||||
|
||||
const int offsets = batch_idx * batch_stride_d;
|
||||
|
||||
for (int cc = 0; cc < TN; cc++) {
|
||||
for (int cr = 0; cr < TM; cr++) {
|
||||
if (dr + cr < ne01 && dc + cc < ne11) {
|
||||
dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@@ -70,3 +70,69 @@ kernel void kernel_sub_row(
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] - src1[idx1];
}

kernel void kernel_sub_f16(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne10,
        int ne11,
        int ne12,
        int ne13,
        ulong nb10,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        ulong nb0,
        ulong nb1,
        ulong nb2,
        ulong nb3
) {
    src0 = src0 + offset0;
    src1 = src1 + offset1;
    dst  = dst  + offsetd;

    int i03 = get_group_id(2);
    int i02 = get_group_id(1);
    int i01 = get_group_id(0);

    int i13 = i03 % ne13;
    int i12 = i02 % ne12;
    int i11 = i01 % ne11;

    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10;
        *((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) - *((global half *)(src1_ptr + i10*nb10));
    }
}

kernel void kernel_sub_row_f16(
        global half4 * src0,
        ulong offset0,
        global half4 * src1,
        ulong offset1,
        global half4 * dst,
        ulong offsetd,
        int ne
) {
    src0 = (global half4*)((global char*)src0 + offset0);
    src1 = (global half4*)((global char*)src1 + offset1);
    dst  = (global half4*)((global char*)dst  + offsetd);

    // This performs better than using %.
    uint gid = get_global_id(0);
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] - src1[idx1];
}
@@ -28,6 +28,7 @@
#include "mmvq.hpp"
#include "norm.hpp"
#include "outprod.hpp"
#include "quantize.hpp"
#include "quants.hpp"
#include "rope.hpp"
#include "set_rows.hpp"
@ -1,31 +1,12 @@
|
|||
#include "cpy.hpp"
|
||||
|
||||
#include <float.h>
|
||||
#include <string>
|
||||
|
||||
#include "dequantize.hpp"
|
||||
#include "ggml-sycl/common.hpp"
|
||||
#include "ggml-sycl/presets.hpp"
|
||||
#include "ggml.h"
|
||||
|
||||
static __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
|
||||
if (x <= val[0]) {
|
||||
return 0;
|
||||
}
|
||||
if (x >= val[n - 1]) {
|
||||
return n - 1;
|
||||
}
|
||||
int ml = 0, mu = n - 1;
|
||||
while (mu - ml > 1) {
|
||||
int mav = (ml + mu) / 2;
|
||||
if (x < val[mav]) {
|
||||
mu = mav;
|
||||
} else {
|
||||
ml = mav;
|
||||
}
|
||||
}
|
||||
return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu;
|
||||
}
|
||||
|
||||
static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
|
|
@ -97,28 +78,6 @@ static void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne
|
|||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q8_0 * dsti = (block_q8_0 *) cdsti;
|
||||
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const float v = xi[j];
|
||||
amax = sycl::fmax(amax, sycl::fabs((float) v));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->d = d;
|
||||
|
||||
for (int j = 0; j < QK8_0; ++j) {
|
||||
const float x0 = xi[j] * id;
|
||||
|
||||
dsti->qs[j] = sycl::round((float) x0);
|
||||
}
|
||||
}
|
||||
|
||||
/* quantized type same copy */
|
||||
template<typename T>
|
||||
|
|
@ -140,178 +99,7 @@ static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
|
|||
}
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q4_0 * dsti = (block_q4_0 *) cdsti;
|
||||
|
||||
float amax = 0.0f;
|
||||
float vmax = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK4_0; ++j) {
|
||||
const float v = xi[j];
|
||||
if (amax < sycl::fabs((float) v)) {
|
||||
amax = sycl::fabs((float) v);
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = vmax / -8;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->d = d;
|
||||
|
||||
for (int j = 0; j < QK4_0 / 2; ++j) {
|
||||
const float x0 = xi[0 + j] * id;
|
||||
const float x1 = xi[QK4_0 / 2 + j] * id;
|
||||
|
||||
const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f));
|
||||
const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f));
|
||||
|
||||
dsti->qs[j] = xi0;
|
||||
dsti->qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q4_1 * dsti = (block_q4_1 *) cdsti;
|
||||
|
||||
float vmin = FLT_MAX;
|
||||
float vmax = -FLT_MAX;
|
||||
|
||||
for (int j = 0; j < QK4_1; ++j) {
|
||||
const float v = xi[j];
|
||||
|
||||
if (v < vmin) {
|
||||
vmin = v;
|
||||
}
|
||||
if (v > vmax) {
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = (vmax - vmin) / ((1 << 4) - 1);
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->dm.x() = d;
|
||||
dsti->dm.y() = vmin;
|
||||
|
||||
for (int j = 0; j < QK4_1 / 2; ++j) {
|
||||
const float x0 = (xi[0 + j] - vmin) * id;
|
||||
const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id;
|
||||
|
||||
const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f));
|
||||
const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f));
|
||||
|
||||
dsti->qs[j] = xi0;
|
||||
dsti->qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q5_0 * dsti = (block_q5_0 *) cdsti;
|
||||
|
||||
float amax = 0.0f;
|
||||
float vmax = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK5_0; ++j) {
|
||||
const float v = xi[j];
|
||||
if (amax < sycl::fabs((float) v)) {
|
||||
amax = sycl::fabs((float) v);
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = vmax / -16;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->d = d;
|
||||
|
||||
uint32_t qh = 0;
|
||||
for (int j = 0; j < QK5_0 / 2; ++j) {
|
||||
const float x0 = xi[0 + j] * id;
|
||||
const float x1 = xi[QK5_0 / 2 + j] * id;
|
||||
|
||||
const uint8_t xi0 = dpct::min(31, (int8_t) (x0 + 16.5f));
|
||||
const uint8_t xi1 = dpct::min(31, (int8_t) (x1 + 16.5f));
|
||||
|
||||
dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
|
||||
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
||||
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0 / 2);
|
||||
}
|
||||
memcpy(dsti->qh, &qh, sizeof(qh));
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q5_1 * dsti = (block_q5_1 *) cdsti;
|
||||
|
||||
float min = xi[0];
|
||||
float max = xi[0];
|
||||
|
||||
for (int j = 1; j < QK5_1; ++j) {
|
||||
const float v = xi[j];
|
||||
min = v < min ? v : min;
|
||||
max = v > max ? v : max;
|
||||
}
|
||||
|
||||
const float d = (max - min) / 31;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->dm.x() = d;
|
||||
dsti->dm.y() = min;
|
||||
|
||||
uint32_t qh = 0;
|
||||
for (int j = 0; j < QK5_1 / 2; ++j) {
|
||||
const float x0 = (xi[0 + j] - min) * id;
|
||||
const float x1 = (xi[QK5_1 / 2 + j] - min) * id;
|
||||
|
||||
const uint8_t xi0 = (uint8_t) (x0 + 0.5f);
|
||||
const uint8_t xi1 = (uint8_t) (x1 + 0.5f);
|
||||
|
||||
dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
|
||||
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
||||
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1 / 2);
|
||||
}
|
||||
memcpy(dsti->qh, &qh, sizeof(qh));
|
||||
}
|
||||
|
||||
static void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
|
||||
|
||||
float amax = 0.0f;
|
||||
float vmax = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK4_NL; ++j) {
|
||||
const float v = xi[j];
|
||||
if (amax < sycl::fabs((float) v)) {
|
||||
amax = sycl::fabs((float) v);
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
float d = vmax / kvalues_iq4nl[0];
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int j = 0; j < QK4_NL / 2; ++j) {
|
||||
const float x0 = xi[0 + j] * id;
|
||||
const float x1 = xi[QK4_NL / 2 + j] * id;
|
||||
const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
|
||||
const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
|
||||
dsti->qs[j] = xi0 | (xi1 << 4);
|
||||
const float v0 = kvalues_iq4nl[xi0];
|
||||
const float v1 = kvalues_iq4nl[xi1];
|
||||
const float w0 = xi[0 + j] * xi[0 + j];
|
||||
const float w1 = xi[QK4_NL / 2 + j] * xi[QK4_NL / 2 + j];
|
||||
sumqx += w0 * v0 * xi[j] + w1 * v1 * xi[QK4_NL / 2 + j];
|
||||
sumq2 += w0 * v0 * v0 + w1 * v1 * v1;
|
||||
}
|
||||
|
||||
dsti->d = sumq2 > 0 ? sumqx / sumq2 : d;
|
||||
}
|
||||
|
||||
template <dequantize_kernel_t dequant, int qk> static void cpy_blck_q_f32(const char * cxi, char * cdsti) {
|
||||
float * cdstf = (float *) (cdsti);
|
||||
|
|
|
|||
|
|
@ -2,10 +2,222 @@
|
|||
#define GGML_SYCL_CPY_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
#include <float.h>
|
||||
|
||||
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
||||
|
||||
__dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
|
||||
if (x <= val[0]) {
|
||||
return 0;
|
||||
}
|
||||
if (x >= val[n - 1]) {
|
||||
return n - 1;
|
||||
}
|
||||
int ml = 0, mu = n - 1;
|
||||
while (mu - ml > 1) {
|
||||
int mav = (ml + mu) / 2;
|
||||
if (x < val[mav]) {
|
||||
mu = mav;
|
||||
} else {
|
||||
ml = mav;
|
||||
}
|
||||
}
|
||||
return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu;
|
||||
}
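// best_index_int8() above binary-searches the sorted IQ4_NL codebook and then
// returns whichever of the two neighbouring entries is closer to x (ties go to
// the upper index). A self-contained C++ check of that behaviour against a
// brute-force nearest-value scan; the table below is a made-up sorted stand-in
// for kvalues_iq4nl, used only for illustration:
#include <cassert>
#include <cmath>
#include <cstdint>

static int best_index_int8_ref(int n, const int8_t * val, float x) {
    if (x <= val[0])     return 0;
    if (x >= val[n - 1]) return n - 1;
    int ml = 0, mu = n - 1;
    while (mu - ml > 1) {
        const int mav = (ml + mu) / 2;
        if (x < val[mav]) { mu = mav; } else { ml = mav; }
    }
    return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu;
}

int main() {
    const int8_t table[8] = { -100, -60, -30, -10, 10, 30, 60, 100 }; // hypothetical codebook
    for (float x = -128.0f; x <= 128.0f; x += 0.25f) {
        int best = 0; // brute-force nearest entry, ties resolved towards the upper index
        for (int i = 1; i < 8; ++i) {
            if (std::fabs(x - table[i]) <= std::fabs(x - table[best])) {
                best = i;
            }
        }
        assert(best_index_int8_ref(8, table, x) == best);
    }
    return 0;
}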
|
||||
|
||||
inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q8_0 * dsti = (block_q8_0 *) cdsti;
|
||||
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const float v = xi[j];
|
||||
amax = sycl::fmax(amax, sycl::fabs((float) v));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->d = d;
|
||||
|
||||
for (int j = 0; j < QK8_0; ++j) {
|
||||
const float x0 = xi[j] * id;
|
||||
|
||||
dsti->qs[j] = sycl::round((float) x0);
|
||||
}
|
||||
}
|
||||
|
||||
inline void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q4_0 * dsti = (block_q4_0 *) cdsti;
|
||||
|
||||
float amax = 0.0f;
|
||||
float vmax = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK4_0; ++j) {
|
||||
const float v = xi[j];
|
||||
if (amax < sycl::fabs((float) v)) {
|
||||
amax = sycl::fabs((float) v);
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = vmax / -8;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->d = d;
|
||||
|
||||
for (int j = 0; j < QK4_0 / 2; ++j) {
|
||||
const float x0 = xi[0 + j] * id;
|
||||
const float x1 = xi[QK4_0 / 2 + j] * id;
|
||||
|
||||
const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f));
|
||||
const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f));
|
||||
|
||||
dsti->qs[j] = xi0;
|
||||
dsti->qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
|
||||
inline void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q4_1 * dsti = (block_q4_1 *) cdsti;
|
||||
|
||||
float vmin = FLT_MAX;
|
||||
float vmax = -FLT_MAX;
|
||||
|
||||
for (int j = 0; j < QK4_1; ++j) {
|
||||
const float v = xi[j];
|
||||
|
||||
vmin = sycl::min(v, vmin);
|
||||
vmax = sycl::max(v, vmax);
|
||||
}
|
||||
|
||||
const float d = (vmax - vmin) / ((1 << 4) - 1);
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->dm.x() = d;
|
||||
dsti->dm.y() = vmin;
|
||||
|
||||
for (int j = 0; j < QK4_1 / 2; ++j) {
|
||||
const float x0 = (xi[0 + j] - vmin) * id;
|
||||
const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id;
|
||||
|
||||
const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f));
|
||||
const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f));
|
||||
|
||||
dsti->qs[j] = xi0;
|
||||
dsti->qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
|
||||
inline void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q5_0 * dsti = (block_q5_0 *) cdsti;
|
||||
|
||||
float amax = 0.0f;
|
||||
float vmax = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK5_0; ++j) {
|
||||
const float v = xi[j];
|
||||
if (amax < sycl::fabs((float) v)) {
|
||||
amax = sycl::fabs((float) v);
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = vmax / -16;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->d = d;
|
||||
|
||||
uint32_t qh = 0;
|
||||
for (int j = 0; j < QK5_0 / 2; ++j) {
|
||||
const float x0 = xi[0 + j] * id;
|
||||
const float x1 = xi[QK5_0 / 2 + j] * id;
|
||||
|
||||
const uint8_t xi0 = dpct::min(31, (int8_t) (x0 + 16.5f));
|
||||
const uint8_t xi1 = dpct::min(31, (int8_t) (x1 + 16.5f));
|
||||
|
||||
dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
|
||||
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
||||
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0 / 2);
|
||||
}
|
||||
memcpy(dsti->qh, &qh, sizeof(qh));
|
||||
}
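// cpy_blck_f32_q5_0 above stores 5-bit quants in two planes: the low 4 bits of
// a pair of values share one byte of qs[], and the fifth bit of every value is
// collected into the 32-bit mask qh. A host-side C++ sketch of that packing and
// of recovering the 5-bit value again (illustrative; QK5_0 == 32 as in ggml):
#include <cassert>
#include <cstdint>

constexpr int QK5_0 = 32;

static void pack_q5(const uint8_t (&q)[QK5_0], uint8_t (&qs)[QK5_0 / 2], uint32_t & qh) {
    qh = 0;
    for (int j = 0; j < QK5_0 / 2; ++j) {
        const uint8_t x0 = q[j];              // 5-bit value, 0..31
        const uint8_t x1 = q[QK5_0 / 2 + j];
        qs[j] = (x0 & 0xf) | ((x1 & 0xf) << 4);
        qh |= ((x0 & 0x10u) >> 4) << (j + 0);
        qh |= ((x1 & 0x10u) >> 4) << (j + QK5_0 / 2);
    }
}

static uint8_t unpack_q5(const uint8_t (&qs)[QK5_0 / 2], uint32_t qh, int j) {
    const bool second_half = j >= QK5_0 / 2;
    const uint8_t lo = second_half ? (qs[j - QK5_0 / 2] >> 4) : (qs[j] & 0xf);
    const uint8_t hi = (qh >> j) & 1;
    return lo | (hi << 4);
}

int main() {
    uint8_t q[QK5_0], qs[QK5_0 / 2];
    uint32_t qh = 0;
    for (int j = 0; j < QK5_0; ++j) q[j] = (uint8_t)((j * 7) % 32);
    pack_q5(q, qs, qh);
    for (int j = 0; j < QK5_0; ++j) assert(unpack_q5(qs, qh, j) == q[j]);
    return 0;
}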
|
||||
|
||||
inline void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q5_1 * dsti = (block_q5_1 *) cdsti;
|
||||
|
||||
float min = xi[0];
|
||||
float max = xi[0];
|
||||
|
||||
for (int j = 1; j < QK5_1; ++j) {
|
||||
const float v = xi[j];
|
||||
min = v < min ? v : min;
|
||||
max = v > max ? v : max;
|
||||
}
|
||||
|
||||
const float d = (max - min) / 31;
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
dsti->dm.x() = d;
|
||||
dsti->dm.y() = min;
|
||||
|
||||
uint32_t qh = 0;
|
||||
for (int j = 0; j < QK5_1 / 2; ++j) {
|
||||
const float x0 = (xi[0 + j] - min) * id;
|
||||
const float x1 = (xi[QK5_1 / 2 + j] - min) * id;
|
||||
|
||||
const uint8_t xi0 = (uint8_t) (x0 + 0.5f);
|
||||
const uint8_t xi1 = (uint8_t) (x1 + 0.5f);
|
||||
|
||||
dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
|
||||
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
||||
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1 / 2);
|
||||
}
|
||||
memcpy(dsti->qh, &qh, sizeof(qh));
|
||||
}
|
||||
|
||||
inline void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
|
||||
|
||||
float amax = 0.0f;
|
||||
float vmax = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK4_NL; ++j) {
|
||||
const float v = xi[j];
|
||||
if (amax < sycl::fabs((float) v)) {
|
||||
amax = sycl::fabs((float) v);
|
||||
vmax = v;
|
||||
}
|
||||
}
|
||||
|
||||
float d = vmax / kvalues_iq4nl[0];
|
||||
const float id = d ? 1.0f / d : 0.0f;
|
||||
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int j = 0; j < QK4_NL / 2; ++j) {
|
||||
const float x0 = xi[0 + j] * id;
|
||||
const float x1 = xi[QK4_NL / 2 + j] * id;
|
||||
const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
|
||||
const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
|
||||
dsti->qs[j] = xi0 | (xi1 << 4);
|
||||
const float v0 = kvalues_iq4nl[xi0];
|
||||
const float v1 = kvalues_iq4nl[xi1];
|
||||
const float w0 = xi[0 + j] * xi[0 + j];
|
||||
const float w1 = xi[QK4_NL / 2 + j] * xi[QK4_NL / 2 + j];
|
||||
sumqx += w0 * v0 * xi[j] + w1 * v1 * xi[QK4_NL / 2 + j];
|
||||
sumq2 += w0 * v0 * v0 + w1 * v1 * v1;
|
||||
}
|
||||
|
||||
dsti->d = sumq2 > 0 ? sumqx / sumq2 : d;
|
||||
}
|
||||
|
||||
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1);
|
||||
void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_CPY_HPP
|
||||
#endif // GGML_SYCL_CPY_HPP
|
||||
|
|
|
|||
|
|
@@ -44,6 +44,7 @@
#include "ggml-sycl/set_rows.hpp"
#include "ggml-sycl/sycl_hw.hpp"
#include "ggml-sycl/getrows.hpp"
#include "ggml-sycl/quantize.hpp"
#include "ggml.h"

static bool g_sycl_loaded = false;
@ -1373,120 +1374,6 @@ typedef void (*ggml_sycl_op_mul_mat_t)(
|
|||
|
||||
|
||||
|
||||
template<int QUANT_BLOCK_TILE>
|
||||
static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int ix = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2)) * QUANT_BLOCK_TILE;
|
||||
|
||||
if (ix >= kx_padded) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
||||
item_ct1.get_local_id(1);
|
||||
|
||||
const int i_padded = iy*kx_padded + ix;
|
||||
|
||||
block_q8_1 * y = (block_q8_1 *) vy;
|
||||
|
||||
const int ib = i_padded / QK8_1; // block index
|
||||
const int iqs = i_padded % QK8_1; // quant index
|
||||
typedef sycl::vec<float, QUANT_BLOCK_TILE> TC;
|
||||
typedef sycl::vec<int8_t, QUANT_BLOCK_TILE> TQ;
|
||||
TC zeros;
|
||||
TQ qzeros;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < QUANT_BLOCK_TILE; i++)
|
||||
{
|
||||
zeros[i] = 0.f;
|
||||
qzeros[i] = 0;
|
||||
}
|
||||
const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros;
|
||||
float sum = xi[0];
|
||||
float amax = sycl::fabs(xi[0]);
|
||||
#pragma unroll
|
||||
for (int i = 1; i < QUANT_BLOCK_TILE; i++)
|
||||
{
|
||||
sum += xi[i];
|
||||
amax = sycl::fmax(sycl::fabs(xi[i]), amax);
|
||||
}
|
||||
sum = warp_reduce_sum(sum, item_ct1);
|
||||
amax = warp_reduce_max(amax, item_ct1);
|
||||
|
||||
const float d = amax / 127;
|
||||
TQ q = qzeros;
|
||||
if (amax != 0.0f)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int i = 0; i < QUANT_BLOCK_TILE; i++) {
|
||||
q[i] = sycl::round(xi[i] / d);
|
||||
}
|
||||
}
|
||||
|
||||
*(TQ *)&y[ib].qs[iqs] = q;
|
||||
|
||||
if (iqs > 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
reinterpret_cast<sycl::half &>(y[ib].ds.x()) = d;
|
||||
reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
|
||||
}
|
||||
|
||||
template <int ElementsPerWI>
|
||||
static __dpct_inline__ void quantize_and_reorder_q8_1(const float * __restrict__ x, void * reordered_q8_tensor,
|
||||
const int kx, const int kx_padded, const sycl::nd_item<1> & it) {
|
||||
/*
|
||||
Quantizes and reorders the resultant q8 tensor in a per row fashion
|
||||
Each sub-group calculates one quant block. i.e. QK8_1 quant values and the d and sum values
|
||||
*/
|
||||
|
||||
auto subgroup_id = it.get_group(0);
|
||||
auto wi_id = it.get_local_id(0);
|
||||
|
||||
const int num_blocks_per_row = kx / QK8_1;
|
||||
auto row = subgroup_id / num_blocks_per_row;
|
||||
auto col = subgroup_id % num_blocks_per_row;
|
||||
|
||||
auto row_offset = row * (kx_padded / QK8_1) * sizeof(block_q8_1);
|
||||
auto col_offset = QK8_1 * col + wi_id * ElementsPerWI;
|
||||
|
||||
auto quant_ptr = (int8_t *) ((char *) reordered_q8_tensor + row_offset + col_offset);
|
||||
auto ds_ptr = (sycl::half2 *) ((char *) reordered_q8_tensor + row_offset + kx + col * sizeof(sycl::half2));
|
||||
|
||||
sycl::vec<float, ElementsPerWI> wi_f32_vals;
|
||||
sycl::vec<int8_t, ElementsPerWI> quantized_values;
|
||||
|
||||
auto float_ptr_offset = subgroup_id * QK8_1 + ElementsPerWI * wi_id;
|
||||
wi_f32_vals = *reinterpret_cast<const sycl::vec<float, ElementsPerWI> *>(x + float_ptr_offset);
|
||||
|
||||
float sum = 0.0f;
|
||||
float amax = 0.0f;
|
||||
|
||||
#pragma unroll(ElementsPerWI)
|
||||
for (int i = 0; i < ElementsPerWI; i++) {
|
||||
sum += wi_f32_vals[i];
|
||||
amax = sycl::fmax(amax, sycl::fabs(wi_f32_vals[i]));
|
||||
quantized_values[i] = 0;
|
||||
}
|
||||
sum = sycl::reduce_over_group(it.get_group(), sum, sycl::plus<float>());
|
||||
amax = sycl::reduce_over_group(it.get_group(), amax, sycl::maximum<float>());
|
||||
float d = amax == 0 ? 1 : amax / 127;
|
||||
|
||||
#pragma unroll(ElementsPerWI)
|
||||
for (int i = 0; i < ElementsPerWI; i++) {
|
||||
quantized_values[i] = sycl::round(wi_f32_vals[i] / d);
|
||||
}
|
||||
|
||||
d = amax == 0 ? 0 : d;
|
||||
|
||||
*reinterpret_cast<sycl::vec<int8_t, ElementsPerWI> *>(quant_ptr) = quantized_values;
|
||||
if (wi_id == 0) {
|
||||
*ds_ptr = sycl::half2(sycl::half(d), sycl::half(sum));
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_p021_f16_f32(
|
||||
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
|
||||
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
|
||||
|
|
@ -1770,32 +1657,6 @@ static void pool2d_nchw_kernel(
|
|||
o_ptr[cur_oh * ow + cur_ow] = res;
|
||||
}
|
||||
|
||||
static void quantize_row_q8_1_sycl(const float * x, void * vy, const int kx, const int ky, const int kx_padded,
|
||||
bool reorder_q8_tensor, queue_ptr stream) {
|
||||
if (reorder_q8_tensor) {
|
||||
auto local_range = std::size_t(WARP_SIZE);
|
||||
auto num_quant_blocks = ky * (kx / QK8_1);
|
||||
auto global_range = num_quant_blocks * local_range;
|
||||
stream->parallel_for(sycl::nd_range<1>({ global_range }, { local_range }),
|
||||
[=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
quantize_and_reorder_q8_1<QK8_1 / WARP_SIZE>(x, vy, kx, kx_padded, it);
|
||||
});
|
||||
} else {
|
||||
const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
|
||||
const sycl::range<3> num_blocks(1, ky, block_num_x);
|
||||
int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
|
||||
static_assert(QK8_1 % WARP_SIZE == 0);
|
||||
const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
stream->parallel_for(sycl::nd_range<3>(num_blocks * block_size, block_size),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
|
||||
float *dst, const int ncols_x,
|
||||
|
|
@@ -2372,10 +2233,10 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
    peer_access_enabled = enable_peer_access;
}

template <template <int> typename quantize_f>
static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 ggml_sycl_op_mul_mat_t op,
                                 const bool convert_src1_to_q8_1) try {
                                 ggml_sycl_op_mul_mat_t op) try {

    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
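// The new signature replaces the runtime flag convert_src1_to_q8_1 with a
// template-template parameter: the caller passes a functor template such as
// quantize_q8_1, quantize_and_reorder_q8_1_soa or no_quantize_q8_1, and the
// body can test that choice at compile time. A stripped-down C++ sketch of the
// pattern (names here are placeholders, not the backend's real types):
#include <cstdio>
#include <type_traits>

template <int N> struct do_work    { void operator()() const { std::printf("work<%d>\n", N); } };
template <int N> struct do_nothing { void operator()() const {} };

template <template <int> typename F>
void run() {
    constexpr bool enabled = !std::is_same_v<F<8>, do_nothing<8>>;
    if constexpr (enabled) {
        F<8>{}();           // only executed for the "real" functor
    }
}

int main() {
    run<do_work>();     // prints "work<8>"
    run<do_nothing>();  // compiles to a no-op
    return 0;
}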
@@ -2470,6 +2331,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
        }
    }

    constexpr bool quantize_enabled = !std::is_same_v<quantize_f<QK8_1 / WARP_SIZE>,
                                                      no_quantize_q8_1<QK8_1 / WARP_SIZE>>;
    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
        if ((!split && i != ctx.device) || dev[i].row_low == dev[i].row_high) {
            continue;
@@ -2495,20 +2358,19 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
            dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ctx.pool(i), ggml_nelements(src1));
        }

        if (convert_src1_to_q8_1) {
        if constexpr(quantize_enabled) {
            dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);

            if (src1_on_device && src1_is_contiguous) {
                bool reorder_q8_tensor = src0->extra && ((ggml_tensor_extra_gpu *)src0->extra)->optimized_feature.reorder;
                scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
                                                     /*num_src=*/2, " : converting src1 to Q8_1");
                quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, reorder_q8_tensor, stream);
                /*
                DPCT1010:90: SYCL uses exceptions to report errors and does not
                use the error codes. The call was replaced with 0. You need to
                rewrite this code.
                */
                SYCL_CHECK(0);
                try {
                    quantize_row_q8_1_sycl<quantize_f>(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
                } catch (sycl::exception const &exc) {
                    std::cerr << "Quantize_row_q8_1_sycl error" << exc.what() << "Exception caught at file:" << __FILE__
                              << ", line:" << __LINE__ << std::endl;
                    std::exit(1);
                }
            }
        }
@ -2524,11 +2386,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
// here an event is recorded that signals that the main device has finished calculating the input data
|
||||
if (split && used_devices > 1) {
|
||||
ggml_sycl_set_device(ctx.device);
|
||||
/*
|
||||
DPCT1024:91: The original code returned the error code that was further
|
||||
consumed by the program logic. This original code was replaced with 0.
|
||||
You may need to rewrite the program logic consuming the error code.
|
||||
*/
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
*src0_extra->events[ctx.device][0] =
|
||||
ctx.stream()->ext_oneapi_submit_barrier()));
|
||||
|
|
@ -2552,11 +2409,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
|
||||
// wait for main GPU data if necessary
|
||||
if (split && (i != ctx.device || is != 0)) {
|
||||
/*
|
||||
DPCT1009:163: SYCL uses exceptions to report errors and does not
|
||||
use the error codes. The original code was commented out and a
|
||||
warning string was inserted. You need to rewrite this code.
|
||||
*/
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->ext_oneapi_submit_barrier(
|
||||
{*src0_extra->events[ctx.device][0]})));
|
||||
}
|
||||
|
|
@ -2582,39 +2434,42 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
// copy src0, src1 to device if necessary
|
||||
if (src1_is_contiguous) {
|
||||
if (i != ctx.device) {
|
||||
if (convert_src1_to_q8_1) {
|
||||
if constexpr (quantize_enabled) {
|
||||
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
|
||||
src1_ddq_i, src1_ddq_i_source,
|
||||
src1_ncols * src1_padded_col_size * q8_1_ts /
|
||||
q8_1_bs).wait()));
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(stream
|
||||
->memcpy(src1_ddq_i, src1_ddq_i_source,
|
||||
src1_ncols * src1_padded_col_size * q8_1_ts / q8_1_bs)
|
||||
.wait()));
|
||||
} else {
|
||||
|
||||
float * src1_ddf_i_source = (float *) src1_extra->data_device[ctx.device];
|
||||
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
||||
src1_ddf_i_source += (i0 * ne11 + src1_col_0) * ne10;
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dev2dev_memcpy(*stream, *main_stream,
|
||||
src1_ddf_i, src1_ddf_i_source,
|
||||
src1_ncols * ne10 * sizeof(float))));
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(dev2dev_memcpy(*stream, *main_stream, src1_ddf_i, src1_ddf_i_source,
|
||||
src1_ncols * ne10 * sizeof(float))));
|
||||
}
|
||||
}
|
||||
} else if (src1_on_device && !src1_is_contiguous) {
|
||||
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
|
||||
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
if (src1_on_device) {
|
||||
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, src1_col_0,
|
||||
src1_col_0 + src1_ncols, stream));
|
||||
} else {
|
||||
GGML_ABORT("src1 is non-contiguous and not on device");
|
||||
}
|
||||
|
||||
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
|
||||
/*num_src=*/2, " : converting src1 to Q8_1");
|
||||
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, false, stream);
|
||||
/*
|
||||
DPCT1010:92: SYCL uses exceptions to report errors and does
|
||||
not use the error codes. The call was replaced with 0. You
|
||||
need to rewrite this code.
|
||||
*/
|
||||
SYCL_CHECK(0);
|
||||
if constexpr (quantize_enabled) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
|
||||
/*num_src=*/2, " : converting src1 to Q8_1");
|
||||
try {
|
||||
quantize_row_q8_1_sycl<quantize_q8_1>(src1_ddf_i, src1_ddq_i, ne10, src1_ncols,
|
||||
src1_padded_col_size, stream);
|
||||
} catch (const sycl::exception & exc) {
|
||||
std::cerr << "Quantize_row_q8_1_sycl error" << exc.what()
|
||||
<< "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
|
||||
|
|
@ -2626,12 +2481,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
// do the computation
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
||||
dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream)));
|
||||
/*
|
||||
DPCT1010:93: SYCL uses exceptions to report errors and does not
|
||||
use the error codes. The call was replaced with 0. You need to
|
||||
rewrite this code.
|
||||
*/
|
||||
SYCL_CHECK(0);
|
||||
|
||||
// copy dst to host or other device if necessary
|
||||
if (!dst_on_device) {
|
||||
|
|
@ -2662,12 +2511,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|||
|
||||
// add event for the main device to wait on until other device is done
|
||||
if (split && (i != ctx.device || is != 0)) {
|
||||
/*
|
||||
DPCT1024:94: The original code returned the error code that
|
||||
was further consumed by the program logic. This original
|
||||
code was replaced with 0. You may need to rewrite the
|
||||
program logic consuming the error code.
|
||||
*/
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
*src0_extra->events[i][is] =
|
||||
stream->ext_oneapi_submit_barrier()));
|
||||
|
|
@ -2845,6 +2688,9 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|||
const size_t type_size_src0 = ggml_type_size(src0->type);
|
||||
const size_t type_size_src1 = ggml_type_size(src1->type);
|
||||
|
||||
bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
|
||||
bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
|
||||
|
||||
// SRC1 strides
|
||||
int64_t s11 = nb11 / type_size_src1;
|
||||
int64_t s12 = nb12 / type_size_src1;
|
||||
|
|
@ -2894,6 +2740,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|||
s11 = ne10;
|
||||
s12 = ne11 * s11;
|
||||
s13 = ne12 * s12;
|
||||
|
||||
is_src1_cont_2 = true;
|
||||
}
|
||||
|
||||
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
|
||||
|
|
@ -3009,12 +2857,16 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|||
else
|
||||
#endif
|
||||
{
|
||||
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
|
||||
if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
|
||||
// with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
|
||||
const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
|
||||
const int64_t smb = ne12 == 1 ? s13 : s12;
|
||||
|
||||
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
|
||||
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
|
||||
src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_ddf,
|
||||
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
|
||||
src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
|
||||
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
|
||||
} else {
|
||||
const int ne23 = ne12 * ne13;
|
||||
|
|
@@ -3351,19 +3203,20 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
        // KQ + KQV multi-batch
        ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
    } else if (use_dequantize_mul_mat_vec) {
        constexpr bool convert_src1_to_q8_1 = false;
        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::DMMV);
        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1);
        ggml_sycl_op_mul_mat<no_quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec);
    } else if (use_mul_mat_vec_q) {
        constexpr bool convert_src1_to_q8_1 = true;
        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MMVQ);
        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1);
        ggml_tensor_extra_gpu * extra = static_cast<ggml_tensor_extra_gpu *>(src0->extra);
        if (extra && extra->optimized_feature.reorder) {
            ggml_sycl_op_mul_mat<quantize_and_reorder_q8_1_soa>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q);
        } else {
            ggml_sycl_op_mul_mat<quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q);
        }
    } else if (use_mul_mat_q) {
        constexpr bool convert_src1_to_q8_1 = true;
        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
        ggml_sycl_op_mul_mat<quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q);
    } else {
        constexpr bool convert_src1_to_q8_1 = false;
        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
        ggml_sycl_op_mul_mat<no_quantize_q8_1>(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl);
    }
}
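// Which quantizer the dispatch above selects also decides the memory layout of
// the Q8_1 copy of src1: quantize_q8_1 writes ordinary block_q8_1 blocks
// (quants plus a half2 carrying d and the block sum), while
// quantize_and_reorder_q8_1_soa lays each row out as all quants first, then all
// d/sum pairs. A host-side C++ sketch of the two offset calculations; the struct
// below is an illustrative stand-in with the same size as the backend's block_q8_1:
#include <cstddef>
#include <cstdint>

constexpr int QK8_1 = 32;

struct block_q8_1_ref {
    uint16_t d, s;          // stored as a sycl::half2 in the backend
    int8_t   qs[QK8_1];
};

// Plain layout: the block for (row, col) is just an array element.
static size_t block_offset_plain(int row, int col, int kx_padded) {
    return ((size_t) row * (kx_padded / QK8_1) + col) * sizeof(block_q8_1_ref);
}

// Reordered layout: each row occupies the same number of bytes, but the kx
// quant bytes come first and the per-block d/sum pairs are appended after them.
static size_t quants_offset_reordered(int row, int col, int kx_padded) {
    const size_t row_bytes = (size_t)(kx_padded / QK8_1) * sizeof(block_q8_1_ref);
    return (size_t) row * row_bytes + (size_t) col * QK8_1;
}

static size_t ds_offset_reordered(int row, int col, int kx, int kx_padded) {
    const size_t row_bytes = (size_t)(kx_padded / QK8_1) * sizeof(block_q8_1_ref);
    return (size_t) row * row_bytes + (size_t) kx + (size_t) col * 2 * sizeof(uint16_t);
}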
@@ -4385,11 +4238,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
        }
        case GGML_OP_SET_ROWS:
            {
                // TODO: add support
                // ref: https://github.com/ggml-org/llama.cpp/pull/14274
#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
                return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64));
            } break;
                return ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
                         op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q5_0 ||
                         op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_IQ4_NL) &&
                        (op->src[1]->type == GGML_TYPE_I64));
            }
            break;
        case GGML_OP_CPY:
            {
                ggml_type src0_type = op->src[0]->type;
@@ -0,0 +1,133 @@
/***************************************************************************
 *
 * Copyright (C) 2025 Codeplay Software Ltd.
 * Copyright (C) 2025 Intel Corporation
 *
 * MIT License
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * quantize.hpp
 *
 * Description:
 *   Sycl backend specific quantization functions
 **************************************************************************/

#pragma once

#include <sycl/nd_item.hpp>

#include "ggml-sycl/dpct/helper.hpp"

template <int ElementsPerWI>
__dpct_inline__ static void quantize_q8_1_impl(const float * __restrict__ x,
                                               sycl::vec<int8_t, ElementsPerWI> & quantized_values, float & d,
                                               float & sum, const sycl::nd_item<1> & it) {
    auto subgroup_id = it.get_group(0);
    auto wi_id = it.get_local_id(0);

    sycl::vec<float, ElementsPerWI> wi_f32_vals;

    auto float_ptr_offset = subgroup_id * QK8_1 + ElementsPerWI * wi_id;
    wi_f32_vals = *reinterpret_cast<const sycl::vec<float, ElementsPerWI> *>(x + float_ptr_offset);

    float amax = 0.0f;

#pragma unroll(ElementsPerWI)
    for (int i = 0; i < ElementsPerWI; i++) {
        sum += wi_f32_vals[i];
        amax = sycl::fmax(amax, sycl::fabs(wi_f32_vals[i]));
        quantized_values[i] = 0;
    }
    sum = sycl::reduce_over_group(it.get_sub_group(), sum, sycl::plus<float>());
    amax = sycl::reduce_over_group(it.get_sub_group(), amax, sycl::maximum<float>());
    d = amax == 0 ? 1 : amax / 127;

#pragma unroll(ElementsPerWI)
    for (int i = 0; i < ElementsPerWI; i++) {
        quantized_values[i] = sycl::round(wi_f32_vals[i] / d);
    }

    d = amax == 0 ? 0 : d;
}
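// quantize_q8_1_impl above distributes one QK8_1-wide block over a sub-group:
// each work item loads ElementsPerWI floats, the sub-group reduces the block
// sum and absolute maximum, and every value is rounded to q = round(x / d)
// with d = amax / 127 (d and the sum are kept for the dot-product kernels).
// A scalar C++ reference of the same math for a single block (sketch only):
#include <algorithm>
#include <cmath>
#include <cstdint>

constexpr int QK8_1_REF = 32;

struct q8_1_block_ref {
    int8_t qs[QK8_1_REF];
    float  d;    // dequantization scale
    float  sum;  // sum of the original floats, used by the Q8_1 dot kernels
};

static q8_1_block_ref quantize_q8_1_ref(const float (&x)[QK8_1_REF]) {
    q8_1_block_ref out{};
    float amax = 0.0f;
    for (int i = 0; i < QK8_1_REF; ++i) {
        out.sum += x[i];
        amax = std::max(amax, std::fabs(x[i]));
    }
    const float d = amax == 0.0f ? 0.0f : amax / 127.0f;
    for (int i = 0; i < QK8_1_REF; ++i) {
        out.qs[i] = (int8_t)(d == 0.0f ? 0 : std::lround(x[i] / d));
    }
    out.d = d;
    return out;
}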

// No op to control codepath in ggml_sycl_op_mul_mat
template <int ElementsPerWI> struct no_quantize_q8_1 {
    void operator()(const float *, void *, int, int, const sycl::nd_item<1> &) const {}
};

template <int ElementsPerWI> struct quantize_and_reorder_q8_1_soa {
    __dpct_inline__ void operator()(const float * __restrict__ x, void * reordered_q8_tensor, const int kx,
                                    const int kx_padded, const sycl::nd_item<1> & it) const {
        /*
        Quantizes and reorders the resultant q8 tensor in a per row fashion
        Each sub-group calculates one quant block. i.e. QK8_1 quant values and the d and sum values
        */
        auto subgroup_id = it.get_group(0);
        auto wi_id = it.get_local_id(0);

        sycl::vec<int8_t, ElementsPerWI> quantized_values;
        float d = 0.0f;
        float sum = 0.0f;
        quantize_q8_1_impl<ElementsPerWI>(x, quantized_values, d, sum, it);

        const int num_blocks_per_row = kx / QK8_1;
        auto row = subgroup_id / num_blocks_per_row;
        auto col = subgroup_id % num_blocks_per_row;
        auto row_offset = row * (kx_padded / QK8_1) * sizeof(block_q8_1);
        auto col_offset = QK8_1 * col + wi_id * ElementsPerWI;

        auto quant_ptr = (int8_t *) ((char *) reordered_q8_tensor + row_offset + col_offset);
        *reinterpret_cast<sycl::vec<int8_t, ElementsPerWI> *>(quant_ptr) = quantized_values;

        auto ds_ptr = (sycl::half2 *) ((char *) reordered_q8_tensor + row_offset + kx + col * sizeof(sycl::half2));
        if (wi_id == 0) {
            *ds_ptr = sycl::half2(sycl::half(d), sycl::half(sum));
        }
    }
};

template <int ElementsPerWI> struct quantize_q8_1 {
    __dpct_inline__ void operator()(const float * __restrict__ x, void * q8_tensor, const int kx, const int kx_padded,
                                    const sycl::nd_item<1> & it) const {
        auto subgroup_id = it.get_group(0);
        auto wi_id = it.get_local_id(0);

        const int num_blocks_per_row = kx / QK8_1;
        auto row = subgroup_id / num_blocks_per_row;
        const int pitch = kx_padded / QK8_1;

        sycl::vec<int8_t, ElementsPerWI> quantized_values;
        float d = 0.0f;
        float sum = 0.0f;
        quantize_q8_1_impl<ElementsPerWI>(x, quantized_values, d, sum, it);

        block_q8_1 * quant_ptr = (block_q8_1 *) q8_tensor;
        auto block_id = subgroup_id % num_blocks_per_row + row * pitch;

        int8_t * qs = &(quant_ptr[block_id].qs[wi_id * ElementsPerWI]);
        *reinterpret_cast<sycl::vec<int8_t, ElementsPerWI> *>(qs) = quantized_values;
        if (wi_id == 0) {
            quant_ptr[block_id].ds = sycl::half2(sycl::half(d), sycl::half(sum));
        }
    }
};

template <template <int> typename quantize_f>
void quantize_row_q8_1_sycl(const float * x, void * vy, const int kx, const int ky, const int kx_padded,
                            dpct::queue_ptr stream) {
    static_assert(QK8_1 % WARP_SIZE == 0);
    auto local_range = std::size_t(WARP_SIZE);
    auto num_quant_blocks = ky * (kx / QK8_1);
    auto global_range = num_quant_blocks * local_range;
    dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

    stream->parallel_for(sycl::nd_range<1>({ global_range }, { local_range }),
                         [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                             quantize_f<QK8_1 / WARP_SIZE>()(x, vy, kx, kx_padded, it);
                         });
}
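// quantize_row_q8_1_sycl launches one WARP_SIZE-wide work group per quant
// block: ky * (kx / QK8_1) groups in total, with each work item handling
// QK8_1 / WARP_SIZE consecutive floats. A small C++ helper that reproduces
// that launch geometry (the WARP_SIZE value below is an assumption for the
// sketch; the backend uses its own constant):
#include <cstddef>
#include <cstdio>

constexpr int QK8_1_GEOM  = 32;
constexpr int WARP_SIZE_GEOM = 16;   // assumed sub-group size
static_assert(QK8_1_GEOM % WARP_SIZE_GEOM == 0, "each sub-group must cover a whole block");

struct launch_geometry {
    size_t local_range;     // work items per group (= sub-group size)
    size_t global_range;    // total work items
    int    elements_per_wi; // floats handled by each work item
};

static launch_geometry q8_1_launch_geometry(int kx, int ky) {
    launch_geometry g;
    g.local_range     = WARP_SIZE_GEOM;
    g.global_range    = (size_t) ky * (kx / QK8_1_GEOM) * g.local_range;
    g.elements_per_wi = QK8_1_GEOM / WARP_SIZE_GEOM;
    return g;
}

int main() {
    const launch_geometry g = q8_1_launch_geometry(/*kx=*/4096, /*ky=*/8);
    std::printf("groups=%zu local=%zu per-wi=%d\n",
                g.global_range / g.local_range, g.local_range, g.elements_per_wi);
    return 0;
}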
@@ -1,4 +1,5 @@
#include "set_rows.hpp"
#include "cpy.hpp"

namespace utils {
template<typename T>
@ -15,6 +16,68 @@ convert (const char* src, char* dst) {
|
|||
*reinterpret_cast<TOut*>(dst) = dst_val;
|
||||
}
|
||||
|
||||
template <typename blockType, int qk, cpy_kernel_t cpyblck>
|
||||
static void set_rows_sycl_q(const char * __restrict__ src0_d,
|
||||
const int64_t * __restrict__ src1_d,
|
||||
blockType * __restrict__ dst_d,
|
||||
// tensor dimensions src0 and src1
|
||||
const int64_t ne00,
|
||||
const int64_t ne01,
|
||||
const int64_t ne02,
|
||||
const int64_t ne03,
|
||||
const int64_t ne10,
|
||||
const int64_t ne11,
|
||||
const int64_t ne12,
|
||||
const int64_t ne13,
|
||||
// strides for src0
|
||||
const size_t nb00,
|
||||
const size_t nb01,
|
||||
const size_t nb02,
|
||||
const size_t nb03,
|
||||
// strides for src1
|
||||
const size_t nb10,
|
||||
const size_t nb11,
|
||||
const size_t nb12,
|
||||
const size_t nb13,
|
||||
// strides for dst
|
||||
const size_t nb1,
|
||||
const size_t nb2,
|
||||
const size_t nb3,
|
||||
queue_ptr stream) {
|
||||
const int64_t total_blocks = (ne00 * ne01 * ne02 * ne03) / qk;
|
||||
constexpr int block_size = 256;
|
||||
const int64_t grid_size = ceil_div(total_blocks, block_size);
|
||||
|
||||
sycl_parallel_for(stream, sycl::nd_range<1>(grid_size * block_size, block_size), [=](sycl::nd_item<1> item_ct1) {
|
||||
const int64_t i = item_ct1.get_global_linear_id();
|
||||
if (i >= total_blocks) {
|
||||
return;
|
||||
}
|
||||
const int64_t i_base = i * qk;
|
||||
const int64_t i03 = i_base / (ne00 * ne01 * ne02);
|
||||
const int64_t rem1 = i_base - i03 * (ne00 * ne01 * ne02);
|
||||
const int64_t i02 = rem1 / (ne00 * ne01);
|
||||
const int64_t rem2 = rem1 - i02 * ne00 * ne01;
|
||||
const int64_t i01 = rem2 / ne00;
|
||||
const int64_t i00 = rem2 - i01 * ne00;
|
||||
const int64_t i12 = i03 % ne12;
|
||||
const int64_t i11 = i02 % ne11;
|
||||
const int64_t i10 = i01;
|
||||
const size_t src_offset = calculate_offset<3>({ nb01, nb02, nb03 }, { i01, i02, i03 });
|
||||
const char * src_block = src0_d + src_offset + i00 * sizeof(float);
|
||||
const size_t src1_offset = calculate_offset<3>({ nb10, nb11, nb12 }, { i10, i11, i12 });
|
||||
const int64_t dst_row = src1_d[src1_offset / sizeof(int64_t)];
|
||||
const size_t dst_offset =
|
||||
calculate_offset<3>({ nb1, nb2, nb3 }, { dst_row, i02, i03 }) + (i00 / qk) * sizeof(blockType);
|
||||
char * dst_block = reinterpret_cast<char *>(reinterpret_cast<char *>(dst_d) + dst_offset);
|
||||
cpyblck(src_block, dst_block);
|
||||
});
|
||||
GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(nb00);
|
||||
GGML_UNUSED(nb13);
|
||||
}
|
||||
|
||||
template<typename TIn, typename TOut>
|
||||
static void k_set_rows(
|
||||
const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst,
|
||||
|
|
@ -124,6 +187,37 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|||
stream
|
||||
);
|
||||
break;
|
||||
case GGML_TYPE_BF16:
|
||||
set_rows_sycl<float, sycl::ext::oneapi::bfloat16>(
|
||||
(const char *)src0->data, src1_dd, (char *)dst->data,
|
||||
ne00, ne01, ne02, ne03,
|
||||
ne11, ne12,
|
||||
nb01, nb02, nb03,
|
||||
nb10, nb11, nb12,
|
||||
nb1, nb2, nb3,
|
||||
sizeof(float), sizeof(sycl::ext::oneapi::bfloat16),
|
||||
stream
|
||||
);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
set_rows_sycl_q<block_q8_0, QK8_0, cpy_blck_f32_q8_0>((const char *)src0->data, src1_dd, (block_q8_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
set_rows_sycl_q<block_q5_1, QK5_1, cpy_blck_f32_q5_1>((const char *)src0->data, src1_dd, (block_q5_1 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
set_rows_sycl_q<block_q5_0, QK5_0, cpy_blck_f32_q5_0>((const char *)src0->data, src1_dd, (block_q5_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
set_rows_sycl_q<block_q4_1, QK4_1, cpy_blck_f32_q4_1>((const char *)src0->data, src1_dd, (block_q4_1 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
set_rows_sycl_q<block_q4_0, QK4_0, cpy_blck_f32_q4_0>((const char *)src0->data, src1_dd, (block_q4_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
set_rows_sycl_q<block_iq4_nl, QK4_NL, cpy_blck_f32_iq4_nl>((const char *)src0->data, src1_dd, (block_iq4_nl *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
|
||||
default:
|
||||
GGML_ABORT("Unsupported tensor type!");
|
||||
break;
|
||||
|
|
|
|||
|
|
@@ -222,6 +222,7 @@
    AMD_RDNA2,
    AMD_RDNA3,
    INTEL_XE2,
    NVIDIA_PRE_TURING,
};

// HSK x HSV
@@ -315,10 +316,33 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
            // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
            return vk_device_architecture::INTEL_XE2;
        }
    } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
        const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();

        bool cooperative_matrix = false;

        // Detect "pre-turing" based on lack of coopmat support.
        for (const auto& properties : ext_props) {
            if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) {
                cooperative_matrix = true;
                break;
            }
        }

        if (!cooperative_matrix) {
            return vk_device_architecture::NVIDIA_PRE_TURING;
        }
    }
    return vk_device_architecture::OTHER;
}

enum vk_conv_shapes {
    CONV_SHAPE_128x128,
    CONV_SHAPE_64x32,
    CONV_SHAPE_32x256,
    CONV_SHAPE_COUNT,
};

struct vk_device_struct {
    std::recursive_mutex mutex;
@@ -483,7 +507,8 @@ struct vk_device_struct {
vk_pipeline pipeline_rwkv_wkv6_f32;
vk_pipeline pipeline_rwkv_wkv7_f32;
vk_pipeline pipeline_opt_step_adamw_f32;
vk_pipeline pipeline_conv2d_f32;
vk_pipeline pipeline_conv2d_f32[CONV_SHAPE_COUNT];
vk_pipeline pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT];
vk_pipeline pipeline_conv2d_dw_whcn_f32;
vk_pipeline pipeline_conv2d_dw_cwhn_f32;
@@ -907,8 +932,22 @@ struct vk_op_conv2d_push_constants {
uint32_t nb1;
uint32_t nb2;
uint32_t nb3;

// init_fastdiv_values constants for dividing by KW, KW*KH, OW, OW*OH
uint32_t KWmp; uint32_t KWL;
uint32_t KWKHmp; uint32_t KWKHL;
uint32_t OWmp; uint32_t OWL;
uint32_t OWOHmp; uint32_t OWOHL;
};

template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
// Compute magic values to divide by KW, KW*KH, OW, OW*OH
init_fastdiv_values(p.KW, p.KWmp, p.KWL);
init_fastdiv_values(p.KW*p.KH, p.KWKHmp, p.KWKHL);
init_fastdiv_values(p.OW, p.OWmp, p.OWL);
init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL);
}
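// --- illustrative aside (not part of the diff) --------------------------------------
// init_fastdiv_values itself is defined elsewhere in ggml-vulkan.cpp and is not shown
// in this hunk. As a hedged sketch, one standard construction that matches the
// shader-side fastdiv used below ((mulhi(n, mp) + n) >> L, see the conv2d_mm.comp
// changes later in this diff) is: pick L = ceil(log2(d)) and
// mp = floor(2^32 * (2^L - d) / d) + 1. The helper names here are hypothetical and
// only illustrate the idea; it assumes n stays well below 2^31, as tensor indices do.
#include <cstdint>

static void example_fastdiv_values(uint32_t d, uint32_t & mp, uint32_t & L) {
    L = 0;
    while (L < 32 && (uint32_t{1} << L) < d) {
        L++;
    }
    mp = (uint32_t) (((uint64_t{1} << 32) * ((uint64_t{1} << L) - d)) / d + 1);
}

static uint32_t example_fastdiv(uint32_t n, uint32_t mp, uint32_t L) {
    const uint32_t hi = (uint32_t) (((uint64_t) n * mp) >> 32); // mulhi(n, mp)
    return (hi + n) >> L;                                       // == n / d
}
// e.g. for d = 9 (a 3x3 kernel, KW*KH): example_fastdiv(35, mp, L) == 3 == 35 / 9.
// -------------------------------------------------------------------------------------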

struct vk_op_conv2d_dw_push_constants {
uint32_t ne;
uint32_t batches;
@@ -1340,7 +1379,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
vk::DebugUtilsObjectNameInfoEXT duoni;
duoni.objectType = vk::ObjectType::ePipeline;
duoni.pObjectName = pipeline->name.c_str();
duoni.objectHandle = reinterpret_cast<uint64_t>(static_cast<VkPipeline_T*>(pipeline->pipeline));
duoni.objectHandle = /*reinterpret_cast*/(uint64_t)(static_cast<VkPipeline>(pipeline->pipeline));
vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast<VkDebugUtilsObjectNameInfoEXT &>(duoni));
}
@@ -2067,12 +2106,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
s_mmq_wg_denoms = { 32, 64, 1 };

// spec constants and tile sizes for quant matmul (Qi_K)
l_warptile_mmq_k = { 256, 64, 128, 64, 1 };
m_warptile_mmq_k = { 256, 32, 64, 64, 0 };
s_warptile_mmq_k = { 256, 32, 32, 128, 0 };
l_mmq_wg_denoms_k = { 64, 128, 1 };
m_mmq_wg_denoms_k = { 32, 64, 1 };
s_mmq_wg_denoms_k = { 32, 32, 1 };
l_warptile_mmq_k = { 256, 128, 256, 64, 1 };
m_warptile_mmq_k = { 256, 128, 128, 64, 1 };
s_warptile_mmq_k = { 256, 32, 64, 128, 0 };
l_mmq_wg_denoms_k = { 128, 256, 1 };
m_mmq_wg_denoms_k = { 128, 128, 1 };
s_mmq_wg_denoms_k = { 32, 64, 1 };

// spec constants and tile sizes for quant matmul_id
l_warptile_mmqid = { 256, 128, 128, 16, 0 };
@@ -2846,7 +2885,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true);
}
}
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 9 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 12 * sizeof(uint32_t), {1, 1, 1}, {}, 1);

ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
@@ -3047,39 +3086,105 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);

// conv2d
uint32_t conv2d_WG_SIZE = 256;
uint32_t conv2d_BS_K = 128;
uint32_t conv2d_BS_CRS = 16;
uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
if (device->subgroup_shuffle &&
device->vendor_id != VK_VENDOR_ID_INTEL) { // Do not enable collectives on Intel, see PR 14316
use_collectives = 1;
conv2d_BS_CRS = std::min(
device->subgroup_size,
conv2d_BS_CRS); // CRS block size should be capped at sugroup size for correctness when shuffle is used.
}
uint32_t conv2d_BS_NPQ = 128;
uint32_t conv2d_TS_K = 8;
uint32_t conv2d_shmem_req =
(conv2d_BS_K * (conv2d_BS_CRS + 1) + conv2d_BS_CRS * (conv2d_BS_NPQ + 1)) * sizeof(float);
if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
conv2d_BS_CRS = 8;
if (use_collectives) {
conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
}
}
for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
uint32_t conv2d_WG_SIZE = 256;
uint32_t conv2d_BS_K = 128;
uint32_t conv2d_BS_CRS = 16;
uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
uint32_t conv2d_BS_NPQ = 128;
uint32_t conv2d_TS_K = 8;
uint32_t conv2d_SHMEM_PAD = 4;
bool conv2d_UNROLL = true;

if (use_collectives) {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
} else {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
false);
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
if (device->coopmat2) {
conv2d_SHMEM_PAD = 8; // 8 float16_t
}
#endif

if (device->vendor_id == VK_VENDOR_ID_INTEL) {
conv2d_SHMEM_PAD = 0;
conv2d_UNROLL = false;
} else if (device->vendor_id == VK_VENDOR_ID_AMD) {
conv2d_SHMEM_PAD = device->architecture == vk_device_architecture::AMD_GCN ? 1 : 4;
}

switch (s) {
default:
case CONV_SHAPE_128x128:
conv2d_BS_K = 128;
conv2d_BS_NPQ = 128;
conv2d_BS_CRS = 16;
if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != vk_device_architecture::AMD_GCN) {
conv2d_UNROLL = false;
}
break;
case CONV_SHAPE_64x32:
conv2d_BS_K = 64;
conv2d_BS_NPQ = 32;
conv2d_BS_CRS = 32;
conv2d_TS_K = 4;
break;
case CONV_SHAPE_32x256:
conv2d_BS_K = 32;
conv2d_BS_NPQ = 256;
conv2d_BS_CRS = 16;
break;
}

// Use collectives on pre-Turing NVIDIA GPUs and GCN AMD cards, which had slower integer math.
bool allow_collectives_nv = device->vendor_id != VK_VENDOR_ID_NVIDIA ||
device->architecture == vk_device_architecture::NVIDIA_PRE_TURING;
bool allow_collectives_amd = device->vendor_id != VK_VENDOR_ID_AMD ||
device->architecture == vk_device_architecture::AMD_GCN;

if (device->subgroup_shuffle &&
device->vendor_id != VK_VENDOR_ID_INTEL && // Do not enable collectives on Intel, see PR 14316.
allow_collectives_nv &&
allow_collectives_amd) {
use_collectives = 1;
conv2d_BS_CRS = std::min(
device->subgroup_size,
conv2d_BS_CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used.
}

uint32_t conv2d_shmem_req =
(conv2d_BS_K * (conv2d_BS_CRS + conv2d_SHMEM_PAD) + conv2d_BS_CRS * (conv2d_BS_NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
conv2d_BS_CRS = 8;
if (use_collectives) {
conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
}
}

std::array<uint32_t, 3> wg_denoms = { conv2d_BS_K, conv2d_BS_NPQ, 1 };
std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };

#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
if (device->coopmat2) {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_cm2_len, conv2d_f32_cm2_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_cm2_len, conv2d_f16_f32_cm2_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
} else
#endif
if (conv2d_UNROLL) {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_unroll_len, conv2d_f32_unroll_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_unroll_len, conv2d_f16_f32_unroll_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
} else {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
}
}
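// --- illustrative aside (not part of the diff) --------------------------------------
// Rough arithmetic check of the shared-memory sizing above, using the default
// CONV_SHAPE_128x128 parameters (BS_K = 128, BS_CRS = 16, BS_NPQ = 128, SHMEM_PAD = 4).
// Values are illustrative; the real limit comes from
// device->properties.limits.maxComputeSharedMemorySize.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t BS_K = 128, BS_CRS = 16, BS_NPQ = 128, SHMEM_PAD = 4;
    const uint32_t shmem_req =
        (BS_K * (BS_CRS + SHMEM_PAD) + BS_CRS * (BS_NPQ + SHMEM_PAD)) * sizeof(float);
    // (128*20 + 16*132) * 4 = 18688 bytes (~18.25 KiB), comfortably under a typical
    // 32 KiB workgroup shared-memory limit, so the BS_CRS = 8 fallback rarely triggers.
    printf("conv2d shmem request: %u bytes\n", (unsigned) shmem_req);
    return 0;
}
// -------------------------------------------------------------------------------------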

ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
@@ -4933,26 +5038,37 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
ggml_vk_queue_command_pools_cleanup(dst->device);
}

static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, uint32_t m, uint32_t n, uint32_t k, const vk_pipeline& pipeline) {
VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");

uint32_t split_k = 1;
if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
if (ctx->device->shader_core_count != 0 && m >= pipeline->wg_denoms[0] && n >= pipeline->wg_denoms[1]) {
// If k is 'large' and the SMs will fill less than halfway, use split_k.
uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
// Clamp to 2 or 4
split_k = std::min(split_k, 4u);
if (split_k == 3) {
split_k = 2;

if (k >= 2048) {
if (m_tiles * n_tiles <= ctx->device->shader_core_count / 2) {
split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
} else if (m_tiles * n_tiles <= ctx->device->shader_core_count * 2 / 3) {
split_k = 3;
}
if (ctx->device->coopmat2) {
// coopmat2 shader expects splits to be aligned to 256
while (split_k > 1 && ((k / split_k) % 256) != 0) {
split_k /= 2;
// Cap the split at 8x. Unless k is huge this is a lot of overhead.
split_k = std::min(split_k, 8u);

// ggml_vk_matmul will align the splits to be a multiple of 256.
// If this rounded up size would cause the last split to be empty,
// then reduce the split count.
while (true) {
if (split_k == 1) {
break;
}
uint32_t k_split = CEIL_DIV(k, split_k);
k_split = ROUNDUP_POW2(k_split, 256);
if (k_split * (split_k - 1) < k) {
break;
}
split_k--;
}
}
}
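// --- illustrative aside (not part of the diff) --------------------------------------
// Standalone sketch of the split_k trimming loop above. ceil_div/roundup256 are
// hypothetical stand-ins for CEIL_DIV / ROUNDUP_POW2 as used in ggml-vulkan.cpp;
// the point is only to show why the count shrinks when the 256-aligned per-split
// size would leave the last split with no work.
#include <cstdint>

static uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }
static uint32_t roundup256(uint32_t a)           { return ceil_div(a, 256) * 256; }

static uint32_t trim_split_k(uint32_t k, uint32_t split_k) {
    while (split_k > 1) {
        const uint32_t k_split = roundup256(ceil_div(k, split_k));
        if (k_split * (split_k - 1) < k) {
            break;      // the last split still gets some of k
        }
        split_k--;      // otherwise one split would be empty; use fewer splits
    }
    return split_k;
}
// e.g. trim_split_k(4096, 4) == 4 (1024 * 3 < 4096), but
//      trim_split_k( 600, 4) == 3 (256 * 3 >= 600, then 256 * 2 < 600).
// -------------------------------------------------------------------------------------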
@@ -4964,9 +5080,22 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");

if (ctx->device->coopmat2) {
const uint32_t shader_core_count = ctx->device->shader_core_count;
const uint32_t tiles_l = CEIL_DIV(m, mmp->a_l->wg_denoms[0]) * CEIL_DIV(n, mmp->a_l->wg_denoms[1]);
const uint32_t tiles_m = CEIL_DIV(m, mmp->a_m->wg_denoms[0]) * CEIL_DIV(n, mmp->a_m->wg_denoms[1]);

// Use large shader when the N dimension is greater than the medium shader's tile size
uint32_t crossover_large = mmp->m->wg_denoms[1];
if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {

// Prefer large over medium if either:
// - medium or large tiles would overfill the GPU
// - large tiles with a split_k==3 fits in the GPU and medium tiles with split_k==2 does not
// (medium with split_k==2 is probably better if it fits - more workgroups running and less split_k overhead)
bool prefer_large = tiles_m > shader_core_count || tiles_l > shader_core_count ||
// split_k==3 with large tiles likely better than medium tiles with no split_k.
(tiles_l <= shader_core_count / 3 && tiles_m > shader_core_count / 2);

if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large && prefer_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
return aligned ? mmp->a_l : mmp->l;
}
// Use medium shader when the N dimension is greater than the small shader's tile size
@@ -5010,7 +5139,11 @@ static void ggml_vk_matmul(

GGML_ASSERT(batch_stride_d == m * n);

const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
// Round the split size up to a multiple of 256 (k-quant alignment)
uint32_t k_split = CEIL_DIV(k, split_k);
k_split = ROUNDUP_POW2(k_split, 256);

const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
// Make sure enough workgroups get assigned for split k to work
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
ggml_vk_sync_buffers(subctx);
@@ -5215,9 +5348,9 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
}

static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << ggml_type_name(src0->type) << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << ggml_type_name(src1->type) << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << ggml_type_name(dst->type) << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -5732,7 +5865,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
const uint64_t ne00 = src0->ne[0];
const uint64_t ne01 = src0->ne[1];
const uint64_t ne02 = src0->ne[2];
// const uint64_t ne03 = src0->ne[3];
const uint64_t ne03 = src0->ne[3];

const uint64_t nb01 = src0->nb[1];
const uint64_t nb02 = src0->nb[2];
@@ -5744,7 +5877,12 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
const uint64_t ne12 = src1->ne[2];
// const uint64_t ne13 = src1->ne[3];

const uint32_t nb03 = (uint32_t)(src0->nb[3] / sizeof(ggml_fp16_t));
const uint32_t nb13 = (uint32_t)(src1->nb[3] / sizeof(float));
const uint32_t nb23 = (uint32_t)(dst->nb[3] / sizeof(float));

GGML_ASSERT(ne11 == 1);
GGML_ASSERT(src0->ne[3] == src1->ne[3]); // checked in supports_op

ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
@@ -5760,7 +5898,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
src1_uma = d_Qy != nullptr;
}

const uint64_t d_ne = ne01 * ne11 * ne12;
const uint64_t d_ne = ne01 * ne11 * ne12 * ne03;

const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);
@@ -5795,10 +5933,10 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;

// compute
const std::array<uint32_t, 9> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
const std::array<uint32_t, 12> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)), nb03, nb13, nb23 };
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 });
}

static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -6631,6 +6769,34 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
}
}

static std::array<uint32_t, 3> ggml_vk_get_conv_elements(const ggml_tensor *dst) {
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];

// src0 - kernel: [KW, KH, Cin, Cout]
// src1 - input: [W, H, Cin, N]
// dst - result: [OW, OH, Cout, N]

// Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
};
// parallelize in {OW/BS_K, OH/BS_NPQ, 1}
int64_t W = src1->ne[0];
int64_t H = src1->ne[1];
int64_t KW = src0->ne[0];
int64_t KH = src0->ne[1];
int64_t Cout = src0->ne[3];
int64_t N = src1->ne[3];
int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
int64_t NPQ = N * OW * OH;

// Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
std::array<uint32_t, 3> elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
return elements;
}
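// --- illustrative aside (not part of the diff) --------------------------------------
// Worked example of the output-size formula above, with made-up shapes: a 3x3 kernel
// (dilation 1, padding 1) on a 64x64 input gives a 64x64 output at stride 1 and 32x32
// at stride 2, so for N = 1 and Cout = 64 the stride-1 case would dispatch
// elements = { 64, 1*64*64, 1 }. The helper below just mirrors the lambda for checking.
#include <cstdint>

constexpr int64_t example_conv_out(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
}
static_assert(example_conv_out(64, 3, 1, 1, 1) == 64, "stride 1 keeps the spatial size");
static_assert(example_conv_out(64, 3, 2, 1, 1) == 32, "stride 2 halves it");
// -------------------------------------------------------------------------------------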

static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
switch (op) {
case GGML_OP_GET_ROWS:
@@ -6958,9 +7124,33 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
}
return nullptr;
case GGML_OP_CONV_2D:
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
return ctx->device->pipeline_conv2d_f32;
auto elements = ggml_vk_get_conv_elements(dst);
vk_conv_shapes shape;

uint32_t tiles[CONV_SHAPE_COUNT];
for (uint32_t i = 0; i < CONV_SHAPE_COUNT; ++i) {
tiles[i] = CEIL_DIV(elements[0], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[0]) * CEIL_DIV(elements[1], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[1]);
}

// We can't query number of shader cores on Intel, use 32 as a placeholder
// so small convolutions will still choose a smaller tile.
const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;

if (elements[0] > 64 && tiles[CONV_SHAPE_128x128] >= shader_core_count * 2) {
shape = CONV_SHAPE_128x128;
} else if (elements[0] <= 32 && tiles[CONV_SHAPE_32x256] >= shader_core_count * 2) {
shape = CONV_SHAPE_32x256;
} else {
shape = CONV_SHAPE_64x32;
}

if (src0->type == GGML_TYPE_F32) {
return ctx->device->pipeline_conv2d_f32[shape];
} else if (src0->type == GGML_TYPE_F16) {
return ctx->device->pipeline_conv2d_f16_f32[shape];
}
}
return nullptr;
case GGML_OP_CONV_2D_DW:
@@ -7287,29 +7477,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
} break;
case GGML_OP_CONV_2D:
{
// src0 - kernel: [KW, KH, Cin, Cout]
// src1 - input: [W, H, Cin, N]
// dst - result: [OW, OH, Cout, N]

// Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
};
// parallelize in {OW/BS_K, OH/BS_NPQ, 1}
int64_t W = src1->ne[0];
int64_t H = src1->ne[1];
int64_t KW = src0->ne[0];
int64_t KH = src0->ne[1];
int64_t Cout = src0->ne[3];
int64_t N = src1->ne[3];
int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
int64_t NPQ = N * OW * OH;

// Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
}
break;
elements = ggml_vk_get_conv_elements(dst);
} break;
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_DIV:
@@ -7882,6 +8051,13 @@ static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);

// Skip empty skip_rows operations. For most ops the empty check at the start
// of ggml_vk_build_graph is sufficient, but set_rows can have a nonempty dst
// with empty srcs.
if (ggml_is_empty(src0) || ggml_is_empty(src1)) {
return;
}

ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SET_ROWS, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
@@ -8178,13 +8354,13 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c

static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);

GGML_TENSOR_BINARY_OP_LOCALS

GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
GGML_ASSERT(nb0 == sizeof(float));
@@ -10867,7 +11043,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
const vk_device& device = ggml_vk_get_device(ctx->device);
bool is_Apple = ggml_vk_get_device(ctx->device)->vendor_id == VK_VENDOR_ID_APPLE;
// Channel-contiguous format is not supported yet.
return (op->src[0]->type == GGML_TYPE_F32 &&
return ((op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
op->src[1]->type == GGML_TYPE_F32 &&
op->type == GGML_TYPE_F32 &&
ggml_is_contiguous(op->src[0]) &&
@@ -11147,7 +11323,7 @@ size_t comp_nb[GGML_MAX_DIMS];
size_t check_counter = 0;
static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) {
ggml_tensor * tensor = cgraph->nodes[tensor_idx];
if (tensor->op == GGML_OP_TRANSPOSE) {
if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) {
return;
}
@@ -11267,7 +11443,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
tensor_clone = ggml_upscale_ext(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]);
} else if (tensor->op == GGML_OP_SCALE) {
const float * params = (const float *)tensor->op_params;
tensor_clone = ggml_scale(ggml_ctx, src_clone[0], params[0]);
tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]);
} else if (tensor->op == GGML_OP_SQR) {
tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]);
} else if (tensor->op == GGML_OP_SIN) {
@@ -11378,8 +11554,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
} else {
tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]);
}
} else if (tensor->op == GGML_OP_SET_ROWS) {
tensor_clone = ggml_set_rows(ggml_ctx, src_clone[0], src_clone[1]);
} else if (tensor->op == GGML_OP_CONT) {
tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
} else if (tensor->op == GGML_OP_RESHAPE) {
@@ -11487,7 +11661,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *

static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) {
ggml_tensor * tensor = cgraph->nodes[tensor_idx];
if (tensor->op == GGML_OP_TRANSPOSE) {
if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) {
return;
}
bool fused_rms_norm_mul = false;
@@ -11547,6 +11721,9 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph *
} else if (tensor->type == GGML_TYPE_F16) {
correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
result = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
} else if (tensor->type == GGML_TYPE_BF16) {
correct = ggml_bf16_to_fp32(*(ggml_bf16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
result = ggml_bf16_to_fp32(*(ggml_bf16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
} else if (tensor->type == GGML_TYPE_I32) {
correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
result = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
@ -1,14 +1,18 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#ifdef COOPMAT2
|
||||
#extension GL_NV_cooperative_matrix2 : enable
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
||||
#extension GL_KHR_memory_scope_semantics : enable
|
||||
#endif
|
||||
|
||||
#ifdef USE_COLLECTIVES
|
||||
# extension GL_KHR_shader_subgroup_shuffle : enable
|
||||
#endif
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
// Make spec constant
|
||||
#define SHMEM_PAD 0
|
||||
|
||||
// shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j
|
||||
layout(binding = 0) readonly buffer A {
|
||||
A_TYPE knl_data[];
|
||||
|
|
@ -56,6 +60,12 @@ layout(push_constant) uniform parameter {
|
|||
uint32_t nb1;
|
||||
uint32_t nb2;
|
||||
uint32_t nb3;
|
||||
|
||||
// fastdiv helper values
|
||||
uint32_t KWmp; uint32_t KWL;
|
||||
uint32_t KWKHmp; uint32_t KWKHL;
|
||||
uint32_t OWmp; uint32_t OWL;
|
||||
uint32_t OWOHmp; uint32_t OWOHL;
|
||||
}
|
||||
|
||||
p;
|
||||
|
|
@ -68,6 +78,7 @@ layout(constant_id = 3) const uint BS_NPQ = 128;
|
|||
// Thread-tile sizes
|
||||
layout(constant_id = 4) const uint TS_K = 8;
|
||||
layout(constant_id = 5) const uint use_collectives = 1;
|
||||
layout(constant_id = 6) const uint SHMEM_PAD = 4;
|
||||
|
||||
uint32_t tid = gl_LocalInvocationID.x;
|
||||
const uint32_t WG_SIZE = gl_WorkGroupSize.x;
|
||||
|
|
@ -85,6 +96,12 @@ uint32_t n_elems_out = K * NPQ;
|
|||
// Number of blocktiles per input
|
||||
uint32_t NB_CRS = splitWork(CRS, BS_CRS);
|
||||
|
||||
#ifdef COOPMAT2
|
||||
#define SHMEM_TYPE float16_t
|
||||
#else
|
||||
#define SHMEM_TYPE float
|
||||
#endif
|
||||
|
||||
const uint32_t Ash_stride = BS_CRS + SHMEM_PAD;
|
||||
const uint32_t Bsh_stride = BS_NPQ + SHMEM_PAD;
|
||||
|
||||
|
|
@ -94,8 +111,8 @@ const uint32_t Bsh_numel = BS_CRS * BS_NPQ;
|
|||
const uint32_t Ash_len = BS_K * Ash_stride;
|
||||
const uint32_t Bsh_len = BS_CRS * Bsh_stride;
|
||||
|
||||
shared float Ash[Ash_len]; // K x CRS
|
||||
shared float Bsh[Bsh_len]; // CRS x NPQ
|
||||
shared SHMEM_TYPE Ash[Ash_len]; // K x CRS
|
||||
shared SHMEM_TYPE Bsh[Bsh_len]; // CRS x NPQ
|
||||
|
||||
// Threadtile sizes
|
||||
const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
|
||||
|
|
@ -104,10 +121,6 @@ const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
|
|||
const uint32_t NT_K = BS_K / TS_K;
|
||||
const uint32_t NT_NPQ = BS_NPQ / TS_NPQ;
|
||||
|
||||
float regA[TS_K];
|
||||
float regB[TS_NPQ];
|
||||
float regC[TS_K][TS_NPQ];
|
||||
|
||||
/*
|
||||
Compute
|
||||
KxCRS @ CRSxNPQ = K x NPQ
|
||||
|
|
@ -131,12 +144,44 @@ uint32_t Br = tid / BS_NPQ;
|
|||
uint32_t Bc = tid % BS_NPQ;
|
||||
const uint32_t BrpWg = WG_SIZE / BS_NPQ;
|
||||
|
||||
// see init_fastdiv_values in ggml-vulkan.cpp
|
||||
uint fastdiv(uint n, uint mp, uint L) {
|
||||
uint msbs, lsbs;
|
||||
// msbs = mulhi(n, mp)
|
||||
umulExtended(n, mp, msbs, lsbs);
|
||||
return (msbs + n) >> L;
|
||||
}
|
||||
|
||||
#ifdef COOPMAT2
|
||||
#define ACC_TYPE float16_t
|
||||
|
||||
ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem)
|
||||
{
|
||||
uint32_t K_idx = B_idx_K * BS_K + r;
|
||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + c;
|
||||
uint32_t N_idx = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // divide by p.OH * p.OW;
|
||||
uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
|
||||
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
|
||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
|
||||
if (K_idx < K && NPQ_idx < NPQ) {
|
||||
dst_data[dst_idx] = D_TYPE(elem);
|
||||
}
|
||||
return elem;
|
||||
}
|
||||
#endif
|
||||
|
||||
void main() {
|
||||
#ifdef COOPMAT2
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
|
||||
matC = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator>(0.0);
|
||||
#else
|
||||
float regC[TS_K][TS_NPQ];
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
regC[T_ly][T_lx] = 0.0;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
/* Advance block in CRS dim */
|
||||
for (uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++) {
|
||||
uint32_t CRS_idx_a;
|
||||
|
|
@ -151,9 +196,9 @@ void main() {
|
|||
uint32_t cached_KW_idx;
|
||||
if (use_collectives == 1) {
|
||||
cached_CRS_idx = B_idx_CRS * BS_CRS + gl_SubgroupInvocationID;
|
||||
cached_Cin_idx = cached_CRS_idx / (p.KW * p.KH);
|
||||
cached_Cin_idx = fastdiv(cached_CRS_idx, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
|
||||
uint32_t cached_CRS_remainder = (cached_CRS_idx - cached_Cin_idx * p.KW * p.KH);
|
||||
cached_KH_idx = cached_CRS_remainder / p.KW;
|
||||
cached_KH_idx = fastdiv(cached_CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
|
||||
cached_KW_idx = cached_CRS_remainder - cached_KH_idx * p.KW;
|
||||
|
||||
CRS_idx_a = subgroupShuffle(cached_CRS_idx, Ac);
|
||||
|
|
@ -162,16 +207,16 @@ void main() {
|
|||
KW_idx_a = subgroupShuffle(cached_KW_idx, Ac);
|
||||
} else {
|
||||
CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A)
|
||||
Cin_idx_a = CRS_idx_a / (p.KW * p.KH);
|
||||
Cin_idx_a = fastdiv(CRS_idx_a, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
|
||||
uint32_t CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
|
||||
KH_idx_a = CRS_remainder / p.KW;
|
||||
KH_idx_a = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
|
||||
KW_idx_a = CRS_remainder - KH_idx_a * p.KW;
|
||||
}
|
||||
#else
|
||||
CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A)
|
||||
Cin_idx_a = CRS_idx_a / (p.KW * p.KH);
|
||||
Cin_idx_a = fastdiv(CRS_idx_a, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH); / (p.KW * p.KH);
|
||||
CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
|
||||
KH_idx_a = CRS_remainder / p.KW;
|
||||
KH_idx_a = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
|
||||
KW_idx_a = CRS_remainder - KH_idx_a * p.KW;
|
||||
#endif
|
||||
|
||||
|
|
@ -185,16 +230,16 @@ void main() {
|
|||
if (K_idx >= K || CRS_idx_a >= CRS) {
|
||||
val = 0.0;
|
||||
}
|
||||
Ash[B_ly * Ash_stride + B_lx] = val;
|
||||
Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val);
|
||||
}
|
||||
/* Load input to B_block: (BS_CRS x BS_NPQ) */
|
||||
for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
|
||||
UNROLL for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
|
||||
uint32_t B_ly = r_offset + Br; /* Row index of B block */
|
||||
uint32_t B_lx = Bc;
|
||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + B_lx; /* Global NPQ index (column index of B) */
|
||||
uint32_t N_idx = NPQ_idx / (p.OH * p.OW);
|
||||
uint32_t N_idx = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // divide by p.OH * p.OW;
|
||||
uint32_t NPQ_remainder = NPQ_idx - N_idx * p.OH * p.OW;
|
||||
uint32_t OH_idx = NPQ_remainder / p.OW;
|
||||
uint32_t OH_idx = fastdiv(NPQ_remainder, p.OWmp, p.OWL); // divide by p.OW;
|
||||
uint32_t OW_idx = NPQ_remainder - OH_idx * p.OW;
|
||||
|
||||
uint32_t CRS_idx_b;
|
||||
|
|
@ -209,16 +254,16 @@ void main() {
|
|||
KW_idx_b = subgroupShuffle(cached_KW_idx, r_offset + Br);
|
||||
} else {
|
||||
CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
|
||||
Cin_idx_b = CRS_idx_b / (p.KW * p.KH);
|
||||
Cin_idx_b = fastdiv(CRS_idx_b, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
|
||||
uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
|
||||
KH_idx_b = CRS_remainder / p.KW;
|
||||
KH_idx_b = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
|
||||
KW_idx_b = CRS_remainder - KH_idx_b * p.KW;
|
||||
}
|
||||
#else
|
||||
CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
|
||||
Cin_idx_b = CRS_idx_b / (p.KW * p.KH);
|
||||
Cin_idx_b = fastdiv(CRS_idx_b, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
|
||||
uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
|
||||
KH_idx_b = CRS_remainder / p.KW;
|
||||
KH_idx_b = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
|
||||
KW_idx_b = CRS_remainder - KH_idx_b * p.KW;
|
||||
#endif
|
||||
|
||||
|
|
@ -230,36 +275,55 @@ void main() {
|
|||
if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) {
|
||||
val = 0.0;
|
||||
}
|
||||
Bsh[B_ly * Bsh_stride + B_lx] = val;
|
||||
Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val);
|
||||
}
|
||||
barrier();
|
||||
for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
|
||||
}
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx];
|
||||
}
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
#ifdef COOPMAT2
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, BS_K, BS_CRS, gl_MatrixUseA> matA;
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, BS_CRS, BS_NPQ, gl_MatrixUseB> matB;
|
||||
|
||||
coopMatLoad(matA, Ash, 0, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
coopMatLoad(matB, Bsh, 0, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
matC = coopMatMulAdd(matA, matB, matC);
|
||||
#else
|
||||
if (T_y * TS_K < K) {
|
||||
UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
|
||||
float regA[TS_K];
|
||||
float regB[TS_NPQ];
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
|
||||
}
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
|
||||
regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx];
|
||||
}
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
barrier();
|
||||
}
|
||||
/* Save C* */
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly;
|
||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
|
||||
uint32_t N_idx = NPQ_idx / (p.OH * p.OW);
|
||||
uint32_t OH_idx = (NPQ_idx - N_idx * p.OH * p.OW) / p.OW;
|
||||
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
|
||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
|
||||
if (K_idx < K && NPQ_idx < NPQ) {
|
||||
dst_data[dst_idx] = regC[T_ly][T_lx];
|
||||
#ifdef COOPMAT2
|
||||
coopMatPerElementNV(matC, matC, perElemOpStore);
|
||||
#else
|
||||
if (T_y * TS_K < K) {
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||
uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly;
|
||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
|
||||
uint32_t N_idx = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // divide by p.OH * p.OW;
|
||||
uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
|
||||
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
|
||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
|
||||
if (K_idx < K && NPQ_idx < NPQ) {
|
||||
dst_data[dst_idx] = regC[T_ly][T_lx];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,6 +26,9 @@ layout (push_constant) uniform parameter
|
|||
uint ne12;
|
||||
uint b_offset;
|
||||
uint d_offset;
|
||||
uint nb03;
|
||||
uint nb13;
|
||||
uint nb23;
|
||||
} p;
|
||||
|
||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
||||
|
|
@ -34,6 +37,7 @@ void main() {
|
|||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint row_x = gl_GlobalInvocationID.y;
|
||||
const uint channel = gl_GlobalInvocationID.z;
|
||||
const uint i3 = gl_WorkGroupID.x;
|
||||
const uint channel_x = channel / p.channel_x_divisor;
|
||||
const uint channel_y = channel % p.ne12;
|
||||
|
||||
|
|
@ -41,7 +45,7 @@ void main() {
|
|||
const uint nrows_dst = p.nrows_x;
|
||||
const uint row_dst = row_x;
|
||||
|
||||
const uint idst = channel*nrows_dst + row_dst;
|
||||
const uint idst = i3*p.nb23 + channel*nrows_dst + row_dst;
|
||||
|
||||
FLOAT_TYPE temp = 0.0f;
|
||||
|
||||
|
|
@ -58,8 +62,8 @@ void main() {
|
|||
|
||||
const uint row_y = col_x;
|
||||
|
||||
const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
|
||||
const uint iy = channel_y*p.channel_stride_y + row_y;
|
||||
const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
|
||||
const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
|
||||
|
||||
const vec4 av4 = vec4(data_a_v4[ix / 4]);
|
||||
const vec4 bv4 = vec4(data_b_v4[iy / 4]);
|
||||
|
|
@ -74,8 +78,8 @@ void main() {
|
|||
|
||||
const uint row_y = col_x;
|
||||
|
||||
const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
|
||||
const uint iy = channel_y*p.channel_stride_y + row_y;
|
||||
const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
|
||||
const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
|
||||
|
||||
const vec4 av4 = vec4(data_a_v4[ix / 4]);
|
||||
const vec4 bv4 = vec4(data_b_v4[iy / 4]);
|
||||
|
|
@ -91,8 +95,8 @@ void main() {
|
|||
|
||||
const uint row_y = col_x;
|
||||
|
||||
const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
|
||||
const uint iy = channel_y*p.channel_stride_y + row_y;
|
||||
const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
|
||||
const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
|
||||
|
||||
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
|
||||
|
||||
|
|
|
|||
|
|
@ -655,7 +655,16 @@ void process_shaders() {
|
|||
|
||||
string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});
|
||||
string_to_spv("conv2d_f32_unroll", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}});
|
||||
string_to_spv("conv2d_f16_f32_unroll", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}});
|
||||
|
||||
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
|
||||
string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
|
||||
|
||||
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}, {"COOPMAT2", "1"}}, true, false, true);
|
||||
string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}, {"COOPMAT2", "1"}}, true, false, true);
|
||||
#endif
|
||||
|
||||
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
|
||||
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
|
||||
|
|
|
|||
|
|
@ -279,6 +279,9 @@ class Keys:
|
|||
class Projector:
|
||||
STACK_FACTOR = "clip.audio.projector.stack_factor"
|
||||
|
||||
class Diffusion:
|
||||
SHIFT_LOGITS = "diffusion.shift_logits"
|
||||
|
||||
#
|
||||
# recommended mapping of model tensor names for storage in gguf
|
||||
#
|
||||
|
|
@ -373,9 +376,12 @@ class MODEL_ARCH(IntEnum):
|
|||
ERNIE4_5 = auto()
|
||||
ERNIE4_5_MOE = auto()
|
||||
HUNYUAN_MOE = auto()
|
||||
HUNYUAN_DENSE = auto()
|
||||
SMOLLM3 = auto()
|
||||
LFM2 = auto()
|
||||
DREAM = auto()
|
||||
SMALLTHINKER = auto()
|
||||
LLADA = auto()
|
||||
|
||||
|
||||
class VISION_PROJECTOR_TYPE(IntEnum):
|
||||
|
|
@ -692,9 +698,12 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|||
MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe",
|
||||
MODEL_ARCH.FALCON_H1: "falcon-h1",
|
||||
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
|
||||
MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense",
|
||||
MODEL_ARCH.SMOLLM3: "smollm3",
|
||||
MODEL_ARCH.LFM2: "lfm2",
|
||||
MODEL_ARCH.DREAM: "dream",
|
||||
MODEL_ARCH.SMALLTHINKER: "smallthinker",
|
||||
MODEL_ARCH.LLADA: "llada",
|
||||
}
|
||||
|
||||
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
||||
|
|
@ -1316,6 +1325,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.LLADA: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.QWEN2VL: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
|
@ -2449,6 +2473,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
],
|
||||
MODEL_ARCH.HUNYUAN_DENSE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.SMOLLM3: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
|
@ -2483,6 +2523,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
],
|
||||
MODEL_ARCH.SMALLTHINKER: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
|
@ -2704,6 +2762,7 @@ class VisionProjectorType:
|
|||
INTERNVL = "internvl"
|
||||
QWEN2A = "qwen2a" # audio
|
||||
QWEN25O = "qwen2.5o" # omni
|
||||
VOXTRAL = "voxtral"
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
|
|
|
|||
|
|
@ -1047,6 +1047,11 @@ class GGUFWriter:
|
|||
def add_audio_stack_factor(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)
|
||||
|
||||
# diffusion models
|
||||
|
||||
def add_diffusion_shift_logits(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Diffusion.SHIFT_LOGITS, value)
|
||||
|
||||
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
||||
pack_prefix = ''
|
||||
if not skip_pack_prefix:
|
||||
|
|
|
|||
|
|
@ -32,6 +32,8 @@ class TensorNameMap:
|
|||
"model.word_embeddings", # bailingmoe
|
||||
"language_model.model.embed_tokens", # llama4
|
||||
"encoder", # neobert
|
||||
"model.transformer.wte", # llada
|
||||
"embed_tokens", # qwen3-embedding
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
|
|
@ -71,6 +73,7 @@ class TensorNameMap:
|
|||
"head", # rwkv
|
||||
"head.out", # wavtokenizer
|
||||
"lm_head", # llama4
|
||||
"model.transformer.ff_out", # llada
|
||||
),
|
||||
        # Output norm

@@ -94,6 +97,7 @@ class TensorNameMap:
            "model.ln_out", # rwkv7
            "backbone.final_layer_norm", # wavtokenizer
            "model.norm", # llama4
            "model.transformer.ln_f", # llada
        ),

        # Rope frequencies

@@ -139,6 +143,8 @@ class TensorNameMap:
            "model.layers.{bid}.input_layernorm", # llama4
            "transformer_encoder.{bid}.attention_norm", # neobert
            "model.layers.{bid}.operator_norm", # lfm2
            "model.transformer.blocks.{bid}.attn_norm", # llada
            "layers.{bid}.input_layernorm", # qwen3-embedding
        ),

        # Attention norm 2

@@ -183,6 +189,8 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
            "transformer.h.{bid}.attn.attention.q_proj", # exaone
            "model.layers.{bid}.self_attn.q_proj", # llama4
            "model.transformer.blocks.{bid}.q_proj", # llada
            "layers.{bid}.self_attn.q_proj", # qwen3-embedding
        ),

        # Attention key

@@ -199,6 +207,8 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
            "transformer.h.{bid}.attn.attention.k_proj", # exaone
            "model.layers.{bid}.self_attn.k_proj", # llama4
            "model.transformer.blocks.{bid}.k_proj", # llada
            "layers.{bid}.self_attn.k_proj", # qwen3-embedding
        ),

        # Attention value

@@ -214,6 +224,8 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok
            "transformer.h.{bid}.attn.attention.v_proj", # exaone
            "model.layers.{bid}.self_attn.v_proj", # llama4
            "model.transformer.blocks.{bid}.v_proj", # llada
            "layers.{bid}.self_attn.v_proj", # qwen3-embedding
        ),

        # Attention output

@@ -246,6 +258,8 @@ class TensorNameMap:
            "transformer.h.{bid}.attn.attention.out_proj", # exaone
            "model.layers.{bid}.self_attn.o_proj", # llama4
            "transformer_encoder.{bid}.wo", # neobert
            "model.transformer.blocks.{bid}.attn_out", # llada
            "layers.{bid}.self_attn.o_proj", # qwen3-embedding
        ),

        # Attention output norm

@@ -291,6 +305,8 @@ class TensorNameMap:
            "model.layers.{bid}.post_attention_layernorm", # llama4
            "transformer_encoder.{bid}.ffn_norm", # neobert
            "model.layers.layers.{bid}.pre_mlp_norm", # plamo2
            "model.transformer.blocks.{bid}.ff_norm", # llada
            "layers.{bid}.post_attention_layernorm", # qwen3-embedding
        ),

        # Post feed-forward norm

@@ -317,6 +333,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.router", # llama4 jamba
            "encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe
            "model.layers.{bid}.mlp.gate.wg", # hunyuan
            "model.layers.{bid}.block_sparse_moe.primary_router", # smallthinker
        ),

        MODEL_TENSOR.FFN_GATE_INP_SHEXP: (

@@ -362,6 +379,9 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.c_fc_1", # exaone
            "model.layers.{bid}.feed_forward.up_proj", # llama4 jamba granite-hybrid
            "transformer_encoder.{bid}.ffn.w12", # neobert
            "model.layers.{bid}.block_sparse_moe.up", # smallthinker
            "model.transformer.blocks.{bid}.up_proj", # llada
            "layers.{bid}.mlp.up_proj", # qwen3-embedding
        ),

        MODEL_TENSOR.FFN_UP_EXP: (

@@ -372,6 +392,7 @@ class TensorNameMap:
            "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
            "model.layers.{bid}.feed_forward.experts.up_proj", # llama4
            "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
            "model.layers.{bid}.block_sparse_moe.experts.up", # smallthinker
        ),

        MODEL_TENSOR.FFN_UP_SHEXP: (

@@ -401,6 +422,9 @@ class TensorNameMap:
            "model.layers.{bid}.residual_mlp.w1", # arctic
            "transformer.h.{bid}.mlp.c_fc_0", # exaone
            "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid
            "model.layers.{bid}.block_sparse_moe.gate", # smallthinker
            "model.transformer.blocks.{bid}.ff_proj", # llada
            "layers.{bid}.mlp.gate_proj", # qwen3-embedding
        ),

        MODEL_TENSOR.FFN_GATE_EXP: (

@@ -410,6 +434,7 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) ernie4.5-moe
            "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
            "model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
            "model.layers.{bid}.block_sparse_moe.experts.gate", # smallthinker
        ),

        MODEL_TENSOR.FFN_GATE_SHEXP: (

@@ -448,6 +473,9 @@ class TensorNameMap:
            "model.layers.h.{bid}.mlp.c_proj", # exaone
            "model.layers.{bid}.feed_forward.down_proj", # llama4 jamba granite-hybrid
            "transformer_encoder.{bid}.ffn.w3", # neobert
            "model.layers.{bid}.block_sparse_moe.down", # smallthinker
            "model.transformer.blocks.{bid}.ff_out", # llada
            "layers.{bid}.mlp.down_proj", # qwen3-embedding
        ),

        MODEL_TENSOR.FFN_DOWN_EXP: (

@@ -459,6 +487,7 @@ class TensorNameMap:
            "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
            "model.layers.{bid}.feed_forward.experts.down_proj", # llama4
            "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
            "model.layers.{bid}.block_sparse_moe.experts.down", # smallthinker
        ),

        MODEL_TENSOR.FFN_DOWN_SHEXP: (

@@ -478,6 +507,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
            "transformer.layers.{bid}.attn.q_norm", # openelm
            "model.layers.layers.{bid}.mixer.q", # plamo2
            "layers.{bid}.self_attn.q_norm", # qwen3-embedding
        ),

        MODEL_TENSOR.ATTN_K_NORM: (

@@ -489,6 +519,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
            "transformer.layers.{bid}.attn.k_norm", # openelm
            "model.layers.layers.{bid}.mixer.k", # plamo2
            "layers.{bid}.self_attn.k_norm", # qwen3-embedding
        ),

        MODEL_TENSOR.ROPE_FREQS: (

@@ -597,6 +628,7 @@ class TensorNameMap:
        ),

        MODEL_TENSOR.SSM_DT_NORM: (
            "model.layers.layers.{bid}.mixer.dt_norm.weight", # plamo2
            "model.layers.{bid}.mamba.dt_layernorm", # jamba
        ),

@@ -626,10 +658,6 @@ class TensorNameMap:
            "model.layers.layers.{bid}.mixer.D", # plamo2
        ),

        MODEL_TENSOR.SSM_DT_NORM: (
            "model.layers.layers.{bid}.mixer.dt_norm.weight", # plamo2
        ),

        MODEL_TENSOR.SSM_NORM: (
            "model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid
            "backbone.layers.{bid}.mixer.norm", # mamba2

@@ -1,5 +1,6 @@
from __future__ import annotations

from enum import Enum
import re
import logging
import json

@@ -12,6 +13,25 @@ try:
except ImportError:
    SentencePieceProcessor = None

try:
    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
    from mistral_common.tokens.tokenizers.utils import (
        _filter_valid_tokenizer_files,
    )
    from mistral_common.tokens.tokenizers.sentencepiece import (
        SentencePieceTokenizer,
    )
except ImportError:
    _mistral_common_installed = False
    MistralTokenizer = None
    Tekkenizer = None
    SentencePieceTokenizer = None
    _filter_valid_tokenizer_files = None
else:
    _mistral_common_installed = True


import gguf

from .gguf_writer import GGUFWriter

@@ -292,7 +312,11 @@ class SpecialVocab:
        with open(config_file, encoding = 'utf-8') as f:
            config = json.load(f)
        for typ in self.special_token_types:
            self._set_special_token(typ, config.get(f'{typ}_token_id'))
            token_id = config.get(f'{typ}_token_id')
            # If not found at root, check in text_config (for multimodal models like Kimi-VL)
            if token_id is None and 'text_config' in config:
                token_id = config['text_config'].get(f'{typ}_token_id')
            self._set_special_token(typ, token_id)
        return True

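As a quick illustration of the special-token fallback above (an illustrative sketch only, not part of the diff; the nested config values below are made up, following the Kimi-VL-style layout the comment refers to):

    # Sketch: for multimodal configs, token ids may live under "text_config".
    config = {"text_config": {"bos_token_id": 1, "eos_token_id": 2}}

    def lookup_token_id(config: dict, typ: str):
        token_id = config.get(f"{typ}_token_id")
        if token_id is None and "text_config" in config:
            token_id = config["text_config"].get(f"{typ}_token_id")
        return token_id

    print(lookup_token_id(config, "bos"))  # -> 1
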
@@ -592,3 +616,262 @@ class LlamaHfVocab(Vocab):

    def __repr__(self) -> str:
        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


class MistralTokenizerType(str, Enum):
    spm = "spm"
    tekken = "tekken"


# Copied from Transformers (Apache 2.0)
# https://github.com/huggingface/transformers/blob/main/src/transformers/convert_slow_tokenizer.py#L1544
def bytes_to_unicode() -> dict[int, str]:
    """
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs_str = [chr(n) for n in cs]
    return dict(zip(bs, cs_str))

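A quick illustration of the byte-to-unicode mapping above (illustrative only, not part of the diff; the outputs follow from the byte ranges in `bytes_to_unicode` and match the familiar GPT-2-style byte symbols):

    byte_encoder = bytes_to_unicode()
    print(byte_encoder[ord("h")])                         # "h"  (printable ASCII maps to itself)
    print(byte_encoder[32])                               # "Ġ"  (the space byte is remapped above 255)
    print("".join(byte_encoder[b] for b in b"hi there"))  # "hiĠthere"
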
class MistralVocab(Vocab):
    tokenizer_model = "mistral"
    name = "mistral"

    added_tokens_dict: dict[str, int] = {}
    added_tokens_list: list[str] = []

    def __init__(self, base_path: Path):
        if not _mistral_common_installed:
            raise ImportError(
                "To use MistralVocab, please install the `mistral-common` package. "
                "You can install it with `pip install mistral-common`."
            )
        assert _filter_valid_tokenizer_files is not None, "mistral_common is not installed"
        assert MistralTokenizer is not None, "mistral_common is not installed"
        assert Tekkenizer is not None, "mistral_common is not installed"

        logger.info(f"Loading Mistral tokenizer from {base_path}")

        # Find the tokenizer files
        all_files = [f.as_posix() for f in base_path.glob("**/*") if f.is_file()]
        valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)

        if len(valid_tokenizer_files) == 0:
            raise ValueError(f"No tokenizer file found in the directory: {base_path}")
        # If there are multiple tokenizer files, we use tekken.json if it exists, otherwise the versioned one.
        if len(valid_tokenizer_files) > 1:
            if "tekken.json" in valid_tokenizer_files:
                tokenizer_file = "tekken.json"
            else:
                tokenizer_file = sorted(valid_tokenizer_files)[-1]
            logger.warning(
                f"Multiple tokenizer files found in {base_path}. Using {tokenizer_file}"
            )
        else:
            tokenizer_file = valid_tokenizer_files[0]

        self.tokenizer = MistralTokenizer.from_file(
            base_path / tokenizer_file
        ).instruct_tokenizer.tokenizer
        self.tokenizer_type = (
            MistralTokenizerType.tekken
            if isinstance(self.tokenizer, Tekkenizer)
            else MistralTokenizerType.spm
        )
        self.vocab_size = self.tokenizer.n_words
        self.fname_tokenizer = base_path / tokenizer_file
        self._name = (
            "mistral-" + self.tokenizer_type.value + "-" + self.tokenizer.version
        )

    @property
    def tokenizer_name(self) -> str:
        return self._name

    @property
    def gguf_tokenizer_model(self) -> str:
        return "llama" if self.tokenizer_type == MistralTokenizerType.spm else "gpt2"

    def _sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        assert SentencePieceTokenizer is not None, "mistral_common is not installed"
        assert isinstance(self.tokenizer, SentencePieceTokenizer), (
            f"Expected SentencePieceTokenizer, got {type(self.tokenizer)}"
        )

        for i in range(self.tokenizer._model.vocab_size()):
            piece = self.tokenizer._model.IdToPiece(i)
            text = piece.encode("utf-8")
            score: float = self.tokenizer._model.GetScore(i)

            toktype = gguf.TokenType.NORMAL
            if self.tokenizer._model.IsUnknown(i):
                toktype = gguf.TokenType.UNKNOWN
            if self.tokenizer._model.IsControl(i):
                toktype = gguf.TokenType.CONTROL

            if self.tokenizer._model.IsUnused(i):
                toktype = gguf.TokenType.UNUSED
            if self.tokenizer._model.IsByte(i):
                toktype = gguf.TokenType.BYTE

            yield text, score, toktype

    def _tekken_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        assert Tekkenizer is not None, "mistral_common is not installed"
        assert isinstance(self.tokenizer, Tekkenizer), (
            f"Expected Tekkenizer, got {type(self.tokenizer)}"
        )

        byte_encoder = bytes_to_unicode()
        for token_id in range(self.tokenizer.num_special_tokens):
            yield (
                self.tokenizer.id_to_piece(token_id).encode("utf-8"),
                0,
                gguf.TokenType.CONTROL
            )
        for token in self.tokenizer._tekken_token2id_nospecial:
            yield (
                self.token_bytes_to_string(token, byte_encoder).encode("utf-8"),
                0,
                gguf.TokenType.NORMAL,
            )

    def get_token_id(self, token: str) -> int:
        assert SentencePieceTokenizer is not None and Tekkenizer is not None, "mistral_common is not installed"
        if self.tokenizer_type == MistralTokenizerType.spm:
            assert isinstance(self.tokenizer, SentencePieceTokenizer)
            return self.tokenizer._vocab.index(token)
        elif self.tokenizer_type == MistralTokenizerType.tekken:
            assert isinstance(self.tokenizer, Tekkenizer)
            return (
                self.tokenizer._vocab.index(token) + self.tokenizer.num_special_tokens
            )
        else:
            raise ValueError(f"Unknown tokenizer type: {self.tokenizer_type}")

    @property
    def bos_id(self) -> int:
        return self.tokenizer.bos_id

    @property
    def eos_id(self) -> int:
        return self.tokenizer.eos_id

    @property
    def pad_id(self) -> int:
        if self.tokenizer.pad_id == -1:
            return self.eos_id
        return self.tokenizer.pad_id

    @property
    def unk_id(self) -> int:
        return self.tokenizer.unk_id

    @property
    def bos_token(self) -> str:
        return self.tokenizer.id_to_piece(self.tokenizer.bos_id)

    @property
    def eos_token(self) -> str:
        return self.tokenizer.id_to_piece(self.tokenizer.eos_id)

    @property
    def pad_token(self) -> str:
        return self.tokenizer.id_to_piece(self.tokenizer.pad_id)

    @property
    def unk_token(self) -> str:
        return self.tokenizer.id_to_piece(self.tokenizer.unk_id)

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        if self.tokenizer_type == MistralTokenizerType.spm:
            yield from self._sentencepiece_tokens()

        elif self.tokenizer_type == MistralTokenizerType.tekken:
            yield from self._tekken_tokens()

        else:
            raise ValueError(f"Unknown tokenizer type: {self.tokenizer_type}")

    @staticmethod
    def token_bytes_to_string(b, byte_encoder):
        return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")])

    def extract_vocab_merges_from_model(self):
        # Adapted from Transformers (Apache 2.0)
        # https://github.com/huggingface/transformers/blob/main/src/transformers/convert_slow_tokenizer.py
        assert Tekkenizer is not None and isinstance(self.tokenizer, Tekkenizer), (
            f"Expected Tekkenizer, got {type(self.tokenizer)}"
        )
        mergeable_ranks = self.tokenizer._model._mergeable_ranks
        token_bytes_map = {
            rank: token_bytes for token_bytes, rank in mergeable_ranks.items()
        }
        merge_pairs = []

        # Sort vocab by rank to ensure correct merge order
        for i in range(256, self.vocab_size - self.tokenizer.num_special_tokens):
            merged_token = token_bytes_map[i]
            local = []
            for j in range(1, len(merged_token)):
                left = merged_token[:j]
                right = merged_token[j:]
                if (
                    left in mergeable_ranks
                    and right in mergeable_ranks
                    and (left + right) in mergeable_ranks
                ):
                    local.append((left, right, i))
            if not local:
                raise ValueError(
                    f"Could not find valid merge for token at rank {i}: {merged_token.decode('latin-1')}"
                )
            local = sorted(
                local,
                key=lambda x: (mergeable_ranks[x[0]], mergeable_ranks[x[1]]),
                reverse=False,
            )
            merge_pairs.extend(local)
        merge_pairs = sorted(merge_pairs, key=lambda val: val[2], reverse=False)

        byte_encoder = bytes_to_unicode()

        decoded_merge_pairs = [
            [
                self.token_bytes_to_string(val[0], byte_encoder),
                self.token_bytes_to_string(val[1], byte_encoder),
            ]
            for val in merge_pairs
        ]

        merges = [
            " ".join(
                [
                    # ensure the spaces are properly encoded
                    "".join(chr(ord(c) + 256) if c == " " else c for c in part)
                    for part in pair
                ]
            )
            for pair in decoded_merge_pairs
        ]

        return merges

@@ -284,10 +284,11 @@ extern "C" {
        const struct llama_model_kv_override * kv_overrides;

        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only;    // only load the vocabulary, no weights
        bool use_mmap;      // use mmap if possible
        bool use_mlock;     // force system to keep model in RAM
        bool check_tensors; // validate model tensor data
        bool vocab_only;      // only load the vocabulary, no weights
        bool use_mmap;        // use mmap if possible
        bool use_mlock;       // force system to keep model in RAM
        bool check_tensors;   // validate model tensor data
        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

@@ -537,6 +538,9 @@ extern "C" {
    // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
    LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);

    // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
    LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);

    // Returns 0 on success
    LLAMA_API uint32_t llama_model_quantize(
            const char * fname_inp,

File diff suppressed because one or more lines are too long.

@@ -1,3 +1,5 @@
mistral-common>=1.8.3

-r ./requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
torch~=2.2.1; platform_machine != "s390x"

@@ -1,7 +1 @@
-r ./requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
torch~=2.2.1; platform_machine != "s390x"

# torch s390x packages can only be found from nightly builds
--extra-index-url https://download.pytorch.org/whl/nightly
torch>=0.0.0.dev0; platform_machine == "s390x"

@@ -1,3 +1,3 @@
docstring_parser~=0.15
pydantic~=2.6.3
pydantic~=2.11.7
requests

@@ -1,19 +1,41 @@
#!/usr/bin/env bash

if [ $# -lt 2 ]; then
    echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [additional llama-bench arguments]"
    echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [tool] [additional arguments]"
    echo "  tool: 'llama-bench' (default) or 'test-backend-ops'"
    echo "  additional arguments: passed to the selected tool"
    exit 1
fi

set -e
set -x

# Parse arguments
commit1=$1
commit2=$2
tool=${3:-llama-bench}
additional_args="${@:4}"

# Validate tool argument
if [ "$tool" != "llama-bench" ] && [ "$tool" != "test-backend-ops" ]; then
    echo "Error: tool must be 'llama-bench' or 'test-backend-ops'"
    exit 1
fi

# verify at the start that the compare script has all the necessary dependencies installed
./scripts/compare-llama-bench.py --check

bench_args="${@:3}"
if [ "$tool" = "llama-bench" ]; then
    db_file="llama-bench.sqlite"
    target="llama-bench"
    run_args="-o sql -oe md $additional_args"
else # test-backend-ops
    db_file="test-backend-ops.sqlite"
    target="test-backend-ops"
    run_args="perf --output sql $additional_args"
fi

rm -f llama-bench.sqlite > /dev/null
rm -f "$db_file" > /dev/null

# to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...)
if [ -n "$GGML_CUDA" ]; then

@@ -25,14 +47,14 @@ dir="build-bench"
function run {
    rm -fr ${dir} > /dev/null
    cmake -B ${dir} -S . ${CMAKE_OPTS} > /dev/null
    cmake --build ${dir} -t llama-bench > /dev/null
    ${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
    cmake --build ${dir} -t $target -j $(nproc) > /dev/null
    ${dir}/bin/$target $run_args | sqlite3 "$db_file"
}

git checkout $1 > /dev/null
git checkout $commit1 > /dev/null
run

git checkout $2 > /dev/null
git checkout $commit2 > /dev/null
run

./scripts/compare-llama-bench.py -b $1 -c $2
./scripts/compare-llama-bench.py -b $commit1 -c $commit2 --tool $tool -i "$db_file"

@@ -1,16 +1,16 @@
#!/usr/bin/env python3

import logging
import argparse
import heapq
import sys
import os
from glob import glob
import sqlite3
import json
import csv
from typing import Optional, Union
import heapq
import json
import logging
import os
import sqlite3
import sys
from collections.abc import Iterator, Sequence
from glob import glob
from typing import Any, Optional, Union

try:
    import git

@@ -23,7 +23,7 @@ except ImportError as e:
logger = logging.getLogger("compare-llama-bench")

# All llama-bench SQL fields
DB_FIELDS = [
LLAMA_BENCH_DB_FIELDS = [
    "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
    "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
    "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",

@@ -33,7 +33,7 @@ DB_FIELDS = [
    "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
]

DB_TYPES = [
LLAMA_BENCH_DB_TYPES = [
    "TEXT", "INTEGER", "TEXT", "TEXT", "TEXT", "TEXT",
    "TEXT", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
    "TEXT", "INTEGER", "INTEGER", "TEXT", "TEXT", "INTEGER",

@@ -42,20 +42,41 @@ DB_TYPES = [
    "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
    "TEXT", "INTEGER", "INTEGER", "REAL", "REAL",
]
assert len(DB_FIELDS) == len(DB_TYPES)

# Properties by which to differentiate results per commit:
KEY_PROPERTIES = [
# All test-backend-ops SQL fields
TEST_BACKEND_OPS_DB_FIELDS = [
    "test_time", "build_commit", "backend_name", "op_name", "op_params", "test_mode",
    "supported", "passed", "error_message", "time_us", "flops", "bandwidth_gb_s",
    "memory_kb", "n_runs"
]

TEST_BACKEND_OPS_DB_TYPES = [
    "TEXT", "TEXT", "TEXT", "TEXT", "TEXT", "TEXT",
    "INTEGER", "INTEGER", "TEXT", "REAL", "REAL", "REAL",
    "INTEGER", "INTEGER"
]

assert len(LLAMA_BENCH_DB_FIELDS) == len(LLAMA_BENCH_DB_TYPES)
assert len(TEST_BACKEND_OPS_DB_FIELDS) == len(TEST_BACKEND_OPS_DB_TYPES)

# Properties by which to differentiate results per commit for llama-bench:
LLAMA_BENCH_KEY_PROPERTIES = [
    "cpu_info", "gpu_info", "backends", "n_gpu_layers", "tensor_buft_overrides", "model_filename", "model_type",
    "n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v",
    "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth"
]

# Properties that are boolean and are converted to Yes/No for the table:
BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
# Properties by which to differentiate results per commit for test-backend-ops:
TEST_BACKEND_OPS_KEY_PROPERTIES = [
    "backend_name", "op_name", "op_params", "test_mode"
]

# Header names for the table:
PRETTY_NAMES = {
# Properties that are boolean and are converted to Yes/No for the table:
LLAMA_BENCH_BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
TEST_BACKEND_OPS_BOOL_PROPERTIES = ["supported", "passed"]

# Header names for the table (llama-bench):
LLAMA_BENCH_PRETTY_NAMES = {
    "cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
    "tensor_buft_overrides": "Tensor overrides", "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
    "model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size", "embeddings": "Embeddings",

@@ -64,21 +85,42 @@ PRETTY_NAMES = {
    "flash_attn": "FlashAttention",
}

DEFAULT_SHOW = ["model_type"]  # Always show these properties by default.
DEFAULT_HIDE = ["model_filename"]  # Always hide these properties by default.
# Header names for the table (test-backend-ops):
TEST_BACKEND_OPS_PRETTY_NAMES = {
    "backend_name": "Backend", "op_name": "GGML op", "op_params": "Op parameters", "test_mode": "Mode",
    "supported": "Supported", "passed": "Passed", "error_message": "Error",
    "flops": "FLOPS", "bandwidth_gb_s": "Bandwidth (GB/s)", "memory_kb": "Memory (KB)", "n_runs": "Runs"
}

DEFAULT_SHOW_LLAMA_BENCH = ["model_type"]  # Always show these properties by default.
DEFAULT_HIDE_LLAMA_BENCH = ["model_filename"]  # Always hide these properties by default.

DEFAULT_SHOW_TEST_BACKEND_OPS = ["backend_name", "op_name"]  # Always show these properties by default.
DEFAULT_HIDE_TEST_BACKEND_OPS = ["error_message"]  # Always hide these properties by default.

GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon "]  # Strip prefixes for smaller tables.
MODEL_SUFFIX_REPLACE = {" - Small": "_S", " - Medium": "_M", " - Large": "_L"}

DESCRIPTION = """Creates tables from llama-bench data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux):
DESCRIPTION = """Creates tables from llama-bench or test-backend-ops data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux):

For llama-bench:
$ git checkout master
$ make clean && make llama-bench
$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t llama-bench -j $(nproc)
$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
$ git checkout some_branch
$ make clean && make llama-bench
$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t llama-bench -j $(nproc)
$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
$ ./scripts/compare-llama-bench.py

For test-backend-ops:
$ git checkout master
$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t test-backend-ops -j $(nproc)
$ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite
$ git checkout some_branch
$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t test-backend-ops -j $(nproc)
$ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite
$ ./scripts/compare-llama-bench.py --tool test-backend-ops -i test-backend-ops.sqlite

Performance numbers from multiple runs per commit are averaged WITHOUT being weighted by the --repetitions parameter of llama-bench.
"""

@@ -96,6 +138,13 @@ help_c = (
    "Defaults to the non-master commit for which llama-bench was run most recently."
)
parser.add_argument("-c", "--compare", help=help_c)
help_t = (
    "The tool whose data is being compared. "
    "Either 'llama-bench' or 'test-backend-ops'. "
    "This determines the database schema and comparison logic used. "
    "If left unspecified, try to determine from the input file."
)
parser.add_argument("-t", "--tool", help=help_t, default=None, choices=[None, "llama-bench", "test-backend-ops"])
help_i = (
    "JSON/JSONL/SQLite/CSV files for comparing commits. "
    "Specify multiple times to use multiple input files (JSON/CSV only). "

@@ -114,7 +163,8 @@ parser.add_argument("-o", "--output", help=help_o, default="pipe")
help_s = (
    "Columns to add to the table. "
    "Accepts a comma-separated list of values. "
    f"Legal values: {', '.join(KEY_PROPERTIES[:-3])}. "
    f"Legal values for test-backend-ops: {', '.join(TEST_BACKEND_OPS_KEY_PROPERTIES)}. "
    f"Legal values for llama-bench: {', '.join(LLAMA_BENCH_KEY_PROPERTIES[:-3])}. "
    "Defaults to model name (model_type) and CPU and/or GPU name (cpu_info, gpu_info) "
    "plus any column where not all data points are the same. "
    "If the columns are manually specified, then the results for each unique combination of the "

@@ -142,8 +192,14 @@ if unknown_args:
    sys.exit(1)

input_file = known_args.input
if not input_file and os.path.exists("./llama-bench.sqlite"):
    input_file = ["llama-bench.sqlite"]
tool = known_args.tool

if not input_file:
    if tool == "llama-bench" and os.path.exists("./llama-bench.sqlite"):
        input_file = ["llama-bench.sqlite"]
    elif tool == "test-backend-ops" and os.path.exists("./test-backend-ops.sqlite"):
        input_file = ["test-backend-ops.sqlite"]

if not input_file:
    sqlite_files = glob("*.sqlite")
    if len(sqlite_files) == 1:

@@ -161,14 +217,23 @@ class LlamaBenchData:
    build_len_max: int
    build_len: int = 8
    builds: list[str] = []
    check_keys = set(KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"])
    tool: str = "llama-bench"  # Tool type: "llama-bench" or "test-backend-ops"

    def __init__(self):
    def __init__(self, tool: str = "llama-bench"):
        self.tool = tool
        try:
            self.repo = git.Repo(".", search_parent_directories=True)
        except git.InvalidGitRepositoryError:
            self.repo = None

        # Set schema-specific properties based on tool
        if self.tool == "llama-bench":
            self.check_keys = set(LLAMA_BENCH_KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"])
        elif self.tool == "test-backend-ops":
            self.check_keys = set(TEST_BACKEND_OPS_KEY_PROPERTIES + ["build_commit", "test_time"])
        else:
            assert False

    def _builds_init(self):
        self.build_len = self.build_len_min

@@ -252,52 +317,121 @@ class LlamaBenchData:
class LlamaBenchDataSQLite3(LlamaBenchData):
    connection: sqlite3.Connection
    cursor: sqlite3.Cursor
    table_name: str

    def __init__(self):
        super().__init__()
    def __init__(self, tool: str = "llama-bench"):
        super().__init__(tool)
        self.connection = sqlite3.connect(":memory:")
        self.cursor = self.connection.cursor()
        self.cursor.execute(f"CREATE TABLE test({', '.join(' '.join(x) for x in zip(DB_FIELDS, DB_TYPES))});")

        # Set table name and schema based on tool
        if self.tool == "llama-bench":
            self.table_name = "llama_bench"
            db_fields = LLAMA_BENCH_DB_FIELDS
            db_types = LLAMA_BENCH_DB_TYPES
        elif self.tool == "test-backend-ops":
            self.table_name = "test_backend_ops"
            db_fields = TEST_BACKEND_OPS_DB_FIELDS
            db_types = TEST_BACKEND_OPS_DB_TYPES
        else:
            assert False

        self.cursor.execute(f"CREATE TABLE {self.table_name}({', '.join(' '.join(x) for x in zip(db_fields, db_types))});")

    def _builds_init(self):
        if self.connection:
            self.build_len_min = self.cursor.execute("SELECT MIN(LENGTH(build_commit)) from test;").fetchone()[0]
            self.build_len_max = self.cursor.execute("SELECT MAX(LENGTH(build_commit)) from test;").fetchone()[0]
            self.build_len_min = self.cursor.execute(f"SELECT MIN(LENGTH(build_commit)) from {self.table_name};").fetchone()[0]
            self.build_len_max = self.cursor.execute(f"SELECT MAX(LENGTH(build_commit)) from {self.table_name};").fetchone()[0]

            if self.build_len_min != self.build_len_max:
                logger.warning("Data contains commit hashes of differing lengths. It's possible that the wrong commits will be compared. "
                               "Try purging the the database of old commits.")
                self.cursor.execute(f"UPDATE test SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});")
                self.cursor.execute(f"UPDATE {self.table_name} SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});")

            builds = self.cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
            builds = self.cursor.execute(f"SELECT DISTINCT build_commit FROM {self.table_name};").fetchall()
            self.builds = list(map(lambda b: b[0], builds))  # list[tuple[str]] -> list[str]
        super()._builds_init()

    def builds_timestamp(self, reverse: bool = False) -> Union[Iterator[tuple], Sequence[tuple]]:
        data = self.cursor.execute(
            "SELECT build_commit, test_time FROM test ORDER BY test_time;").fetchall()
            f"SELECT build_commit, test_time FROM {self.table_name} ORDER BY test_time;").fetchall()
        return reversed(data) if reverse else data

    def get_rows(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
        if self.tool == "llama-bench":
            return self._get_rows_llama_bench(properties, hexsha8_baseline, hexsha8_compare)
        elif self.tool == "test-backend-ops":
            return self._get_rows_test_backend_ops(properties, hexsha8_baseline, hexsha8_compare)
        else:
            assert False

    def _get_rows_llama_bench(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
        select_string = ", ".join(
            [f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "tb.n_depth", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"])
        equal_string = " AND ".join(
            [f"tb.{p} = tc.{p}" for p in KEY_PROPERTIES] + [
            [f"tb.{p} = tc.{p}" for p in LLAMA_BENCH_KEY_PROPERTIES] + [
                f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"]
        )
        group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt", "tb.n_depth"])
        query = (f"SELECT {select_string} FROM test tb JOIN test tc ON {equal_string} "
        query = (f"SELECT {select_string} FROM {self.table_name} tb JOIN {self.table_name} tc ON {equal_string} "
                 f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
        return self.cursor.execute(query).fetchall()

    def _get_rows_test_backend_ops(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
        # For test-backend-ops, we compare FLOPS and bandwidth metrics (prioritizing FLOPS over bandwidth)
        select_string = ", ".join(
            [f"tb.{p}" for p in properties] + [
                "AVG(tb.flops)", "AVG(tc.flops)",
                "AVG(tb.bandwidth_gb_s)", "AVG(tc.bandwidth_gb_s)"
            ])
        equal_string = " AND ".join(
            [f"tb.{p} = tc.{p}" for p in TEST_BACKEND_OPS_KEY_PROPERTIES] + [
                f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'",
                "tb.supported = 1", "tc.supported = 1", "tb.passed = 1", "tc.passed = 1"]  # Only compare successful tests
        )
        group_order_string = ", ".join([f"tb.{p}" for p in properties])
        query = (f"SELECT {select_string} FROM {self.table_name} tb JOIN {self.table_name} tc ON {equal_string} "
                 f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
        return self.cursor.execute(query).fetchall()


class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
    def __init__(self, data_file: str):
        super().__init__()
    def __init__(self, data_file: str, tool: Any):
        super().__init__(tool)

        self.connection.close()
        self.connection = sqlite3.connect(data_file)
        self.cursor = self.connection.cursor()

        # Check which table exists in the database
        tables = self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
        table_names = [table[0] for table in tables]

        # Tool selection logic
        if tool is None:
            if "llama_bench" in table_names:
                self.table_name = "llama_bench"
                self.tool = "llama-bench"
            elif "test_backend_ops" in table_names:
                self.table_name = "test_backend_ops"
                self.tool = "test-backend-ops"
            else:
                raise RuntimeError(f"No suitable table found in database. Available tables: {table_names}")
        elif tool == "llama-bench":
            if "llama_bench" in table_names:
                self.table_name = "llama_bench"
                self.tool = "llama-bench"
            else:
                raise RuntimeError(f"Table 'test' not found for tool 'llama-bench'. Available tables: {table_names}")
        elif tool == "test-backend-ops":
            if "test_backend_ops" in table_names:
                self.table_name = "test_backend_ops"
                self.tool = "test-backend-ops"
            else:
                raise RuntimeError(f"Table 'test_backend_ops' not found for tool 'test-backend-ops'. Available tables: {table_names}")
        else:
            raise RuntimeError(f"Unknown tool: {tool}")

        self._builds_init()

    @staticmethod

@@ -317,20 +451,23 @@ class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):


class LlamaBenchDataJSONL(LlamaBenchDataSQLite3):
    def __init__(self, data_file: str):
        super().__init__()
    def __init__(self, data_file: str, tool: str = "llama-bench"):
        super().__init__(tool)

        # Get the appropriate field list based on tool
        db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS

        with open(data_file, "r", encoding="utf-8") as fp:
            for i, line in enumerate(fp):
                parsed = json.loads(line)

                for k in parsed.keys() - set(DB_FIELDS):
                for k in parsed.keys() - set(db_fields):
                    del parsed[k]

                if (missing_keys := self._check_keys(parsed.keys())):
                    raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")

                self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
                self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))

        self._builds_init()

@@ -349,21 +486,24 @@ class LlamaBenchDataJSONL(LlamaBenchDataSQLite3):


class LlamaBenchDataJSON(LlamaBenchDataSQLite3):
    def __init__(self, data_files: list[str]):
        super().__init__()
    def __init__(self, data_files: list[str], tool: str = "llama-bench"):
        super().__init__(tool)

        # Get the appropriate field list based on tool
        db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS

        for data_file in data_files:
            with open(data_file, "r", encoding="utf-8") as fp:
                parsed = json.load(fp)

                for i, entry in enumerate(parsed):
                    for k in entry.keys() - set(DB_FIELDS):
                    for k in entry.keys() - set(db_fields):
                        del entry[k]

                    if (missing_keys := self._check_keys(entry.keys())):
                        raise RuntimeError(f"Missing required data key(s) at entry {i + 1}: {', '.join(missing_keys)}")

                    self.cursor.execute(f"INSERT INTO test({', '.join(entry.keys())}) VALUES({', '.join('?' * len(entry))});", tuple(entry.values()))
                    self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(entry.keys())}) VALUES({', '.join('?' * len(entry))});", tuple(entry.values()))

        self._builds_init()

@@ -384,21 +524,24 @@ class LlamaBenchDataJSON(LlamaBenchDataSQLite3):


class LlamaBenchDataCSV(LlamaBenchDataSQLite3):
    def __init__(self, data_files: list[str]):
        super().__init__()
    def __init__(self, data_files: list[str], tool: str = "llama-bench"):
        super().__init__(tool)

        # Get the appropriate field list based on tool
        db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS

        for data_file in data_files:
            with open(data_file, "r", encoding="utf-8") as fp:
                for i, parsed in enumerate(csv.DictReader(fp)):
                    keys = set(parsed.keys())

                    for k in keys - set(DB_FIELDS):
                    for k in keys - set(db_fields):
                        del parsed[k]

                    if (missing_keys := self._check_keys(keys)):
                        raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")

                    self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
                    self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))

        self._builds_init()

@@ -419,21 +562,90 @@ class LlamaBenchDataCSV(LlamaBenchDataSQLite3):
        return True


def format_flops(flops_value: float) -> str:
    """Format FLOPS values with appropriate units for better readability."""
    if flops_value == 0:
        return "0.00"

    # Define unit thresholds and names
    units = [
        (1e12, "T"),  # TeraFLOPS
        (1e9, "G"),   # GigaFLOPS
        (1e6, "M"),   # MegaFLOPS
        (1e3, "k"),   # kiloFLOPS
        (1, "")       # FLOPS
    ]

    for threshold, unit in units:
        if abs(flops_value) >= threshold:
            formatted_value = flops_value / threshold
            if formatted_value >= 100:
                return f"{formatted_value:.1f}{unit}"
            else:
                return f"{formatted_value:.2f}{unit}"

    # Fallback for very small values
    return f"{flops_value:.2f}"


def format_flops_for_table(flops_value: float, target_unit: str) -> str:
    """Format FLOPS values for table display without unit suffix (since unit is in header)."""
    if flops_value == 0:
        return "0.00"

    # Define unit thresholds based on target unit
    unit_divisors = {
        "TFLOPS": 1e12,
        "GFLOPS": 1e9,
        "MFLOPS": 1e6,
        "kFLOPS": 1e3,
        "FLOPS": 1
    }

    divisor = unit_divisors.get(target_unit, 1)
    formatted_value = flops_value / divisor

    if formatted_value >= 100:
        return f"{formatted_value:.1f}"
    else:
        return f"{formatted_value:.2f}"


def get_flops_unit_name(flops_values: list) -> str:
    """Determine the best FLOPS unit name based on the magnitude of values."""
    if not flops_values or all(v == 0 for v in flops_values):
        return "FLOPS"

    # Find the maximum absolute value to determine appropriate unit
    max_flops = max(abs(v) for v in flops_values if v != 0)

    if max_flops >= 1e12:
        return "TFLOPS"
    elif max_flops >= 1e9:
        return "GFLOPS"
    elif max_flops >= 1e6:
        return "MFLOPS"
    elif max_flops >= 1e3:
        return "kFLOPS"
    else:
        return "FLOPS"

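A few example values for the formatting helpers above (illustrative only, not part of the diff; the inputs are arbitrary and the outputs follow from the thresholds in the functions):

    print(format_flops(2.5e9))                       # "2.50G"
    print(format_flops(1.234e14))                    # "123.4T"
    print(format_flops_for_table(2.5e9, "GFLOPS"))   # "2.50"
    print(get_flops_unit_name([2.5e9, 8.1e10]))      # "GFLOPS"
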
bench_data = None
if len(input_file) == 1:
    if LlamaBenchDataSQLite3File.valid_format(input_file[0]):
        bench_data = LlamaBenchDataSQLite3File(input_file[0])
        bench_data = LlamaBenchDataSQLite3File(input_file[0], tool)
    elif LlamaBenchDataJSON.valid_format(input_file):
        bench_data = LlamaBenchDataJSON(input_file)
        bench_data = LlamaBenchDataJSON(input_file, tool)
    elif LlamaBenchDataJSONL.valid_format(input_file[0]):
        bench_data = LlamaBenchDataJSONL(input_file[0])
        bench_data = LlamaBenchDataJSONL(input_file[0], tool)
    elif LlamaBenchDataCSV.valid_format(input_file):
        bench_data = LlamaBenchDataCSV(input_file)
        bench_data = LlamaBenchDataCSV(input_file, tool)
else:
    if LlamaBenchDataJSON.valid_format(input_file):
        bench_data = LlamaBenchDataJSON(input_file)
        bench_data = LlamaBenchDataJSON(input_file, tool)
    elif LlamaBenchDataCSV.valid_format(input_file):
        bench_data = LlamaBenchDataCSV(input_file)
        bench_data = LlamaBenchDataCSV(input_file, tool)

if not bench_data:
    raise RuntimeError("No valid (or some invalid) input files found.")

@@ -504,12 +716,29 @@ else:
name_compare = bench_data.get_commit_name(hexsha8_compare)

# Get tool-specific configuration
if tool == "llama-bench":
    key_properties = LLAMA_BENCH_KEY_PROPERTIES
    bool_properties = LLAMA_BENCH_BOOL_PROPERTIES
    pretty_names = LLAMA_BENCH_PRETTY_NAMES
    default_show = DEFAULT_SHOW_LLAMA_BENCH
    default_hide = DEFAULT_HIDE_LLAMA_BENCH
elif tool == "test-backend-ops":
    key_properties = TEST_BACKEND_OPS_KEY_PROPERTIES
    bool_properties = TEST_BACKEND_OPS_BOOL_PROPERTIES
    pretty_names = TEST_BACKEND_OPS_PRETTY_NAMES
    default_show = DEFAULT_SHOW_TEST_BACKEND_OPS
    default_hide = DEFAULT_HIDE_TEST_BACKEND_OPS
else:
    assert False

# If the user provided columns to group the results by, use them:
if known_args.show is not None:
    show = known_args.show.split(",")
    unknown_cols = []
    for prop in show:
        if prop not in KEY_PROPERTIES[:-3]:  # Last three values are n_prompt, n_gen, n_depth.
        valid_props = key_properties if tool == "test-backend-ops" else key_properties[:-3]  # Exclude n_prompt, n_gen, n_depth for llama-bench
        if prop not in valid_props:
            unknown_cols.append(prop)
    if unknown_cols:
        logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}")

@@ -518,32 +747,54 @@ if known_args.show is not None:
    rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare)
# Otherwise, select those columns where the values are not all the same:
else:
    rows_full = bench_data.get_rows(KEY_PROPERTIES, hexsha8_baseline, hexsha8_compare)
    rows_full = bench_data.get_rows(key_properties, hexsha8_baseline, hexsha8_compare)
    properties_different = []
    for i, kp_i in enumerate(KEY_PROPERTIES):
        if kp_i in DEFAULT_SHOW or kp_i in ["n_prompt", "n_gen", "n_depth"]:
            continue
        for row_full in rows_full:
            if row_full[i] != rows_full[0][i]:
                properties_different.append(kp_i)
                break

    if tool == "llama-bench":
        # For llama-bench, skip n_prompt, n_gen, n_depth from differentiation logic
        check_properties = [kp for kp in key_properties if kp not in ["n_prompt", "n_gen", "n_depth"]]
        for i, kp_i in enumerate(key_properties):
            if kp_i in default_show or kp_i in ["n_prompt", "n_gen", "n_depth"]:
                continue
            for row_full in rows_full:
                if row_full[i] != rows_full[0][i]:
                    properties_different.append(kp_i)
                    break
    elif tool == "test-backend-ops":
        # For test-backend-ops, check all key properties
        for i, kp_i in enumerate(key_properties):
            if kp_i in default_show:
                continue
            for row_full in rows_full:
                if row_full[i] != rows_full[0][i]:
                    properties_different.append(kp_i)
                    break
    else:
        assert False

    show = []
    # Show CPU and/or GPU by default even if the hardware for all results is the same:
    if rows_full and "n_gpu_layers" not in properties_different:
        ngl = int(rows_full[0][KEY_PROPERTIES.index("n_gpu_layers")])

        if ngl != 99 and "cpu_info" not in properties_different:
            show.append("cpu_info")
    if tool == "llama-bench":
        # Show CPU and/or GPU by default even if the hardware for all results is the same:
        if rows_full and "n_gpu_layers" not in properties_different:
            ngl = int(rows_full[0][key_properties.index("n_gpu_layers")])

    show += properties_different
            if ngl != 99 and "cpu_info" not in properties_different:
                show.append("cpu_info")

    index_default = 0
    for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]:
        if prop in show:
            index_default += 1
    show = show[:index_default] + DEFAULT_SHOW + show[index_default:]
    for prop in DEFAULT_HIDE:
        show += properties_different

        index_default = 0
        for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]:
            if prop in show:
                index_default += 1
        show = show[:index_default] + default_show + show[index_default:]
    elif tool == "test-backend-ops":
        show = default_show + properties_different
    else:
        assert False

    for prop in default_hide:
        try:
            show.remove(prop)
        except ValueError:

@@ -551,7 +802,7 @@ else:

# Add plot_x parameter to parameters to show if it's not already present:
if known_args.plot:
    for k, v in PRETTY_NAMES.items():
    for k, v in pretty_names.items():
        if v == known_args.plot_x and k not in show:
            show.append(k)
            break

@@ -563,60 +814,120 @@ if not rows_show:
    sys.exit(1)

table = []
for row in rows_show:
    n_prompt = int(row[-5])
    n_gen = int(row[-4])
    n_depth = int(row[-3])
    if n_prompt != 0 and n_gen == 0:
        test_name = f"pp{n_prompt}"
    elif n_prompt == 0 and n_gen != 0:
        test_name = f"tg{n_gen}"
    else:
        test_name = f"pp{n_prompt}+tg{n_gen}"
    if n_depth != 0:
        test_name = f"{test_name}@d{n_depth}"
    #           Regular columns   test name     avg t/s values    Speedup
    #           VVVVVVVVVVVVV     VVVVVVVVV     VVVVVVVVVVVVVV    VVVVVVV
    table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
primary_metric = "FLOPS"  # Default to FLOPS for test-backend-ops

if tool == "llama-bench":
    # For llama-bench, create test names and compare avg_ts values
    for row in rows_show:
        n_prompt = int(row[-5])
        n_gen = int(row[-4])
        n_depth = int(row[-3])
        if n_prompt != 0 and n_gen == 0:
            test_name = f"pp{n_prompt}"
        elif n_prompt == 0 and n_gen != 0:
            test_name = f"tg{n_gen}"
        else:
            test_name = f"pp{n_prompt}+tg{n_gen}"
        if n_depth != 0:
            test_name = f"{test_name}@d{n_depth}"
        #           Regular columns   test name     avg t/s values    Speedup
        #           VVVVVVVVVVVVV     VVVVVVVVV     VVVVVVVVVVVVVV    VVVVVVV
        table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
elif tool == "test-backend-ops":
    # Determine the primary metric by checking rows until we find one with valid data
    if rows_show:
        primary_metric = "FLOPS"  # Default to FLOPS
        flops_values = []

        # Collect all FLOPS values to determine the best unit
        for sample_row in rows_show:
            baseline_flops = float(sample_row[-4])
            compare_flops = float(sample_row[-3])
            baseline_bandwidth = float(sample_row[-2])

            if baseline_flops > 0:
                flops_values.extend([baseline_flops, compare_flops])
            elif baseline_bandwidth > 0 and not flops_values:
                primary_metric = "Bandwidth (GB/s)"

        # If we have FLOPS data, determine the appropriate unit
        if flops_values:
            primary_metric = get_flops_unit_name(flops_values)

    # For test-backend-ops, prioritize FLOPS > bandwidth for comparison
    for row in rows_show:
        # Extract metrics: flops, bandwidth_gb_s (baseline and compare)
        baseline_flops = float(row[-4])
        compare_flops = float(row[-3])
        baseline_bandwidth = float(row[-2])
        compare_bandwidth = float(row[-1])

        # Determine which metric to use for comparison (prioritize FLOPS > bandwidth)
        if baseline_flops > 0 and compare_flops > 0:
            # Use FLOPS comparison (higher is better)
            speedup = compare_flops / baseline_flops
            baseline_str = format_flops_for_table(baseline_flops, primary_metric)
            compare_str = format_flops_for_table(compare_flops, primary_metric)
        elif baseline_bandwidth > 0 and compare_bandwidth > 0:
            # Use bandwidth comparison (higher is better)
            speedup = compare_bandwidth / baseline_bandwidth
            baseline_str = f"{baseline_bandwidth:.2f}"
            compare_str = f"{compare_bandwidth:.2f}"
        else:
            # Fallback if no valid data is available
            baseline_str = "N/A"
            compare_str = "N/A"
            from math import nan
            speedup = nan

        table.append(list(row[:-4]) + [baseline_str, compare_str, speedup])
else:
    assert False

# Some a-posteriori fixes to make the table contents prettier:
for bool_property in BOOL_PROPERTIES:
for bool_property in bool_properties:
    if bool_property in show:
        ip = show.index(bool_property)
        for row_table in table:
            row_table[ip] = "Yes" if int(row_table[ip]) == 1 else "No"

if "model_type" in show:
    ip = show.index("model_type")
    for (old, new) in MODEL_SUFFIX_REPLACE.items():
if tool == "llama-bench":
    if "model_type" in show:
        ip = show.index("model_type")
        for (old, new) in MODEL_SUFFIX_REPLACE.items():
            for row_table in table:
                row_table[ip] = row_table[ip].replace(old, new)

    if "model_size" in show:
        ip = show.index("model_size")
        for row_table in table:
            row_table[ip] = row_table[ip].replace(old, new)
            row_table[ip] = float(row_table[ip]) / 1024 ** 3

if "model_size" in show:
    ip = show.index("model_size")
    for row_table in table:
        row_table[ip] = float(row_table[ip]) / 1024 ** 3
    if "gpu_info" in show:
        ip = show.index("gpu_info")
        for row_table in table:
            for gns in GPU_NAME_STRIP:
                row_table[ip] = row_table[ip].replace(gns, "")

if "gpu_info" in show:
    ip = show.index("gpu_info")
    for row_table in table:
        for gns in GPU_NAME_STRIP:
            row_table[ip] = row_table[ip].replace(gns, "")
            gpu_names = row_table[ip].split(", ")
            num_gpus = len(gpu_names)
            all_names_the_same = len(set(gpu_names)) == 1
            if len(gpu_names) >= 2 and all_names_the_same:
                row_table[ip] = f"{num_gpus}x {gpu_names[0]}"

        gpu_names = row_table[ip].split(", ")
        num_gpus = len(gpu_names)
        all_names_the_same = len(set(gpu_names)) == 1
        if len(gpu_names) >= 2 and all_names_the_same:
            row_table[ip] = f"{num_gpus}x {gpu_names[0]}"

headers = [PRETTY_NAMES[p] for p in show]
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
headers = [pretty_names.get(p, p) for p in show]
if tool == "llama-bench":
    headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
elif tool == "test-backend-ops":
    headers += [f"{primary_metric} {name_baseline}", f"{primary_metric} {name_compare}", "Speedup"]
else:
    assert False

if known_args.plot:
    def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False):
    def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False, tool_type: str = "llama-bench", metric_name: str = "t/s"):
        try:
            import matplotlib.pyplot as plt
            import matplotlib
            import matplotlib.pyplot as plt
            matplotlib.use('Agg')
        except ImportError as e:
            logger.error("matplotlib is required for --plot.")

@@ -627,7 +938,7 @@ if known_args.plot:
        plot_x_label = plot_x_param

        if plot_x_param not in ["n_prompt", "n_gen", "n_depth"]:
            pretty_name = PRETTY_NAMES.get(plot_x_param, plot_x_param)
            pretty_name = LLAMA_BENCH_PRETTY_NAMES.get(plot_x_param, plot_x_param)
            if pretty_name in data_headers:
                plot_x_index = data_headers.index(pretty_name)
                plot_x_label = pretty_name

@@ -746,8 +1057,16 @@ if known_args.plot:

        title = ', '.join(title_parts) if title_parts else "Performance comparison"

        # Determine y-axis label based on tool type
        if tool_type == "llama-bench":
            y_label = "Tokens per second (t/s)"
        elif tool_type == "test-backend-ops":
            y_label = metric_name
        else:
            assert False

        ax.set_xlabel(plot_x_label, fontsize=12, fontweight='bold')
        ax.set_ylabel('Tokens per second (t/s)', fontsize=12, fontweight='bold')
        ax.set_ylabel(y_label, fontsize=12, fontweight='bold')
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.legend(loc='best', fontsize=10)
        ax.grid(True, alpha=0.3)

@@ -765,7 +1084,7 @@ if known_args.plot:
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()

    create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale)
    create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale, tool, primary_metric)

print(tabulate(  # noqa: NP100
    table,

Some files were not shown because too many files have changed in this diff.