Merge branch 'ggml-org:master' into master
This commit is contained in:
commit
4c5874282b
|
|
@ -0,0 +1,130 @@
|
||||||
|
# ==============================================================================
|
||||||
|
# ARGUMENTS
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
# Define the CANN base image for easier version updates later
|
||||||
|
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# BUILD STAGE
|
||||||
|
# Compile all binary files and libraries
|
||||||
|
# ==============================================================================
|
||||||
|
FROM ${CANN_BASE_IMAGE} AS build
|
||||||
|
|
||||||
|
# Define the Ascend chip model for compilation. Default is Ascend910B3
|
||||||
|
ARG ASCEND_SOC_TYPE=Ascend910B3
|
||||||
|
|
||||||
|
# -- Install build dependencies --
|
||||||
|
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
|
||||||
|
yum clean all && \
|
||||||
|
rm -rf /var/cache/yum
|
||||||
|
|
||||||
|
# -- Set the working directory --
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# -- Copy project files --
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# -- Set CANN environment variables (required for compilation) --
|
||||||
|
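# ENV is used instead of `source` because variables exported inside one RUN step are lost afterwards, whereas ENV values persist for every later instruction in this stage and for containers started from it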
|
||||||
|
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
|
||||||
|
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
|
||||||
|
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
|
||||||
|
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
|
||||||
|
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
|
||||||
|
# TODO: add further CANN environment variables here if later steps need them (the build step below additionally sources set_env.sh)
|
||||||
|
# Only the core CANN variables are set via ENV in this stage; extend the list above when more are required.
|
||||||
|
|
||||||
|
# -- Build llama.cpp --
|
||||||
|
# Use the passed ASCEND_SOC_TYPE argument and add general build options
|
||||||
|
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
|
||||||
|
&& \
|
||||||
|
cmake -B build \
|
||||||
|
-DGGML_CANN=ON \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DSOC_TYPE=${ASCEND_SOC_TYPE} \
|
||||||
|
. && \
|
||||||
|
cmake --build build --config Release -j$(nproc)
|
||||||
|
|
||||||
|
# -- Organize build artifacts for copying in later stages --
|
||||||
|
# Create a lib directory to store all .so files
|
||||||
|
RUN mkdir -p /app/lib && \
|
||||||
|
find build -name "*.so" -exec cp {} /app/lib \;
|
||||||
|
|
||||||
|
# Create a full directory to store all executables and Python scripts
|
||||||
|
# Collect everything the `full` target needs into /app/full: the built binaries,
# the top-level Python scripts, gguf-py, the requirements files and tools.sh.
# tools.sh is the ENTRYPOINT of the `full` target (/app/tools.sh), so it has to be
# copied here; without it the `full` image cannot start.
RUN mkdir -p /app/full && \
    cp build/bin/* /app/full/ && \
    cp *.py /app/full/ && \
    cp -r gguf-py /app/full/ && \
    cp -r requirements /app/full/ && \
    cp requirements.txt /app/full/ && \
    cp .devops/tools.sh /app/full/tools.sh
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# BASE STAGE
|
||||||
|
# Create a minimal base image with CANN runtime and common libraries
|
||||||
|
# ==============================================================================
|
||||||
|
FROM ${CANN_BASE_IMAGE} AS base
|
||||||
|
|
||||||
|
# -- Install runtime dependencies --
|
||||||
|
RUN yum install -y libgomp curl && \
|
||||||
|
yum clean all && \
|
||||||
|
rm -rf /var/cache/yum
|
||||||
|
|
||||||
|
# -- Set CANN environment variables (required for runtime) --
|
||||||
|
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
|
||||||
|
ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
|
||||||
|
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
|
||||||
|
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
|
||||||
|
# TODO: add further CANN runtime environment variables here if they turn out to be needed.
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Copy compiled .so files from the build stage
|
||||||
|
COPY --from=build /app/lib/ /app
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# FINAL STAGES (TARGETS)
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
### Target: full
|
||||||
|
# Complete image with all tools, Python bindings, and dependencies
|
||||||
|
# ==============================================================================
|
||||||
|
FROM base AS full
|
||||||
|
|
||||||
|
COPY --from=build /app/full /app
|
||||||
|
|
||||||
|
# Install Python dependencies
|
||||||
|
RUN yum install -y git python3 python3-pip && \
|
||||||
|
pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
|
||||||
|
pip3 install --no-cache-dir -r requirements.txt && \
|
||||||
|
yum clean all && \
|
||||||
|
rm -rf /var/cache/yum
|
||||||
|
|
||||||
|
# You need to provide a tools.sh script as the entrypoint
|
||||||
|
ENTRYPOINT ["/app/tools.sh"]
|
||||||
|
# If there is no tools.sh, you can set the default to start the server
|
||||||
|
# ENTRYPOINT ["/app/llama-server"]
|
||||||
|
|
||||||
|
### Target: light
|
||||||
|
# Lightweight image containing only llama-cli
|
||||||
|
# ==============================================================================
|
||||||
|
FROM base AS light
|
||||||
|
|
||||||
|
COPY --from=build /app/full/llama-cli /app
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/app/llama-cli" ]
|
||||||
|
|
||||||
|
### Target: server
|
||||||
|
# Dedicated server image containing only llama-server
|
||||||
|
# ==============================================================================
|
||||||
|
FROM base AS server
|
||||||
|
|
||||||
|
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||||
|
|
||||||
|
COPY --from=build /app/full/llama-server /app
|
||||||
|
|
||||||
|
HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/app/llama-server" ]
|
||||||
|
|
@ -0,0 +1,45 @@
|
||||||
|
name: Check Pre-Tokenizer Hashes
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
paths:
|
||||||
|
- 'convert_hf_to_gguf.py'
|
||||||
|
- 'convert_hf_to_gguf_update.py'
|
||||||
|
pull_request:
|
||||||
|
paths:
|
||||||
|
- 'convert_hf_to_gguf.py'
|
||||||
|
- 'convert_hf_to_gguf_update.py'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
pre-tokenizer-hashes:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: '3.11'
|
||||||
|
|
||||||
|
- name: Install Python dependencies
|
||||||
|
run: |
|
||||||
|
python3 -m venv .venv
|
||||||
|
.venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
|
||||||
|
|
||||||
|
- name: Update pre-tokenizer hashes
|
||||||
|
run: |
|
||||||
|
cp convert_hf_to_gguf.py /tmp
|
||||||
|
.venv/bin/python convert_hf_to_gguf_update.py --check-missing
|
||||||
|
|
||||||
|
- name: Check if committed pre-tokenizer hashes match generated version
|
||||||
|
run: |
|
||||||
|
if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
|
||||||
|
echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
|
||||||
|
echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
|
||||||
|
echo "Differences found:"
|
||||||
|
diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Model pre-tokenizer hashes are up to date."
|
||||||
|
|
@ -2647,6 +2647,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.n_out_freq = value;
|
params.n_out_freq = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--output-format"}, "{gguf,dat}",
|
||||||
|
string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
|
||||||
|
[](common_params & params, const std::string & value) {
|
||||||
|
/**/ if (value == "gguf") { params.imat_dat = false; }
|
||||||
|
else if (value == "dat") { params.imat_dat = true; }
|
||||||
|
else { throw std::invalid_argument("invalid output format"); }
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--save-frequency"}, "N",
|
{"--save-frequency"}, "N",
|
||||||
string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
|
string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
|
||||||
|
|
|
||||||
|
|
@ -1646,7 +1646,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
|
||||||
"|<function name=\"([^\"]+)\">" // match 5 (function name again)
|
"|<function name=\"([^\"]+)\">" // match 5 (function name again)
|
||||||
);
|
);
|
||||||
|
|
||||||
if (auto res = builder.try_find_regex(open_regex)) {
|
while (auto res = builder.try_find_regex(open_regex)) {
|
||||||
const auto & block_start = res->groups[1];
|
const auto & block_start = res->groups[1];
|
||||||
std::string block_end = block_start.empty() ? "" : "```";
|
std::string block_end = block_start.empty() ? "" : "```";
|
||||||
|
|
||||||
|
|
@ -1668,7 +1668,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
|
||||||
builder.consume_literal(block_end);
|
builder.consume_literal(block_end);
|
||||||
builder.consume_spaces();
|
builder.consume_spaces();
|
||||||
}
|
}
|
||||||
builder.add_content(builder.consume_rest());
|
|
||||||
} else {
|
} else {
|
||||||
throw common_chat_msg_partial_exception("failed to parse tool call");
|
throw common_chat_msg_partial_exception("failed to parse tool call");
|
||||||
}
|
}
|
||||||
|
|
@ -1693,11 +1692,10 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
|
||||||
builder.consume_spaces();
|
builder.consume_spaces();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
builder.add_content(builder.consume_rest());
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
builder.add_content(builder.consume_rest());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
builder.add_content(builder.consume_rest());
|
||||||
}
|
}
|
||||||
|
|
||||||
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||||
|
|
|
||||||
|
|
@ -439,6 +439,7 @@ struct common_params {
|
||||||
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
|
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
|
||||||
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
|
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
|
||||||
int32_t i_chunk = 0; // start processing from this chunk
|
int32_t i_chunk = 0; // start processing from this chunk
|
||||||
|
bool imat_dat = false; // whether the legacy imatrix.dat format should be output
|
||||||
|
|
||||||
bool process_output = false; // collect data for the output tensor
|
bool process_output = false; // collect data for the output tensor
|
||||||
bool compute_ppl = true; // whether to compute perplexity
|
bool compute_ppl = true; // whether to compute perplexity
|
||||||
|
|
|
||||||
|
|
@ -684,6 +684,9 @@ class TextModel(ModelBase):
|
||||||
if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
|
if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
|
||||||
# ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
|
# ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
|
||||||
res = "hunyuan"
|
res = "hunyuan"
|
||||||
|
if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6":
|
||||||
|
# ref: https://huggingface.co/tencent/Hunyuan-4B-Instruct
|
||||||
|
res = "hunyuan-dense"
|
||||||
if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
|
if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
|
||||||
# ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
|
# ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
|
||||||
res = "falcon-h1"
|
res = "falcon-h1"
|
||||||
|
|
@ -699,6 +702,9 @@ class TextModel(ModelBase):
|
||||||
if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
|
if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
|
||||||
# ref: https://huggingface.co/moonshotai/Kimi-K2-Base
|
# ref: https://huggingface.co/moonshotai/Kimi-K2-Base
|
||||||
res = "kimi-k2"
|
res = "kimi-k2"
|
||||||
|
if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
|
||||||
|
# ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
|
||||||
|
res = "qwen2"
|
||||||
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
||||||
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||||
res = "llama-bpe"
|
res = "llama-bpe"
|
||||||
|
|
@ -846,6 +852,9 @@ class TextModel(ModelBase):
|
||||||
if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
|
if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
|
||||||
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
|
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
|
||||||
res = "exaone4"
|
res = "exaone4"
|
||||||
|
if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
|
||||||
|
# ref: https://huggingface.co/JetBrains/Mellum-4b-base
|
||||||
|
res = "mellum"
|
||||||
|
|
||||||
if res is None:
|
if res is None:
|
||||||
logger.warning("\n")
|
logger.warning("\n")
|
||||||
|
|
@ -6053,6 +6062,7 @@ class DeepseekModel(TextModel):
|
||||||
|
|
||||||
@ModelBase.register("DeepseekV2ForCausalLM")
|
@ModelBase.register("DeepseekV2ForCausalLM")
|
||||||
@ModelBase.register("DeepseekV3ForCausalLM")
|
@ModelBase.register("DeepseekV3ForCausalLM")
|
||||||
|
@ModelBase.register("KimiVLForConditionalGeneration")
|
||||||
class DeepseekV2Model(TextModel):
|
class DeepseekV2Model(TextModel):
|
||||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
||||||
|
|
||||||
|
|
@ -6155,6 +6165,13 @@ class DeepseekV2Model(TextModel):
|
||||||
_experts: list[dict[str, Tensor]] | None = None
|
_experts: list[dict[str, Tensor]] | None = None
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
# skip vision tensors and remove "language_model." for Kimi-VL
|
||||||
|
if "vision_tower" in name or "multi_modal_projector" in name:
|
||||||
|
return []
|
||||||
|
|
||||||
|
if name.startswith("language_model."):
|
||||||
|
name = name.replace("language_model.", "")
|
||||||
|
|
||||||
# rename e_score_correction_bias tensors
|
# rename e_score_correction_bias tensors
|
||||||
if name.endswith("e_score_correction_bias"):
|
if name.endswith("e_score_correction_bias"):
|
||||||
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
|
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
|
||||||
|
|
@ -7553,11 +7570,6 @@ class FalconH1Model(Mamba2Model):
|
||||||
class HunYuanMoEModel(TextModel):
|
class HunYuanMoEModel(TextModel):
|
||||||
model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
|
model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
# For handling tied embeddings
|
|
||||||
self._tok_embd = None
|
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||||
|
|
@ -7651,9 +7663,6 @@ class HunYuanMoEModel(TextModel):
|
||||||
_experts: list[dict[str, Tensor]] | None = None
|
_experts: list[dict[str, Tensor]] | None = None
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
if name == "model.embed_tokens.weight":
|
|
||||||
self._tok_embd = data_torch.clone()
|
|
||||||
|
|
||||||
if name == "lm_head.weight":
|
if name == "lm_head.weight":
|
||||||
if self.hparams.get("tie_word_embeddings", False):
|
if self.hparams.get("tie_word_embeddings", False):
|
||||||
logger.info("Skipping tied output layer 'lm_head.weight'")
|
logger.info("Skipping tied output layer 'lm_head.weight'")
|
||||||
|
|
@ -7698,6 +7707,98 @@ class HunYuanMoEModel(TextModel):
|
||||||
raise ValueError(f"Unprocessed experts: {experts}")
|
raise ValueError(f"Unprocessed experts: {experts}")
|
||||||
|
|
||||||
|
|
||||||
|
@ModelBase.register("HunYuanDenseV1ForCausalLM")
class HunYuanModel(TextModel):
    """Converter for checkpoints registered as ``HunYuanDenseV1ForCausalLM``.

    Writes the vocabulary, the RoPE-related GGUF parameters and the tensors for
    the ``HUNYUAN_DENSE`` architecture.
    """
    model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE

    def set_vocab(self):
        """Write the tokenizer data to the GGUF writer.

        When ``tokenizer.json`` exists in the model directory the generic GPT-2
        path is used. Otherwise the vocabulary and merges are rebuilt from the
        ``mergeable_ranks`` / ``special_tokens`` attributes of the tokenizer
        loaded through ``AutoTokenizer`` (with ``trust_remote_code=True``).
        """
        if (self.dir_model / "tokenizer.json").is_file():
            self._set_vocab_gpt2()
        else:
            from transformers import AutoTokenizer
            tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)

            # 1. Get the pre-tokenizer identifier hash
            tokpre = self.get_vocab_base_pre(tokenizer)

            # 2. Reverse-engineer the merges list from mergeable_ranks
            merges = []
            vocab = {}
            mergeable_ranks = tokenizer.mergeable_ranks
            for token, rank in mergeable_ranks.items():
                vocab[QwenModel.token_bytes_to_string(token)] = rank
                # single-byte tokens are base symbols and have no merge rule
                if len(token) == 1:
                    continue
                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
                # only a split into exactly two parts is recorded as a merge
                if len(merged) == 2:
                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))

            # 3. Generate the tokens and toktypes lists
            vocab_size = self.hparams["vocab_size"]
            assert tokenizer.vocab_size == vocab_size
            special_tokens = tokenizer.special_tokens
            # id -> token text; special tokens override regular ones on id clashes
            reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
            tokens: list[str] = []
            toktypes: list[int] = []
            for i in range(vocab_size):
                if i not in reverse_vocab:
                    # ids without a token get a placeholder marked as unused
                    tokens.append(f"[PAD{i}]")
                    toktypes.append(gguf.TokenType.UNUSED)
                else:
                    token = reverse_vocab[i]
                    tokens.append(token)
                    if i in special_tokens.values():
                        toktypes.append(gguf.TokenType.CONTROL)
                    else:
                        toktypes.append(gguf.TokenType.NORMAL)

            # 4. Write all vocab-related fields to the GGUF writer
            self.gguf_writer.add_tokenizer_model("gpt2")
            self.gguf_writer.add_tokenizer_pre(tokpre)
            self.gguf_writer.add_token_list(tokens)
            self.gguf_writer.add_token_types(toktypes)
            self.gguf_writer.add_token_merges(merges)

            # 5. Add special tokens and chat templates
            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
            special_vocab.add_to_gguf(self.gguf_writer)
            # FIX for BOS token: Overwrite incorrect id read from config.json
            if self.hparams['hidden_size'] == 4096:
                self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token

    def set_gguf_parameters(self):
        """Write the base parameters, then the RoPE settings for "dynamic" scaling.

        For ``rope_scaling.type == "dynamic"`` the frequency base is scaled by
        ``alpha ** (dim / (dim - 2))``, the scaling type is written as NONE with
        factor 1, and the context length is forced to 256k.
        """
        super().set_gguf_parameters()
        hparams = self.hparams

        # Rope
        # `or {}` also covers an explicit `"rope_scaling": null` in the config, for
        # which dict.get() returns None rather than the supplied default.
        rope_scaling = hparams.get("rope_scaling") or {}
        if rope_scaling.get("type") == "dynamic":
            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
            # NOTE(review): the comment above mentions alpha=1000 while the fallback below is 50 — confirm which default is intended
            alpha = rope_scaling.get("alpha", 50)
            base = hparams.get("rope_theta", 10000.0)
            dim = hparams["head_dim"]
            scaled_base = base * (alpha ** (dim / (dim - 2)))
            self.gguf_writer.add_rope_freq_base(scaled_base)
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
            self.gguf_writer.add_rope_scaling_factor(1)
            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
            self.gguf_writer.add_context_length(256 * 1024) # 256k context length

            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
            assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one checkpoint tensor to its GGUF name.

        Returns an empty list for ``lm_head.weight`` when ``tie_word_embeddings``
        is set in the hparams; every other tensor is returned unchanged under the
        name produced by ``map_tensor_name``.
        """
        if name == "lm_head.weight":
            if self.hparams.get("tie_word_embeddings", False):
                logger.info("Skipping tied output layer 'lm_head.weight'")
                return []

        return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
|
||||||
@ModelBase.register("SmolLM3ForCausalLM")
|
@ModelBase.register("SmolLM3ForCausalLM")
|
||||||
class SmolLM3Model(LlamaModel):
|
class SmolLM3Model(LlamaModel):
|
||||||
model_arch = gguf.MODEL_ARCH.SMOLLM3
|
model_arch = gguf.MODEL_ARCH.SMOLLM3
|
||||||
|
|
|
||||||
|
|
@ -59,6 +59,10 @@ parser.add_argument(
|
||||||
"--full", action="store_true",
|
"--full", action="store_true",
|
||||||
help="download full list of models - make sure you have access to all of them",
|
help="download full list of models - make sure you have access to all of them",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--check-missing", action="store_true",
|
||||||
|
help="only check for missing pre-tokenizer hashes",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"hf_token",
|
"hf_token",
|
||||||
help="optional HF token",
|
help="optional HF token",
|
||||||
|
|
@ -70,6 +74,10 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
|
||||||
if hf_token is None:
|
if hf_token is None:
|
||||||
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
|
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
|
||||||
|
|
||||||
|
if args.check_missing and args.full:
|
||||||
|
logger.warning("Downloading full list of models requested, ignoring --check-missing!")
|
||||||
|
args.check_missing = False
|
||||||
|
|
||||||
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
||||||
# will be updated with time - contributions welcome
|
# will be updated with time - contributions welcome
|
||||||
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||||
|
|
@ -130,6 +138,7 @@ models = [
|
||||||
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
|
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
|
||||||
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
|
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
|
||||||
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
|
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
|
||||||
|
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
|
||||||
]
|
]
|
||||||
|
|
||||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||||
|
|
@ -140,12 +149,14 @@ pre_computed_hashes = [
|
||||||
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
|
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
|
||||||
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
|
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
|
||||||
{"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
|
{"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
|
||||||
|
{"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
|
||||||
# falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
|
# falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
|
||||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
|
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
|
||||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
|
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
|
||||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
|
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
|
||||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
|
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
|
||||||
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
|
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
|
||||||
|
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -220,12 +231,13 @@ if not args.full:
|
||||||
all_models = models.copy()
|
all_models = models.copy()
|
||||||
models = [model for model in all_models if model["name"] not in existing_models]
|
models = [model for model in all_models if model["name"] not in existing_models]
|
||||||
|
|
||||||
logging.info(f"Downloading {len(models)} models...")
|
if not args.check_missing:
|
||||||
for model in models:
|
logging.info(f"Downloading {len(models)} models...")
|
||||||
try:
|
for model in models:
|
||||||
download_model(model)
|
try:
|
||||||
except Exception as e:
|
download_model(model)
|
||||||
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||||
|
|
||||||
|
|
||||||
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
|
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
|
||||||
|
|
|
||||||
|
|
@ -2016,6 +2016,9 @@ static bool ggml_backend_cann_cpy_tensor_async(
|
||||||
(ggml_backend_cann_context*)backend_dst->context;
|
(ggml_backend_cann_context*)backend_dst->context;
|
||||||
|
|
||||||
size_t copy_size = ggml_nbytes(dst);
|
size_t copy_size = ggml_nbytes(dst);
|
||||||
|
if (copy_size == 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
if (backend_src != backend_dst) {
|
if (backend_src != backend_dst) {
|
||||||
ggml_backend_cann_buffer_context* buf_ctx_src =
|
ggml_backend_cann_buffer_context* buf_ctx_src =
|
||||||
(ggml_backend_cann_buffer_context*)buf_src->context;
|
(ggml_backend_cann_buffer_context*)buf_src->context;
|
||||||
|
|
|
||||||
|
|
@ -37,17 +37,21 @@
|
||||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
|
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
|
||||||
// repack.cpp
|
// repack.cpp
|
||||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||||
// repack.cpp
|
// repack.cpp
|
||||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
|
|
@ -72,11 +76,13 @@
|
||||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
#elif defined(__loongarch64)
|
#elif defined(__loongarch64)
|
||||||
// quants.c
|
// quants.c
|
||||||
|
|
@ -92,11 +98,13 @@
|
||||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
#elif defined(__riscv)
|
#elif defined(__riscv)
|
||||||
// quants.c
|
// quants.c
|
||||||
|
|
@ -119,10 +127,12 @@
|
||||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
#elif defined(__s390x__)
|
#elif defined(__s390x__)
|
||||||
// quants.c
|
// quants.c
|
||||||
|
|
@ -147,11 +157,13 @@
|
||||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
#elif defined(__wasm__)
|
#elif defined(__wasm__)
|
||||||
// quants.c
|
// quants.c
|
||||||
|
|
@ -175,10 +187,12 @@
|
||||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -412,6 +412,82 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
|
const int qk = QK_K;
|
||||||
|
const int nb = n / qk;
|
||||||
|
const int ncols_interleaved = 8;
|
||||||
|
const int blocklen = 8;
|
||||||
|
|
||||||
|
assert (n % qk == 0);
|
||||||
|
assert (nc % ncols_interleaved == 0);
|
||||||
|
|
||||||
|
UNUSED(s);
|
||||||
|
UNUSED(bs);
|
||||||
|
UNUSED(vx);
|
||||||
|
UNUSED(vy);
|
||||||
|
UNUSED(nr);
|
||||||
|
UNUSED(nc);
|
||||||
|
UNUSED(nb);
|
||||||
|
UNUSED(ncols_interleaved);
|
||||||
|
UNUSED(blocklen);
|
||||||
|
|
||||||
|
float sumf[8];
|
||||||
|
float sum_minf[8];
|
||||||
|
int sumi1,sumi2,sumi3,sumi4;
|
||||||
|
int sumi;
|
||||||
|
|
||||||
|
const block_q8_K * a_ptr = (const block_q8_K *)vy;
|
||||||
|
for(int x = 0; x < nc / ncols_interleaved; x++) {
|
||||||
|
const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
|
||||||
|
for (int j = 0; j < ncols_interleaved; j++) {
|
||||||
|
sumf[j] = 0.0;
|
||||||
|
sum_minf[j] = 0.0;
|
||||||
|
}
|
||||||
|
for (int l = 0; l < nb; l++) {
|
||||||
|
for (int k = 0; k < (qk / (4 * blocklen)); k++) {
|
||||||
|
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
||||||
|
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
||||||
|
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
||||||
|
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
||||||
|
for (int j = 0; j < ncols_interleaved; j++) {
|
||||||
|
sumi1 = 0;
|
||||||
|
sumi2 = 0;
|
||||||
|
sumi3 = 0;
|
||||||
|
sumi4 = 0;
|
||||||
|
sumi = 0;
|
||||||
|
int offset = ((k / 2) % 2) + j * 2;
|
||||||
|
for (int i = 0; i < blocklen; ++i){
|
||||||
|
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
|
||||||
|
const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
|
||||||
|
const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
|
||||||
|
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
||||||
|
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
|
||||||
|
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
|
||||||
|
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
|
||||||
|
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
|
||||||
|
|
||||||
|
sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
||||||
|
sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
||||||
|
sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
||||||
|
sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
||||||
|
sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
||||||
|
}
|
||||||
|
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(int sb = 0; sb < 8; sb++) {
|
||||||
|
const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
||||||
|
for(int j = 0; j < ncols_interleaved; j++){
|
||||||
|
sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int j = 0; j < ncols_interleaved; j++) {
|
||||||
|
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
@ -711,6 +787,97 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
|
const int qk = QK_K;
|
||||||
|
const int nb = n / qk;
|
||||||
|
const int ncols_interleaved = 8;
|
||||||
|
const int blocklen = 8;
|
||||||
|
|
||||||
|
assert (n % qk == 0);
|
||||||
|
assert (nr % 4 == 0);
|
||||||
|
assert (nc % ncols_interleaved == 0);
|
||||||
|
|
||||||
|
UNUSED(s);
|
||||||
|
UNUSED(bs);
|
||||||
|
UNUSED(vx);
|
||||||
|
UNUSED(vy);
|
||||||
|
UNUSED(nr);
|
||||||
|
UNUSED(nc);
|
||||||
|
UNUSED(nb);
|
||||||
|
UNUSED(ncols_interleaved);
|
||||||
|
UNUSED(blocklen);
|
||||||
|
|
||||||
|
float sumf[4][8];
|
||||||
|
float sum_minf[4][8];
|
||||||
|
int sumi1, sumi2, sumi3, sumi4;
|
||||||
|
int sumi;
|
||||||
|
|
||||||
|
for (int y = 0; y < nr / 4; y++) {
|
||||||
|
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
||||||
|
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||||
|
const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
|
||||||
|
for (int m = 0; m < 4; m++) {
|
||||||
|
for (int j = 0; j < ncols_interleaved; j++) {
|
||||||
|
sumf[m][j] = 0.0;
|
||||||
|
sum_minf[m][j] = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int l = 0; l < nb; l++) {
|
||||||
|
for (int k = 0; k < (qk / (4 * blocklen)); k++) {
|
||||||
|
|
||||||
|
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
||||||
|
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
||||||
|
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
||||||
|
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
||||||
|
for (int m = 0; m < 4; m++) {
|
||||||
|
for (int j = 0; j < ncols_interleaved; j++) {
|
||||||
|
sumi1 = 0;
|
||||||
|
sumi2 = 0;
|
||||||
|
sumi3 = 0;
|
||||||
|
sumi4 = 0;
|
||||||
|
sumi = 0;
|
||||||
|
int offset = ((k / 2) % 2) + j * 2;
|
||||||
|
for (int i = 0; i < blocklen; ++i){
|
||||||
|
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
|
||||||
|
const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
|
||||||
|
const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
|
||||||
|
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
||||||
|
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
|
||||||
|
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
|
||||||
|
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
|
||||||
|
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
|
||||||
|
sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
||||||
|
sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
||||||
|
sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
||||||
|
sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
||||||
|
sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
||||||
|
}
|
||||||
|
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(int sb = 0; sb < 8; sb++) {
|
||||||
|
const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
||||||
|
for(int m = 0; m < 4; m++) {
|
||||||
|
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
||||||
|
for(int j = 0; j < ncols_interleaved; j++) {
|
||||||
|
int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
|
||||||
|
sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int m = 0; m < 4; m++) {
|
||||||
|
for (int j = 0; j < ncols_interleaved; j++) {
|
||||||
|
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
@ -914,6 +1081,50 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
|
||||||
|
block_q2_Kx8 out;
|
||||||
|
|
||||||
|
// Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
|
||||||
|
for (int i = 0; i < 8; i++) {
|
||||||
|
out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < 8; i++) {
|
||||||
|
out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int end = QK_K * 2 / blck_size_interleave;
|
||||||
|
|
||||||
|
// Interleave Q2_K quants by taking 8 bytes at a time
|
||||||
|
for (int i = 0; i < end; ++i) {
|
||||||
|
int src_id = i % 8;
|
||||||
|
int src_offset = (i / 8) * blck_size_interleave;
|
||||||
|
int dst_offset = i * blck_size_interleave;
|
||||||
|
|
||||||
|
uint64_t elems;
|
||||||
|
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
||||||
|
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
||||||
|
}
|
||||||
|
|
||||||
|
// The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
|
||||||
|
// Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
|
||||||
|
// The output Q2_Kx8 structure has 128 bytes for storing scales and mins
|
||||||
|
// Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
|
||||||
|
// For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
|
||||||
|
|
||||||
|
for(int i = 0; i < 128; i++){
|
||||||
|
|
||||||
|
// Index for selecting which q2k super block
|
||||||
|
int src1 = (i % 16) / 2;
|
||||||
|
// Index for selecting scale
|
||||||
|
int src2 = ((i / 16) * 2) + (i % 2);
|
||||||
|
|
||||||
|
out.scales[i] = in[src1].scales[src2];
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
||||||
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
||||||
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
||||||
|
|
@ -975,6 +1186,37 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
|
||||||
GGML_UNUSED(data_size);
|
GGML_UNUSED(data_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
||||||
|
GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
|
||||||
|
GGML_ASSERT(interleave_block == 8);
|
||||||
|
constexpr int nrows_interleaved = 8;
|
||||||
|
|
||||||
|
block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
|
||||||
|
const block_q2_K * src = (const block_q2_K*) data;
|
||||||
|
block_q2_K dst_tmp[8];
|
||||||
|
int nrow = ggml_nrows(t);
|
||||||
|
int nblocks = t->ne[0] / QK_K;
|
||||||
|
|
||||||
|
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
|
||||||
|
|
||||||
|
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
||||||
|
for (int64_t x = 0; x < nblocks; x++) {
|
||||||
|
for (int i = 0; i < nrows_interleaved; i++ ) {
|
||||||
|
dst_tmp[i] = src[x + i * nblocks];
|
||||||
|
}
|
||||||
|
*dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
|
||||||
|
}
|
||||||
|
src += nrows_interleaved * nblocks;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
GGML_UNUSED(data_size);
|
||||||
|
}
|
||||||
|
|
||||||
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
||||||
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
||||||
GGML_ASSERT(interleave_block == 8);
|
GGML_ASSERT(interleave_block == 8);
|
||||||
|
|
@ -1095,6 +1337,10 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
|
||||||
return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
|
return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||||
|
return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
|
||||||
|
}
|
||||||
|
|
||||||
template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||||
return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
|
return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
|
||||||
}
|
}
|
||||||
|
|
@ -1124,6 +1370,10 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
|
||||||
ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||||
|
ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||||
|
}
|
||||||
|
|
||||||
template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||||
ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||||
}
|
}
|
||||||
|
|
@ -1148,6 +1398,10 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
|
||||||
ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||||
|
ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||||
|
}
|
||||||
|
|
||||||
template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||||
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||||
}
|
}
|
||||||
|
|
@ -1421,6 +1675,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
||||||
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
|
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
|
||||||
static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
|
static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
|
||||||
|
|
||||||
|
// instance for Q2
|
||||||
|
static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
|
||||||
|
|
||||||
// instance for IQ4
|
// instance for IQ4
|
||||||
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
|
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
|
||||||
|
|
||||||
|
|
@ -1446,6 +1703,12 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
|
||||||
return &q4_K_8x8_q8_K;
|
return &q4_K_8x8_q8_K;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else if (cur->type == GGML_TYPE_Q2_K) {
|
||||||
|
if (ggml_cpu_has_avx512()) {
|
||||||
|
if (cur->ne[1] % 8 == 0) {
|
||||||
|
return &q2_K_8x8_q8_K;
|
||||||
|
}
|
||||||
|
}
|
||||||
} else if (cur->type == GGML_TYPE_IQ4_NL) {
|
} else if (cur->type == GGML_TYPE_IQ4_NL) {
|
||||||
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
||||||
if (cur->ne[1] % 4 == 0) {
|
if (cur->ne[1] % 4 == 0) {
|
||||||
|
|
|
||||||
|
|
@ -44,7 +44,14 @@ struct block_q4_Kx8 {
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
|
static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
|
||||||
|
struct block_q2_Kx8 {
|
||||||
|
ggml_half d[8]; // super-block scale for quantized scales
|
||||||
|
ggml_half dmin[8]; // super-block scale for quantized mins
|
||||||
|
uint8_t scales[128]; // scales and mins, quantized with 4 bits
|
||||||
|
uint8_t qs[512]; // 2--bit quants
|
||||||
|
};
|
||||||
|
|
||||||
|
static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
|
||||||
struct block_q8_Kx4 {
|
struct block_q8_Kx4 {
|
||||||
float d[4]; // delta
|
float d[4]; // delta
|
||||||
int8_t qs[QK_K * 4]; // quants
|
int8_t qs[QK_K * 4]; // quants
|
||||||
|
|
@ -71,11 +78,13 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||||
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
|
||||||
// Native implementations
|
// Native implementations
|
||||||
|
|
@ -86,11 +95,13 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
|
||||||
#if defined(__cplusplus)
|
#if defined(__cplusplus)
|
||||||
|
|
|
||||||
|
|
@ -315,8 +315,9 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||||
|
|
||||||
const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
|
const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
|
||||||
const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
|
const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
|
||||||
const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies &&
|
const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (Q->ne[2] > 4*K->ne[2] && K->ne[1] >= 8192);
|
||||||
(Q->ne[3] > 1 || cc < GGML_CUDA_CC_ADA_LOVELACE) && !mma_needs_data_conversion;
|
const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && !mma_needs_data_conversion &&
|
||||||
|
(cc < GGML_CUDA_CC_ADA_LOVELACE || mma_faster_for_rtx4000);
|
||||||
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
|
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
|
||||||
if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
|
if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
|
||||||
if (prec == GGML_PREC_DEFAULT) {
|
if (prec == GGML_PREC_DEFAULT) {
|
||||||
|
|
|
||||||
|
|
@ -1852,6 +1852,9 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
|
||||||
ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool());
|
ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool());
|
||||||
ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool());
|
ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool());
|
||||||
|
|
||||||
|
bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
|
||||||
|
bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
|
||||||
|
|
||||||
// Handle src0
|
// Handle src0
|
||||||
src0_ptr = (const cuda_t *) src0->data;
|
src0_ptr = (const cuda_t *) src0->data;
|
||||||
|
|
||||||
|
|
@ -1870,6 +1873,8 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
|
||||||
s11 = ne10;
|
s11 = ne10;
|
||||||
s12 = ne11*s11;
|
s12 = ne11*s11;
|
||||||
s13 = ne12*s12;
|
s13 = ne12*s12;
|
||||||
|
|
||||||
|
is_src1_cont_2 = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Setup destination buffer
|
// Setup destination buffer
|
||||||
|
|
@ -1918,15 +1923,19 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
|
||||||
const int64_t r2 = ne12/ne02;
|
const int64_t r2 = ne12/ne02;
|
||||||
const int64_t r3 = ne13/ne03;
|
const int64_t r3 = ne13/ne03;
|
||||||
|
|
||||||
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
|
if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
|
||||||
|
// with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
|
||||||
|
const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
|
||||||
|
const int64_t smb = ne12 == 1 ? s13 : s12;
|
||||||
|
|
||||||
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
||||||
// use cublasGemmStridedBatchedEx
|
// use cublasGemmStridedBatchedEx
|
||||||
CUBLAS_CHECK(
|
CUBLAS_CHECK(
|
||||||
cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
|
cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
|
||||||
ne01, ne11, ne10,
|
ne01, ne11, ne10,
|
||||||
alpha, src0_ptr, cu_data_type_a, nb01/nb00, nb02/nb00, // strideA
|
alpha, src0_ptr, cu_data_type_a, nb01/nb00, sma, // strideA
|
||||||
src1_ptr, cu_data_type_b, s11, s12, // strideB
|
src1_ptr, cu_data_type_b, s11, smb, // strideB
|
||||||
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
|
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
|
||||||
ne12*ne13,
|
ne12*ne13,
|
||||||
cu_compute_type,
|
cu_compute_type,
|
||||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||||
|
|
|
||||||
|
|
@ -1,65 +1,75 @@
|
||||||
#include "im2col.cuh"
|
#include "im2col.cuh"
|
||||||
|
|
||||||
|
// Smaller of the two arguments. The whole replacement list is parenthesized so the
// macro keeps its meaning when embedded in a larger expression (e.g. `x - MIN(a, b)`).
// Note: each argument may be evaluated twice, so avoid arguments with side effects.
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
|
|
||||||
|
// Upper bound applied to the z component of the launch grid in im2col_cuda.
// When N*OH exceeds this value, im2col_kernel covers the remainder by advancing
// its z index in steps of MAX_GRIDDIM_Z.
#define MAX_GRIDDIM_Z 65535
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
static __global__ void im2col_kernel(
|
static __global__ void im2col_kernel(
|
||||||
const float * x, T * dst, int64_t batch_offset,
|
const float * x, T * dst,
|
||||||
int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
|
int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH,
|
||||||
|
int64_t IC_IH_IW, int64_t IH_IW, int64_t N_OH, int64_t KH_KW, int64_t IC_KH_KW,
|
||||||
int s0, int s1, int p0, int p1, int d0, int d1) {
|
int s0, int s1, int p0, int p1, int d0, int d1) {
|
||||||
const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
|
const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||||
if (i >= pelements) {
|
if (i >= IC_KH_KW) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t ksize = OW * KH;
|
const int64_t iic = i / (KH_KW);
|
||||||
const int64_t kx = i / ksize;
|
const int64_t rem = i - iic * KH_KW;
|
||||||
const int64_t kd = kx * ksize;
|
const int64_t ikh = rem / KW;
|
||||||
const int64_t ky = (i - kd) / OW;
|
const int64_t ikw = rem - ikh * KW;
|
||||||
const int64_t ix = i % OW;
|
|
||||||
|
|
||||||
const int64_t oh = blockIdx.y;
|
const int64_t iow = blockIdx.y;
|
||||||
const int64_t batch = blockIdx.z / IC;
|
for (int64_t iz = blockIdx.z; iz < N_OH; iz+=MAX_GRIDDIM_Z) {
|
||||||
const int64_t ic = blockIdx.z % IC;
|
const int64_t in = iz / OH;
|
||||||
|
const int64_t ioh = iz - in * OH;
|
||||||
|
|
||||||
const int64_t iiw = ix * s0 + kx * d0 - p0;
|
const int64_t iiw = iow * s0 + ikw * d0 - p0;
|
||||||
const int64_t iih = oh * s1 + ky * d1 - p1;
|
const int64_t iih = ioh * s1 + ikh * d1 - p1;
|
||||||
|
|
||||||
const int64_t offset_dst =
|
const int64_t offset_dst =
|
||||||
((batch * OH + oh) * OW + ix) * CHW +
|
((in * OH + ioh) * OW + iow) * IC_KH_KW + iic * KH_KW + ikh * KW + ikw;
|
||||||
(ic * (KW * KH) + ky * KW + kx);
|
|
||||||
|
|
||||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||||
dst[offset_dst] = 0.0f;
|
dst[offset_dst] = 0.0f;
|
||||||
} else {
|
} else {
|
||||||
const int64_t offset_src = ic * offset_delta + batch * batch_offset;
|
const int64_t offset_src = iic * IC_IH_IW + in * IH_IW;
|
||||||
dst[offset_dst] = x[offset_src + iih * IW + iiw];
|
dst[offset_dst] = x[offset_src + iih * IW + iiw];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
|
||||||
template <typename T>
|
template <typename T>
|
||||||
static void im2col_cuda(const float * x, T* dst,
|
static void im2col_cuda(const float * x, T* dst,
|
||||||
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
|
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
|
||||||
int64_t batch, int64_t batch_offset, int64_t offset_delta,
|
int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
|
||||||
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
|
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
|
||||||
const int parallel_elements = OW * KW * KH;
|
const int64_t IC_KH_KW = IC * KH * KW;
|
||||||
const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
|
const int64_t num_blocks = (IC_KH_KW + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
|
||||||
dim3 block_nums(num_blocks, OH, batch * IC);
|
const int64_t N_OH = N * OH;
|
||||||
im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
|
const int64_t KH_KW = KW*KH;
|
||||||
|
dim3 block_nums(num_blocks, OW, MIN(N_OH, MAX_GRIDDIM_Z));
|
||||||
|
im2col_kernel<<<block_nums, MIN(IC_KH_KW, CUDA_IM2COL_BLOCK_SIZE) , 0, stream>>>(x, dst, IC, IW, IH, OH, OW, KW, KH,
|
||||||
|
IC_IH_IW, IH_IW, N_OH, KH_KW, IC_KH_KW,
|
||||||
|
s0, s1, p0, p1, d0, d1);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void im2col_cuda_f16(const float * x, half * dst,
|
static void im2col_cuda_f16(const float * x, half * dst,
|
||||||
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
|
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
|
||||||
int64_t batch, int64_t batch_offset, int64_t offset_delta,
|
int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
|
||||||
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
|
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
|
||||||
|
|
||||||
im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
|
im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void im2col_cuda_f32(const float * x, float * dst,
|
static void im2col_cuda_f32(const float * x, float * dst,
|
||||||
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
|
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
|
||||||
int64_t batch, int64_t batch_offset, int64_t offset_delta,
|
int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
|
||||||
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
|
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
|
||||||
|
|
||||||
im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
|
im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
|
@ -91,13 +101,13 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
const int64_t OH = is_2D ? dst->ne[2] : 1;
|
const int64_t OH = is_2D ? dst->ne[2] : 1;
|
||||||
const int64_t OW = dst->ne[1];
|
const int64_t OW = dst->ne[1];
|
||||||
|
|
||||||
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
const int64_t IC_IH_IW = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
||||||
const int64_t batch = src1->ne[is_2D ? 3 : 2];
|
const int64_t N = src1->ne[is_2D ? 3 : 2];
|
||||||
const size_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
|
const int64_t IH_IW = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
|
||||||
|
|
||||||
if(dst->type == GGML_TYPE_F16) {
|
if(dst->type == GGML_TYPE_F16) {
|
||||||
im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
|
im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
|
||||||
} else {
|
} else {
|
||||||
im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
|
im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -251,25 +251,21 @@ static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/)
|
||||||
#endif // AMD_MFMA_AVAILABLE
|
#endif // AMD_MFMA_AVAILABLE
|
||||||
|
|
||||||
#if defined(GGML_USE_HIP)
|
#if defined(GGML_USE_HIP)
|
||||||
static int mmq_get_nwarps_host(const int cc) {
|
static int mmq_get_nwarps_host(const int cc, const int warp_size) {
|
||||||
return amd_mfma_available(cc) ? 8 : 4;
|
return amd_mfma_available(cc) ? 8 : 256/warp_size;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
static int mmq_get_nwarps_host(const int /*cc*/) {
|
static int mmq_get_nwarps_host(const int /*cc*/, const int warp_size) {
|
||||||
return 8;
|
return 256/warp_size;
|
||||||
}
|
}
|
||||||
#endif // (GGML_USE_HIP)
|
#endif // (GGML_USE_HIP)
|
||||||
|
|
||||||
static constexpr __device__ int mmq_get_nwarps_device() {
|
static constexpr __device__ int mmq_get_nwarps_device() {
|
||||||
#if defined(GGML_USE_HIP)
|
|
||||||
#if defined(AMD_MFMA_AVAILABLE)
|
#if defined(AMD_MFMA_AVAILABLE)
|
||||||
return 8;
|
return 8;
|
||||||
#else
|
#else
|
||||||
return 4;
|
return 256/ggml_cuda_get_physical_warp_size();
|
||||||
#endif // AMD_MFMA_AVAILABLE
|
#endif // AMD_MFMA_AVAILABLE
|
||||||
#else
|
|
||||||
return 8;
|
|
||||||
#endif // defined(GGML_USE_HIP)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ------------------------------------------------------------
|
// ------------------------------------------------------------
|
||||||
|
|
@ -3472,7 +3468,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||||
const int cc = ggml_cuda_info().devices[id].cc;
|
const int cc = ggml_cuda_info().devices[id].cc;
|
||||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
const int nsm = ggml_cuda_info().devices[id].nsm;
|
||||||
const int warp_size = ggml_cuda_info().devices[id].warp_size;
|
const int warp_size = ggml_cuda_info().devices[id].warp_size;
|
||||||
const int nwarps = mmq_get_nwarps_host(cc);
|
const int nwarps = mmq_get_nwarps_host(cc, warp_size);
|
||||||
const int mmq_y = get_mmq_y_host(cc);
|
const int mmq_y = get_mmq_y_host(cc);
|
||||||
|
|
||||||
const dim3 block_dims(warp_size, nwarps, 1);
|
const dim3 block_dims(warp_size, nwarps, 1);
|
||||||
|
|
@ -3559,7 +3555,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
|
||||||
const int cc = ggml_cuda_info().devices[id].cc;
|
const int cc = ggml_cuda_info().devices[id].cc;
|
||||||
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
|
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||||
const int warp_size = ggml_cuda_info().devices[id].warp_size;
|
const int warp_size = ggml_cuda_info().devices[id].warp_size;
|
||||||
const int nwarps = mmq_get_nwarps_host(cc);
|
const int nwarps = mmq_get_nwarps_host(cc, warp_size);
|
||||||
|
|
||||||
const int mmq_x_max = get_mmq_x_max_host(cc);
|
const int mmq_x_max = get_mmq_x_max_host(cc);
|
||||||
const int mmq_y = get_mmq_y_host(cc);
|
const int mmq_y = get_mmq_y_host(cc);
|
||||||
|
|
|
||||||
|
|
@ -400,10 +400,10 @@ struct ggml_backend_opencl_context {
|
||||||
cl_program program_mul_mm_f32_f32_l4_lm;
|
cl_program program_mul_mm_f32_f32_l4_lm;
|
||||||
cl_program program_mul_mm_f16_f32_l4_lm;
|
cl_program program_mul_mm_f16_f32_l4_lm;
|
||||||
|
|
||||||
cl_kernel kernel_add, kernel_add_row;
|
cl_kernel kernel_add, kernel_add_row, kernel_add_f16, kernel_add_row_f16;
|
||||||
cl_kernel kernel_mul, kernel_mul_row;
|
cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16;
|
||||||
cl_kernel kernel_div, kernel_div_row;
|
cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
|
||||||
cl_kernel kernel_sub, kernel_sub_row;
|
cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
|
||||||
cl_kernel kernel_scale;
|
cl_kernel kernel_scale;
|
||||||
cl_kernel kernel_silu, kernel_silu_4;
|
cl_kernel kernel_silu, kernel_silu_4;
|
||||||
cl_kernel kernel_gelu, kernel_gelu_4;
|
cl_kernel kernel_gelu, kernel_gelu_4;
|
||||||
|
|
@ -674,8 +674,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||||
backend_ctx->program_add =
|
backend_ctx->program_add =
|
||||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||||
|
|
||||||
CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
|
CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
|
||||||
CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
|
CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
|
||||||
|
CL_CHECK((backend_ctx->kernel_add_f16 = clCreateKernel(backend_ctx->program_add, "kernel_add_f16", &err), err));
|
||||||
|
CL_CHECK((backend_ctx->kernel_add_row_f16 = clCreateKernel(backend_ctx->program_add, "kernel_add_row_f16", &err), err));
|
||||||
GGML_LOG_CONT(".");
|
GGML_LOG_CONT(".");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1089,8 +1091,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||||
backend_ctx->program_mul =
|
backend_ctx->program_mul =
|
||||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||||
|
|
||||||
CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
|
CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
|
||||||
CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
|
CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
|
||||||
|
CL_CHECK((backend_ctx->kernel_mul_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_f16", &err), err));
|
||||||
|
CL_CHECK((backend_ctx->kernel_mul_row_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row_f16", &err), err));
|
||||||
GGML_LOG_CONT(".");
|
GGML_LOG_CONT(".");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1288,11 +1292,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||||
#else
|
#else
|
||||||
const std::string kernel_src = read_file("div.cl");
|
const std::string kernel_src = read_file("div.cl");
|
||||||
#endif
|
#endif
|
||||||
|
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||||
|
" -cl-mad-enable -cl-finite-math-only ";
|
||||||
|
|
||||||
backend_ctx->program_div =
|
backend_ctx->program_div =
|
||||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||||
|
|
||||||
CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
|
CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
|
||||||
CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
|
CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
|
||||||
|
CL_CHECK((backend_ctx->kernel_div_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_f16", &err), err));
|
||||||
|
CL_CHECK((backend_ctx->kernel_div_row_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_row_f16", &err), err));
|
||||||
GGML_LOG_CONT(".");
|
GGML_LOG_CONT(".");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1308,8 +1317,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||||
backend_ctx->program_sub =
|
backend_ctx->program_sub =
|
||||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||||
|
|
||||||
CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
|
CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
|
||||||
CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
|
CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
|
||||||
|
CL_CHECK((backend_ctx->kernel_sub_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_f16", &err), err));
|
||||||
|
CL_CHECK((backend_ctx->kernel_sub_row_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row_f16", &err), err));
|
||||||
GGML_LOG_CONT(".");
|
GGML_LOG_CONT(".");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2035,8 +2046,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||||
|
|
||||||
backend_ctx->adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
|
backend_ctx->adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
|
||||||
backend_ctx->has_vector_subgroup_broadcast =
|
backend_ctx->has_vector_subgroup_broadcast =
|
||||||
backend_ctx->adreno_cl_compiler_version.major >= 47 ||
|
(backend_ctx->adreno_cl_compiler_version.type == E031 && backend_ctx->adreno_cl_compiler_version.major >= 47) ||
|
||||||
backend_ctx->adreno_cl_compiler_version.major == 17;
|
(backend_ctx->adreno_cl_compiler_version.type == DX && backend_ctx->adreno_cl_compiler_version.major >= 17);
|
||||||
GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
|
GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
|
||||||
backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");
|
backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");
|
||||||
|
|
||||||
|
|
@ -2447,12 +2458,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
case GGML_OP_ADD:
|
|
||||||
case GGML_OP_SCALE:
|
case GGML_OP_SCALE:
|
||||||
|
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
|
||||||
|
case GGML_OP_ADD:
|
||||||
case GGML_OP_MUL:
|
case GGML_OP_MUL:
|
||||||
case GGML_OP_DIV:
|
case GGML_OP_DIV:
|
||||||
case GGML_OP_SUB:
|
case GGML_OP_SUB:
|
||||||
return op->src[0]->type == GGML_TYPE_F32;
|
return (op->src[0]->type == op->src[1]->type) &&
|
||||||
|
(op->src[0]->type == op->type) &&
|
||||||
|
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(op)) {
|
switch (ggml_get_unary_op(op)) {
|
||||||
case GGML_UNARY_OP_GELU:
|
case GGML_UNARY_OP_GELU:
|
||||||
|
|
@ -3680,35 +3694,39 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||||
GGML_ASSERT(dst);
|
GGML_ASSERT(dst);
|
||||||
GGML_ASSERT(dst->extra);
|
GGML_ASSERT(dst->extra);
|
||||||
|
|
||||||
const int ne00 = src0 ? src0->ne[0] : 0;
|
GGML_ASSERT(src0->type == src1->type);
|
||||||
const int ne01 = src0 ? src0->ne[1] : 0;
|
GGML_ASSERT(src0->type == dst->type);
|
||||||
const int ne02 = src0 ? src0->ne[2] : 0;
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||||
const int ne03 = src0 ? src0->ne[3] : 0;
|
|
||||||
|
|
||||||
const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
|
const int ne00 = src0->ne[0];
|
||||||
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
const int ne01 = src0->ne[1];
|
||||||
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
const int ne02 = src0->ne[2];
|
||||||
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
const int ne03 = src0->ne[3];
|
||||||
|
|
||||||
const int ne10 = src1 ? src1->ne[0] : 0;
|
const cl_ulong nb00 = src0->nb[0];
|
||||||
const int ne11 = src1 ? src1->ne[1] : 0;
|
const cl_ulong nb01 = src0->nb[1];
|
||||||
const int ne12 = src1 ? src1->ne[2] : 0;
|
const cl_ulong nb02 = src0->nb[2];
|
||||||
const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
|
const cl_ulong nb03 = src0->nb[3];
|
||||||
|
|
||||||
const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
|
const int ne10 = src1->ne[0];
|
||||||
const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
|
const int ne11 = src1->ne[1];
|
||||||
const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
|
const int ne12 = src1->ne[2];
|
||||||
const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
|
const int ne13 = src1->ne[3]; UNUSED(ne13);
|
||||||
|
|
||||||
const int ne0 = dst ? dst->ne[0] : 0;
|
const cl_ulong nb10 = src1->nb[0];
|
||||||
const int ne1 = dst ? dst->ne[1] : 0;
|
const cl_ulong nb11 = src1->nb[1];
|
||||||
const int ne2 = dst ? dst->ne[2] : 0;
|
const cl_ulong nb12 = src1->nb[2];
|
||||||
const int ne3 = dst ? dst->ne[3] : 0;
|
const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
|
||||||
|
|
||||||
const cl_ulong nb0 = dst ? dst->nb[0] : 0;
|
const int ne0 = dst->ne[0];
|
||||||
const cl_ulong nb1 = dst ? dst->nb[1] : 0;
|
const int ne1 = dst->ne[1];
|
||||||
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
|
const int ne2 = dst->ne[2];
|
||||||
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
const int ne3 = dst->ne[3];
|
||||||
|
|
||||||
|
const cl_ulong nb0 = dst->nb[0];
|
||||||
|
const cl_ulong nb1 = dst->nb[1];
|
||||||
|
const cl_ulong nb2 = dst->nb[2];
|
||||||
|
const cl_ulong nb3 = dst->nb[3];
|
||||||
|
|
||||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||||
|
|
||||||
|
|
@ -3731,7 +3749,12 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||||
|
|
||||||
bcast_row = true;
|
bcast_row = true;
|
||||||
int ne = ne00 / 4;
|
int ne = ne00 / 4;
|
||||||
kernel = backend_ctx->kernel_add_row;
|
|
||||||
|
if (src0->type == GGML_TYPE_F32) {
|
||||||
|
kernel = backend_ctx->kernel_add_row;
|
||||||
|
} else {
|
||||||
|
kernel = backend_ctx->kernel_add_row_f16;
|
||||||
|
}
|
||||||
|
|
||||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||||
|
|
@ -3741,7 +3764,11 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||||
} else {
|
} else {
|
||||||
kernel = backend_ctx->kernel_add;
|
if (src0->type == GGML_TYPE_F32) {
|
||||||
|
kernel = backend_ctx->kernel_add;
|
||||||
|
} else {
|
||||||
|
kernel = backend_ctx->kernel_add_f16;
|
||||||
|
}
|
||||||
|
|
||||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||||
|
|
@ -3803,35 +3830,39 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||||
GGML_ASSERT(dst);
|
GGML_ASSERT(dst);
|
||||||
GGML_ASSERT(dst->extra);
|
GGML_ASSERT(dst->extra);
|
||||||
|
|
||||||
const int ne00 = src0 ? src0->ne[0] : 0;
|
GGML_ASSERT(src0->type == src1->type);
|
||||||
const int ne01 = src0 ? src0->ne[1] : 0;
|
GGML_ASSERT(src0->type == dst->type);
|
||||||
const int ne02 = src0 ? src0->ne[2] : 0;
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||||
const int ne03 = src0 ? src0->ne[3] : 0;
|
|
||||||
|
|
||||||
const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
|
const int ne00 = src0->ne[0];
|
||||||
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
const int ne01 = src0->ne[1];
|
||||||
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
const int ne02 = src0->ne[2];
|
||||||
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
const int ne03 = src0->ne[3];
|
||||||
|
|
||||||
const int ne10 = src1 ? src1->ne[0] : 0;
|
const cl_ulong nb00 = src0->nb[0];
|
||||||
const int ne11 = src1 ? src1->ne[1] : 0;
|
const cl_ulong nb01 = src0->nb[1];
|
||||||
const int ne12 = src1 ? src1->ne[2] : 0;
|
const cl_ulong nb02 = src0->nb[2];
|
||||||
const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
|
const cl_ulong nb03 = src0->nb[3];
|
||||||
|
|
||||||
const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
|
const int ne10 = src1->ne[0];
|
||||||
const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
|
const int ne11 = src1->ne[1];
|
||||||
const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
|
const int ne12 = src1->ne[2];
|
||||||
const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
|
const int ne13 = src1->ne[3]; UNUSED(ne13);
|
||||||
|
|
||||||
const int ne0 = dst ? dst->ne[0] : 0;
|
const cl_ulong nb10 = src1->nb[0];
|
||||||
const int ne1 = dst ? dst->ne[1] : 0;
|
const cl_ulong nb11 = src1->nb[1];
|
||||||
const int ne2 = dst ? dst->ne[2] : 0;
|
const cl_ulong nb12 = src1->nb[2];
|
||||||
const int ne3 = dst ? dst->ne[3] : 0;
|
const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
|
||||||
|
|
||||||
const cl_ulong nb0 = dst ? dst->nb[0] : 0;
|
const int ne0 = dst->ne[0];
|
||||||
const cl_ulong nb1 = dst ? dst->nb[1] : 0;
|
const int ne1 = dst->ne[1];
|
||||||
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
|
const int ne2 = dst->ne[2];
|
||||||
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
const int ne3 = dst->ne[3];
|
||||||
|
|
||||||
|
const cl_ulong nb0 = dst->nb[0];
|
||||||
|
const cl_ulong nb1 = dst->nb[1];
|
||||||
|
const cl_ulong nb2 = dst->nb[2];
|
||||||
|
const cl_ulong nb3 = dst->nb[3];
|
||||||
|
|
||||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||||
|
|
||||||
|
|
@ -3854,7 +3885,12 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||||
|
|
||||||
bcast_row = true;
|
bcast_row = true;
|
||||||
int ne = ne00 / 4;
|
int ne = ne00 / 4;
|
||||||
kernel = backend_ctx->kernel_mul_row;
|
|
||||||
|
if (src0->type == GGML_TYPE_F32) {
|
||||||
|
kernel = backend_ctx->kernel_mul_row;
|
||||||
|
} else {
|
||||||
|
kernel = backend_ctx->kernel_mul_row_f16;
|
||||||
|
}
|
||||||
|
|
||||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||||
|
|
@ -3864,7 +3900,11 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||||
} else {
|
} else {
|
||||||
kernel = backend_ctx->kernel_mul;
|
if (src0->type == GGML_TYPE_F32) {
|
||||||
|
kernel = backend_ctx->kernel_mul;
|
||||||
|
} else {
|
||||||
|
kernel = backend_ctx->kernel_mul_f16;
|
||||||
|
}
|
||||||
|
|
||||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||||
|
|
@ -3926,6 +3966,10 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||||
GGML_ASSERT(dst);
|
GGML_ASSERT(dst);
|
||||||
GGML_ASSERT(dst->extra);
|
GGML_ASSERT(dst->extra);
|
||||||
|
|
||||||
|
GGML_ASSERT(src0->type == src1->type);
|
||||||
|
GGML_ASSERT(src0->type == dst->type);
|
||||||
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||||
|
|
||||||
const int ne00 = src0->ne[0];
|
const int ne00 = src0->ne[0];
|
||||||
const int ne01 = src0->ne[1];
|
const int ne01 = src0->ne[1];
|
||||||
const int ne02 = src0->ne[2];
|
const int ne02 = src0->ne[2];
|
||||||
|
|
@ -3974,7 +4018,12 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||||
|
|
||||||
bcast_row = true;
|
bcast_row = true;
|
||||||
int ne = ne00 / 4;
|
int ne = ne00 / 4;
|
||||||
kernel = backend_ctx->kernel_div_row;
|
|
||||||
|
if (src0->type == GGML_TYPE_F32) {
|
||||||
|
kernel = backend_ctx->kernel_div_row;
|
||||||
|
} else {
|
||||||
|
kernel = backend_ctx->kernel_div_row_f16;
|
||||||
|
}
|
||||||
|
|
||||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||||
|
|
@ -3984,7 +4033,11 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||||
} else {
|
} else {
|
||||||
kernel = backend_ctx->kernel_div;
|
if (src0->type == GGML_TYPE_F32) {
|
||||||
|
kernel = backend_ctx->kernel_div;
|
||||||
|
} else {
|
||||||
|
kernel = backend_ctx->kernel_div_f16;
|
||||||
|
}
|
||||||
|
|
||||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||||
|
|
@ -4034,6 +4087,10 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||||
GGML_ASSERT(dst);
|
GGML_ASSERT(dst);
|
||||||
GGML_ASSERT(dst->extra);
|
GGML_ASSERT(dst->extra);
|
||||||
|
|
||||||
|
GGML_ASSERT(src0->type == src1->type);
|
||||||
|
GGML_ASSERT(src0->type == dst->type);
|
||||||
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||||
|
|
||||||
const int ne00 = src0->ne[0];
|
const int ne00 = src0->ne[0];
|
||||||
const int ne01 = src0->ne[1];
|
const int ne01 = src0->ne[1];
|
||||||
const int ne02 = src0->ne[2];
|
const int ne02 = src0->ne[2];
|
||||||
|
|
@ -4082,7 +4139,12 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||||
|
|
||||||
bcast_row = true;
|
bcast_row = true;
|
||||||
int ne = ne00 / 4;
|
int ne = ne00 / 4;
|
||||||
kernel = backend_ctx->kernel_sub_row;
|
|
||||||
|
if (src0->type == GGML_TYPE_F32) {
|
||||||
|
kernel = backend_ctx->kernel_sub_row;
|
||||||
|
} else {
|
||||||
|
kernel = backend_ctx->kernel_sub_row_f16;
|
||||||
|
}
|
||||||
|
|
||||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||||
|
|
@ -4092,7 +4154,11 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||||
} else {
|
} else {
|
||||||
kernel = backend_ctx->kernel_sub;
|
if (src0->type == GGML_TYPE_F32) {
|
||||||
|
kernel = backend_ctx->kernel_sub;
|
||||||
|
} else {
|
||||||
|
kernel = backend_ctx->kernel_sub_f16;
|
||||||
|
}
|
||||||
|
|
||||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||||
|
|
|
||||||
|
|
@ -81,3 +81,76 @@ kernel void kernel_add_row(
|
||||||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||||
dst[gid] = src0[gid] + src1[idx1];
|
dst[gid] = src0[gid] + src1[idx1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Element-wise half-precision addition: dst = src0 + src1.
//
// All three tensors are addressed through byte strides (nb*), and each
// buffer pointer is first advanced by its byte offset (offset0/offset1/
// offsetd). The work-group ids select the row (dim 1), plane (dim 2) and
// batch (dim 3) of src0/dst; the work-items of the group then stride over
// the ne0 elements of that row.
//
// src1 indices are taken modulo ne10..ne13, so src1 repeats along any
// dimension in which it is smaller than dst.
//
// ne00..ne03 and ne1..ne3 are not read by the body.
// NOTE(review): presumably kept so the argument layout matches the f32
// kernel set up by the host - confirm.
kernel void kernel_add_f16(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * dst,
        ulong         offsetd,
        int           ne00,
        int           ne01,
        int           ne02,
        int           ne03,
        ulong         nb00,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        int           ne10,
        int           ne11,
        int           ne12,
        int           ne13,
        ulong         nb10,
        ulong         nb11,
        ulong         nb12,
        ulong         nb13,
        int           ne0,
        int           ne1,
        int           ne2,
        int           ne3,
        ulong         nb0,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3
) {
    // Move every base pointer to the start of its tensor view.
    src0 += offset0;
    src1 += offset1;
    dst  += offsetd;

    // Position of this work-group in dims 1..3 of src0/dst.
    int row   = get_group_id(0);
    int plane = get_group_id(1);
    int batch = get_group_id(2);

    // Matching position in src1, wrapped to src1's own extents.
    int row1   = row   % ne11;
    int plane1 = plane % ne12;
    int batch1 = batch % ne13;

    global char * src0_row = src0 + batch*nb03  + plane*nb02  + row*nb01;
    global char * src1_row = src1 + batch1*nb13 + plane1*nb12 + row1*nb11;
    global char * dst_row  = dst  + batch*nb3   + plane*nb2   + row*nb1;

    // Work-items of the group share the row, each taking every
    // get_local_size(0)-th element.
    for (int col = get_local_id(0); col < ne0; col += get_local_size(0)) {
        const int col1 = col % ne10;

        const half lhs = *((global half *)(src0_row + col*nb00));
        const half rhs = *((global half *)(src1_row + col1*nb10));

        *((global half *)(dst_row + col*nb0)) = lhs + rhs;
    }
}
|
||||||
|
|
||||||
|
// Row-broadcast half-precision addition on half4 vectors:
//   dst[i] = src0[i] + src1[i mod ne]
//
// offset0/offset1/offsetd are byte offsets applied to the buffers before
// they are indexed as half4. `ne` is the wrap length for src1, counted in
// half4 elements. Each work-item handles exactly one half4, selected by
// its global id in dimension 0.
kernel void kernel_add_row_f16(
        global half4 * src0,
        ulong          offset0,
        global half4 * src1,
        ulong          offset1,
        global half4 * dst,
        ulong          offsetd,
        int            ne
) {
    global half4 * lhs = (global half4 *)((global char *)src0 + offset0);
    global half4 * rhs = (global half4 *)((global char *)src1 + offset1);
    global half4 * out = (global half4 *)((global char *)dst  + offsetd);

    const uint pos = get_global_id(0);

    // pos % ne, spelled as divide/multiply/subtract because that performs
    // better than using %.
    const uint full_rows = pos / ne;
    const uint pos1      = pos - full_rows*ne;

    out[pos] = lhs[pos] + rhs[pos1];
}
|
||||||
|
|
|
||||||
|
|
@ -70,3 +70,69 @@ kernel void kernel_div_row(
|
||||||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||||
dst[gid] = src0[gid] / src1[idx1];
|
dst[gid] = src0[gid] / src1[idx1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Element-wise half-precision division: dst = src0 / src1.
//
// All three tensors are addressed through byte strides (nb*), and each
// buffer pointer is first advanced by its byte offset (offset0/offset1/
// offsetd). The work-group ids select the row (dim 1), plane (dim 2) and
// batch (dim 3) of src0/dst; the work-items of the group then stride over
// the ne0 elements of that row.
//
// src1 indices are taken modulo ne10..ne13, so src1 repeats along any
// dimension in which it is smaller than dst.
kernel void kernel_div_f16(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * dst,
        ulong         offsetd,
        ulong         nb00,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        int           ne10,
        int           ne11,
        int           ne12,
        int           ne13,
        ulong         nb10,
        ulong         nb11,
        ulong         nb12,
        ulong         nb13,
        int           ne0,
        ulong         nb0,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3
) {
    // Move every base pointer to the start of its tensor view.
    src0 += offset0;
    src1 += offset1;
    dst  += offsetd;

    // Position of this work-group in dims 1..3 of src0/dst.
    int row   = get_group_id(0);
    int plane = get_group_id(1);
    int batch = get_group_id(2);

    // Matching position in src1, wrapped to src1's own extents.
    int row1   = row   % ne11;
    int plane1 = plane % ne12;
    int batch1 = batch % ne13;

    global char * src0_row = src0 + batch*nb03  + plane*nb02  + row*nb01;
    global char * src1_row = src1 + batch1*nb13 + plane1*nb12 + row1*nb11;
    global char * dst_row  = dst  + batch*nb3   + plane*nb2   + row*nb1;

    // Work-items of the group share the row, each taking every
    // get_local_size(0)-th element.
    for (int col = get_local_id(0); col < ne0; col += get_local_size(0)) {
        const int col1 = col % ne10;

        const half numer = *((global half *)(src0_row + col*nb00));
        const half denom = *((global half *)(src1_row + col1*nb10));

        *((global half *)(dst_row + col*nb0)) = numer / denom;
    }
}
|
||||||
|
|
||||||
|
// Row-broadcast half-precision division on half4 vectors:
//   dst[i] = src0[i] / src1[i mod ne]
//
// offset0/offset1/offsetd are byte offsets applied to the buffers before
// they are indexed as half4. `ne` is the wrap length for src1, counted in
// half4 elements. Each work-item handles exactly one half4, selected by
// its global id in dimension 0.
kernel void kernel_div_row_f16(
        global half4 * src0,
        ulong          offset0,
        global half4 * src1,
        ulong          offset1,
        global half4 * dst,
        ulong          offsetd,
        int            ne
) {
    global half4 * numer = (global half4 *)((global char *)src0 + offset0);
    global half4 * denom = (global half4 *)((global char *)src1 + offset1);
    global half4 * out   = (global half4 *)((global char *)dst  + offsetd);

    const uint pos = get_global_id(0);

    // pos % ne, spelled as divide/multiply/subtract because that performs
    // better than using %.
    const uint full_rows = pos / ne;
    const uint pos1      = pos - full_rows*ne;

    out[pos] = numer[pos] / denom[pos1];
}
|
||||||
|
|
|
||||||
|
|
@ -77,3 +77,76 @@ kernel void kernel_mul_row(
|
||||||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||||
dst[gid] = src0[gid] * src1[idx1];
|
dst[gid] = src0[gid] * src1[idx1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Element-wise half-precision multiplication: dst = src0 * src1.
//
// All three tensors are addressed through byte strides (nb*), and each
// buffer pointer is first advanced by its byte offset (offset0/offset1/
// offsetd). The work-group ids select the row (dim 1), plane (dim 2) and
// batch (dim 3) of src0/dst; the work-items of the group then stride over
// the ne0 elements of that row.
//
// src1 indices are taken modulo ne10..ne13, so src1 repeats along any
// dimension in which it is smaller than dst.
//
// ne00..ne03 and ne1..ne3 are not read by the body.
// NOTE(review): presumably kept so the argument layout matches the f32
// kernel set up by the host - confirm.
kernel void kernel_mul_f16(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * dst,
        ulong         offsetd,
        int           ne00,
        int           ne01,
        int           ne02,
        int           ne03,
        ulong         nb00,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        int           ne10,
        int           ne11,
        int           ne12,
        int           ne13,
        ulong         nb10,
        ulong         nb11,
        ulong         nb12,
        ulong         nb13,
        int           ne0,
        int           ne1,
        int           ne2,
        int           ne3,
        ulong         nb0,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3
) {
    // Move every base pointer to the start of its tensor view.
    src0 += offset0;
    src1 += offset1;
    dst  += offsetd;

    // Position of this work-group in dims 1..3 of src0/dst.
    int row   = get_group_id(0);
    int plane = get_group_id(1);
    int batch = get_group_id(2);

    // Matching position in src1, wrapped to src1's own extents.
    int row1   = row   % ne11;
    int plane1 = plane % ne12;
    int batch1 = batch % ne13;

    global char * src0_row = src0 + batch*nb03  + plane*nb02  + row*nb01;
    global char * src1_row = src1 + batch1*nb13 + plane1*nb12 + row1*nb11;
    global char * dst_row  = dst  + batch*nb3   + plane*nb2   + row*nb1;

    // Work-items of the group share the row, each taking every
    // get_local_size(0)-th element.
    for (int col = get_local_id(0); col < ne0; col += get_local_size(0)) {
        const int col1 = col % ne10;

        const half lhs = *((global half *)(src0_row + col*nb00));
        const half rhs = *((global half *)(src1_row + col1*nb10));

        *((global half *)(dst_row + col*nb0)) = lhs * rhs;
    }
}
|
||||||
|
|
||||||
|
// Row-broadcast half-precision multiplication on half4 vectors:
//   dst[i] = src0[i] * src1[i mod ne]
//
// offset0/offset1/offsetd are byte offsets applied to the buffers before
// they are indexed as half4. `ne` is the wrap length for src1, counted in
// half4 elements. Each work-item handles exactly one half4, selected by
// its global id in dimension 0.
kernel void kernel_mul_row_f16(
        global half4 * src0,
        ulong          offset0,
        global half4 * src1,
        ulong          offset1,
        global half4 * dst,
        ulong          offsetd,
        int            ne
) {
    global half4 * lhs = (global half4 *)((global char *)src0 + offset0);
    global half4 * rhs = (global half4 *)((global char *)src1 + offset1);
    global half4 * out = (global half4 *)((global char *)dst  + offsetd);

    const uint pos = get_global_id(0);

    // pos % ne, spelled as divide/multiply/subtract because that performs
    // better than using %.
    const uint full_rows = pos / ne;
    const uint pos1      = pos - full_rows*ne;

    out[pos] = lhs[pos] * rhs[pos1];
}
|
||||||
|
|
|
||||||
|
|
@ -70,3 +70,69 @@ kernel void kernel_sub_row(
|
||||||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||||
dst[gid] = src0[gid] - src1[idx1];
|
dst[gid] = src0[gid] - src1[idx1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Element-wise half-precision subtraction: dst = src0 - src1.
//
// All three tensors are addressed through byte strides (nb*), and each
// buffer pointer is first advanced by its byte offset (offset0/offset1/
// offsetd). The work-group ids select the row (dim 1), plane (dim 2) and
// batch (dim 3) of src0/dst; the work-items of the group then stride over
// the ne0 elements of that row.
//
// src1 indices are taken modulo ne10..ne13, so src1 repeats along any
// dimension in which it is smaller than dst.
kernel void kernel_sub_f16(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * dst,
        ulong         offsetd,
        ulong         nb00,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        int           ne10,
        int           ne11,
        int           ne12,
        int           ne13,
        ulong         nb10,
        ulong         nb11,
        ulong         nb12,
        ulong         nb13,
        int           ne0,
        ulong         nb0,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3
) {
    // Move every base pointer to the start of its tensor view.
    src0 += offset0;
    src1 += offset1;
    dst  += offsetd;

    // Position of this work-group in dims 1..3 of src0/dst.
    int row   = get_group_id(0);
    int plane = get_group_id(1);
    int batch = get_group_id(2);

    // Matching position in src1, wrapped to src1's own extents.
    int row1   = row   % ne11;
    int plane1 = plane % ne12;
    int batch1 = batch % ne13;

    global char * src0_row = src0 + batch*nb03  + plane*nb02  + row*nb01;
    global char * src1_row = src1 + batch1*nb13 + plane1*nb12 + row1*nb11;
    global char * dst_row  = dst  + batch*nb3   + plane*nb2   + row*nb1;

    // Work-items of the group share the row, each taking every
    // get_local_size(0)-th element.
    for (int col = get_local_id(0); col < ne0; col += get_local_size(0)) {
        const int col1 = col % ne10;

        const half lhs = *((global half *)(src0_row + col*nb00));
        const half rhs = *((global half *)(src1_row + col1*nb10));

        *((global half *)(dst_row + col*nb0)) = lhs - rhs;
    }
}
|
||||||
|
|
||||||
|
// Row-broadcast half-precision subtraction on half4 vectors:
//   dst[i] = src0[i] - src1[i mod ne]
//
// offset0/offset1/offsetd are byte offsets applied to the buffers before
// they are indexed as half4. `ne` is the wrap length for src1, counted in
// half4 elements. Each work-item handles exactly one half4, selected by
// its global id in dimension 0.
kernel void kernel_sub_row_f16(
        global half4 * src0,
        ulong          offset0,
        global half4 * src1,
        ulong          offset1,
        global half4 * dst,
        ulong          offsetd,
        int            ne
) {
    global half4 * lhs = (global half4 *)((global char *)src0 + offset0);
    global half4 * rhs = (global half4 *)((global char *)src1 + offset1);
    global half4 * out = (global half4 *)((global char *)dst  + offsetd);

    const uint pos = get_global_id(0);

    // pos % ne, spelled as divide/multiply/subtract because that performs
    // better than using %.
    const uint full_rows = pos / ne;
    const uint pos1      = pos - full_rows*ne;

    out[pos] = lhs[pos] - rhs[pos1];
}
|
||||||
|
|
|
||||||
|
|
@ -2688,6 +2688,9 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||||
const size_t type_size_src0 = ggml_type_size(src0->type);
|
const size_t type_size_src0 = ggml_type_size(src0->type);
|
||||||
const size_t type_size_src1 = ggml_type_size(src1->type);
|
const size_t type_size_src1 = ggml_type_size(src1->type);
|
||||||
|
|
||||||
|
bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
|
||||||
|
bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
|
||||||
|
|
||||||
// SRC1 strides
|
// SRC1 strides
|
||||||
int64_t s11 = nb11 / type_size_src1;
|
int64_t s11 = nb11 / type_size_src1;
|
||||||
int64_t s12 = nb12 / type_size_src1;
|
int64_t s12 = nb12 / type_size_src1;
|
||||||
|
|
@ -2737,6 +2740,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||||
s11 = ne10;
|
s11 = ne10;
|
||||||
s12 = ne11 * s11;
|
s12 = ne11 * s11;
|
||||||
s13 = ne12 * s12;
|
s13 = ne12 * s12;
|
||||||
|
|
||||||
|
is_src1_cont_2 = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
|
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
|
||||||
|
|
@ -2852,12 +2857,16 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
|
if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
|
||||||
|
// with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
|
||||||
|
const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
|
||||||
|
const int64_t smb = ne12 == 1 ? s13 : s12;
|
||||||
|
|
||||||
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
|
||||||
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||||
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
|
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
|
||||||
src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_ddf,
|
src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
|
||||||
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
|
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
|
||||||
} else {
|
} else {
|
||||||
const int ne23 = ne12 * ne13;
|
const int ne23 = ne12 * ne13;
|
||||||
|
|
|
||||||
|
|
@ -222,6 +222,7 @@ enum vk_device_architecture {
|
||||||
AMD_RDNA2,
|
AMD_RDNA2,
|
||||||
AMD_RDNA3,
|
AMD_RDNA3,
|
||||||
INTEL_XE2,
|
INTEL_XE2,
|
||||||
|
NVIDIA_PRE_TURING,
|
||||||
};
|
};
|
||||||
|
|
||||||
// HSK x HSV
|
// HSK x HSV
|
||||||
|
|
@ -315,10 +316,33 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
|
||||||
// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
|
// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
|
||||||
return vk_device_architecture::INTEL_XE2;
|
return vk_device_architecture::INTEL_XE2;
|
||||||
}
|
}
|
||||||
|
} else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
|
||||||
|
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
||||||
|
|
||||||
|
bool cooperative_matrix = false;
|
||||||
|
|
||||||
|
// Detect "pre-turing" based on lack of coopmat support.
|
||||||
|
for (const auto& properties : ext_props) {
|
||||||
|
if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) {
|
||||||
|
cooperative_matrix = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!cooperative_matrix) {
|
||||||
|
return vk_device_architecture::NVIDIA_PRE_TURING;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return vk_device_architecture::OTHER;
|
return vk_device_architecture::OTHER;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Tile-shape variants of the conv2d pipelines. The enumerators index the
// per-shape pipeline arrays (pipeline_conv2d_f32[] / pipeline_conv2d_f16_f32[]),
// so they must stay dense and zero-based, with CONV_SHAPE_COUNT last.
// The names read as <BS_K>x<BS_NPQ>, the block sizes ggml_vk_load_shaders
// assigns for each shape.
enum vk_conv_shapes {
    CONV_SHAPE_128x128 = 0,  // BS_K = 128, BS_NPQ = 128
    CONV_SHAPE_64x32   = 1,  // BS_K = 64,  BS_NPQ = 32
    CONV_SHAPE_32x256  = 2,  // BS_K = 32,  BS_NPQ = 256
    CONV_SHAPE_COUNT   = 3,  // number of shapes; not a shape itself
};
|
||||||
|
|
||||||
struct vk_device_struct {
|
struct vk_device_struct {
|
||||||
std::recursive_mutex mutex;
|
std::recursive_mutex mutex;
|
||||||
|
|
||||||
|
|
@ -483,8 +507,8 @@ struct vk_device_struct {
|
||||||
vk_pipeline pipeline_rwkv_wkv6_f32;
|
vk_pipeline pipeline_rwkv_wkv6_f32;
|
||||||
vk_pipeline pipeline_rwkv_wkv7_f32;
|
vk_pipeline pipeline_rwkv_wkv7_f32;
|
||||||
vk_pipeline pipeline_opt_step_adamw_f32;
|
vk_pipeline pipeline_opt_step_adamw_f32;
|
||||||
vk_pipeline pipeline_conv2d_f32;
|
vk_pipeline pipeline_conv2d_f32[CONV_SHAPE_COUNT];
|
||||||
vk_pipeline pipeline_conv2d_f16_f32;
|
vk_pipeline pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT];
|
||||||
vk_pipeline pipeline_conv2d_dw_whcn_f32;
|
vk_pipeline pipeline_conv2d_dw_whcn_f32;
|
||||||
vk_pipeline pipeline_conv2d_dw_cwhn_f32;
|
vk_pipeline pipeline_conv2d_dw_cwhn_f32;
|
||||||
|
|
||||||
|
|
@ -908,8 +932,22 @@ struct vk_op_conv2d_push_constants {
|
||||||
uint32_t nb1;
|
uint32_t nb1;
|
||||||
uint32_t nb2;
|
uint32_t nb2;
|
||||||
uint32_t nb3;
|
uint32_t nb3;
|
||||||
|
|
||||||
|
// init_fastdiv_values constants for dividing by KW, KW*KH, OW, OW*OH
|
||||||
|
uint32_t KWmp; uint32_t KWL;
|
||||||
|
uint32_t KWKHmp; uint32_t KWKHL;
|
||||||
|
uint32_t OWmp; uint32_t OWL;
|
||||||
|
uint32_t OWOHmp; uint32_t OWOHL;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Specialization for the conv2d push constants: fills the (mp, L) pairs
// that init_fastdiv_values produces for each of the four divisors
// KW, KW*KH, OW and OW*OH.
template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
    // Divisors that are products of two fields.
    const auto kernel_area = p.KW * p.KH;
    const auto output_area = p.OW * p.OH;

    init_fastdiv_values(p.KW,        p.KWmp,   p.KWL);
    init_fastdiv_values(kernel_area, p.KWKHmp, p.KWKHL);
    init_fastdiv_values(p.OW,        p.OWmp,   p.OWL);
    init_fastdiv_values(output_area, p.OWOHmp, p.OWOHL);
}
|
||||||
|
|
||||||
struct vk_op_conv2d_dw_push_constants {
|
struct vk_op_conv2d_dw_push_constants {
|
||||||
uint32_t ne;
|
uint32_t ne;
|
||||||
uint32_t batches;
|
uint32_t batches;
|
||||||
|
|
@ -2068,12 +2106,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
s_mmq_wg_denoms = { 32, 64, 1 };
|
s_mmq_wg_denoms = { 32, 64, 1 };
|
||||||
|
|
||||||
// spec constants and tile sizes for quant matmul (Qi_K)
|
// spec constants and tile sizes for quant matmul (Qi_K)
|
||||||
l_warptile_mmq_k = { 256, 64, 128, 64, 1 };
|
l_warptile_mmq_k = { 256, 128, 256, 64, 1 };
|
||||||
m_warptile_mmq_k = { 256, 32, 64, 64, 0 };
|
m_warptile_mmq_k = { 256, 128, 128, 64, 1 };
|
||||||
s_warptile_mmq_k = { 256, 32, 32, 128, 0 };
|
s_warptile_mmq_k = { 256, 32, 64, 128, 0 };
|
||||||
l_mmq_wg_denoms_k = { 64, 128, 1 };
|
l_mmq_wg_denoms_k = { 128, 256, 1 };
|
||||||
m_mmq_wg_denoms_k = { 32, 64, 1 };
|
m_mmq_wg_denoms_k = { 128, 128, 1 };
|
||||||
s_mmq_wg_denoms_k = { 32, 32, 1 };
|
s_mmq_wg_denoms_k = { 32, 64, 1 };
|
||||||
|
|
||||||
// spec constants and tile sizes for quant matmul_id
|
// spec constants and tile sizes for quant matmul_id
|
||||||
l_warptile_mmqid = { 256, 128, 128, 16, 0 };
|
l_warptile_mmqid = { 256, 128, 128, 16, 0 };
|
||||||
|
|
@ -2847,7 +2885,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true);
|
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 9 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 12 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
|
||||||
|
|
@ -3048,48 +3086,105 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
|
||||||
// conv2d
|
// conv2d
|
||||||
uint32_t conv2d_WG_SIZE = 256;
|
for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
|
||||||
uint32_t conv2d_BS_K = 128;
|
uint32_t conv2d_WG_SIZE = 256;
|
||||||
uint32_t conv2d_BS_CRS = 16;
|
uint32_t conv2d_BS_K = 128;
|
||||||
uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
|
uint32_t conv2d_BS_CRS = 16;
|
||||||
if (device->subgroup_shuffle &&
|
uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
|
||||||
device->vendor_id != VK_VENDOR_ID_INTEL) { // Do not enable collectives on Intel, see PR 14316
|
uint32_t conv2d_BS_NPQ = 128;
|
||||||
use_collectives = 1;
|
uint32_t conv2d_TS_K = 8;
|
||||||
conv2d_BS_CRS = std::min(
|
uint32_t conv2d_SHMEM_PAD = 4;
|
||||||
device->subgroup_size,
|
bool conv2d_UNROLL = true;
|
||||||
conv2d_BS_CRS); // CRS block size should be capped at sugroup size for correctness when shuffle is used.
|
|
||||||
}
|
|
||||||
uint32_t conv2d_BS_NPQ = 128;
|
|
||||||
uint32_t conv2d_TS_K = 8;
|
|
||||||
uint32_t conv2d_shmem_req =
|
|
||||||
(conv2d_BS_K * (conv2d_BS_CRS + 1) + conv2d_BS_CRS * (conv2d_BS_NPQ + 1)) * sizeof(float);
|
|
||||||
if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
|
|
||||||
conv2d_BS_CRS = 8;
|
|
||||||
if (use_collectives) {
|
|
||||||
conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (use_collectives) {
|
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||||
ggml_vk_create_pipeline(
|
if (device->coopmat2) {
|
||||||
device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
|
conv2d_SHMEM_PAD = 8; // 8 float16_t
|
||||||
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
|
}
|
||||||
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
|
#endif
|
||||||
ggml_vk_create_pipeline(
|
|
||||||
device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
|
if (device->vendor_id == VK_VENDOR_ID_INTEL) {
|
||||||
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
|
conv2d_SHMEM_PAD = 0;
|
||||||
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
|
conv2d_UNROLL = false;
|
||||||
} else {
|
} else if (device->vendor_id == VK_VENDOR_ID_AMD) {
|
||||||
ggml_vk_create_pipeline(
|
conv2d_SHMEM_PAD = device->architecture == vk_device_architecture::AMD_GCN ? 1 : 4;
|
||||||
device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
|
}
|
||||||
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
|
|
||||||
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
|
switch (s) {
|
||||||
false);
|
default:
|
||||||
ggml_vk_create_pipeline(
|
case CONV_SHAPE_128x128:
|
||||||
device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
|
conv2d_BS_K = 128;
|
||||||
sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
|
conv2d_BS_NPQ = 128;
|
||||||
{ conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
|
conv2d_BS_CRS = 16;
|
||||||
false);
|
if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != vk_device_architecture::AMD_GCN) {
|
||||||
|
conv2d_UNROLL = false;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case CONV_SHAPE_64x32:
|
||||||
|
conv2d_BS_K = 64;
|
||||||
|
conv2d_BS_NPQ = 32;
|
||||||
|
conv2d_BS_CRS = 32;
|
||||||
|
conv2d_TS_K = 4;
|
||||||
|
break;
|
||||||
|
case CONV_SHAPE_32x256:
|
||||||
|
conv2d_BS_K = 32;
|
||||||
|
conv2d_BS_NPQ = 256;
|
||||||
|
conv2d_BS_CRS = 16;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use collectives on pre-Turing NVIDIA GPUs and GCN AMD cards, which had slower integer math.
|
||||||
|
bool allow_collectives_nv = device->vendor_id != VK_VENDOR_ID_NVIDIA ||
|
||||||
|
device->architecture == vk_device_architecture::NVIDIA_PRE_TURING;
|
||||||
|
bool allow_collectives_amd = device->vendor_id != VK_VENDOR_ID_AMD ||
|
||||||
|
device->architecture == vk_device_architecture::AMD_GCN;
|
||||||
|
|
||||||
|
if (device->subgroup_shuffle &&
|
||||||
|
device->vendor_id != VK_VENDOR_ID_INTEL && // Do not enable collectives on Intel, see PR 14316.
|
||||||
|
allow_collectives_nv &&
|
||||||
|
allow_collectives_amd) {
|
||||||
|
use_collectives = 1;
|
||||||
|
conv2d_BS_CRS = std::min(
|
||||||
|
device->subgroup_size,
|
||||||
|
conv2d_BS_CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used.
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t conv2d_shmem_req =
|
||||||
|
(conv2d_BS_K * (conv2d_BS_CRS + conv2d_SHMEM_PAD) + conv2d_BS_CRS * (conv2d_BS_NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
|
||||||
|
if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
|
||||||
|
conv2d_BS_CRS = 8;
|
||||||
|
if (use_collectives) {
|
||||||
|
conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::array<uint32_t, 3> wg_denoms = { conv2d_BS_K, conv2d_BS_NPQ, 1 };
|
||||||
|
std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
|
||||||
|
|
||||||
|
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||||
|
if (device->coopmat2) {
|
||||||
|
ggml_vk_create_pipeline(
|
||||||
|
device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_cm2_len, conv2d_f32_cm2_data, "main", 3,
|
||||||
|
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
|
||||||
|
ggml_vk_create_pipeline(
|
||||||
|
device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_cm2_len, conv2d_f16_f32_cm2_data, "main", 3,
|
||||||
|
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
|
||||||
|
} else
|
||||||
|
#endif
|
||||||
|
if (conv2d_UNROLL) {
|
||||||
|
ggml_vk_create_pipeline(
|
||||||
|
device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_unroll_len, conv2d_f32_unroll_data, "main", 3,
|
||||||
|
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
|
||||||
|
ggml_vk_create_pipeline(
|
||||||
|
device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_unroll_len, conv2d_f16_f32_unroll_data, "main", 3,
|
||||||
|
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
|
||||||
|
} else {
|
||||||
|
ggml_vk_create_pipeline(
|
||||||
|
device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
|
||||||
|
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
|
||||||
|
ggml_vk_create_pipeline(
|
||||||
|
device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
|
||||||
|
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
|
||||||
|
|
@ -4943,26 +5038,37 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
|
||||||
ggml_vk_queue_command_pools_cleanup(dst->device);
|
ggml_vk_queue_command_pools_cleanup(dst->device);
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
|
static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, uint32_t m, uint32_t n, uint32_t k, const vk_pipeline& pipeline) {
|
||||||
VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
|
VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
|
||||||
|
|
||||||
uint32_t split_k = 1;
|
uint32_t split_k = 1;
|
||||||
if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
|
if (ctx->device->shader_core_count != 0 && m >= pipeline->wg_denoms[0] && n >= pipeline->wg_denoms[1]) {
|
||||||
// If k is 'large' and the SMs will fill less than halfway, use split_k.
|
// If k is 'large' and the SMs will fill less than halfway, use split_k.
|
||||||
uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
|
uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
|
||||||
uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
|
uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
|
||||||
if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
|
|
||||||
split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
|
if (k >= 2048) {
|
||||||
// Clamp to 2 or 4
|
if (m_tiles * n_tiles <= ctx->device->shader_core_count / 2) {
|
||||||
split_k = std::min(split_k, 4u);
|
split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
|
||||||
if (split_k == 3) {
|
} else if (m_tiles * n_tiles <= ctx->device->shader_core_count * 2 / 3) {
|
||||||
split_k = 2;
|
split_k = 3;
|
||||||
}
|
}
|
||||||
if (ctx->device->coopmat2) {
|
// Cap the split at 8x. Unless k is huge this is a lot of overhead.
|
||||||
// coopmat2 shader expects splits to be aligned to 256
|
split_k = std::min(split_k, 8u);
|
||||||
while (split_k > 1 && ((k / split_k) % 256) != 0) {
|
|
||||||
split_k /= 2;
|
// ggml_vk_matmul will align the splits to be a multiple of 256.
|
||||||
|
// If this rounded up size would cause the last split to be empty,
|
||||||
|
// then reduce the split count.
|
||||||
|
while (true) {
|
||||||
|
if (split_k == 1) {
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
uint32_t k_split = CEIL_DIV(k, split_k);
|
||||||
|
k_split = ROUNDUP_POW2(k_split, 256);
|
||||||
|
if (k_split * (split_k - 1) < k) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
split_k--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -4974,9 +5080,22 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
|
||||||
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
|
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
|
||||||
|
|
||||||
if (ctx->device->coopmat2) {
|
if (ctx->device->coopmat2) {
|
||||||
|
const uint32_t shader_core_count = ctx->device->shader_core_count;
|
||||||
|
const uint32_t tiles_l = CEIL_DIV(m, mmp->a_l->wg_denoms[0]) * CEIL_DIV(n, mmp->a_l->wg_denoms[1]);
|
||||||
|
const uint32_t tiles_m = CEIL_DIV(m, mmp->a_m->wg_denoms[0]) * CEIL_DIV(n, mmp->a_m->wg_denoms[1]);
|
||||||
|
|
||||||
// Use large shader when the N dimension is greater than the medium shader's tile size
|
// Use large shader when the N dimension is greater than the medium shader's tile size
|
||||||
uint32_t crossover_large = mmp->m->wg_denoms[1];
|
uint32_t crossover_large = mmp->m->wg_denoms[1];
|
||||||
if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
|
|
||||||
|
// Prefer large over medium if either:
|
||||||
|
// - medium or large tiles would overfill the GPU
|
||||||
|
// - large tiles with a split_k==3 fits in the GPU and medium tiles with split_k==2 does not
|
||||||
|
// (medium with split_k==2 is probably better if it fits - more workgroups running and less split_k overhead)
|
||||||
|
bool prefer_large = tiles_m > shader_core_count || tiles_l > shader_core_count ||
|
||||||
|
// split_k==3 with large tiles likely better than medium tiles with no split_k.
|
||||||
|
(tiles_l <= shader_core_count / 3 && tiles_m > shader_core_count / 2);
|
||||||
|
|
||||||
|
if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large && prefer_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
|
||||||
return aligned ? mmp->a_l : mmp->l;
|
return aligned ? mmp->a_l : mmp->l;
|
||||||
}
|
}
|
||||||
// Use medium shader when the N dimension is greater than the small shader's tile size
|
// Use medium shader when the N dimension is greater than the small shader's tile size
|
||||||
|
|
@ -5020,7 +5139,11 @@ static void ggml_vk_matmul(
|
||||||
|
|
||||||
GGML_ASSERT(batch_stride_d == m * n);
|
GGML_ASSERT(batch_stride_d == m * n);
|
||||||
|
|
||||||
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
|
// Round the split size up to a multiple of 256 (k-quant alignment)
|
||||||
|
uint32_t k_split = CEIL_DIV(k, split_k);
|
||||||
|
k_split = ROUNDUP_POW2(k_split, 256);
|
||||||
|
|
||||||
|
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
|
||||||
// Make sure enough workgroups get assigned for split k to work
|
// Make sure enough workgroups get assigned for split k to work
|
||||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
|
||||||
ggml_vk_sync_buffers(subctx);
|
ggml_vk_sync_buffers(subctx);
|
||||||
|
|
@ -5742,7 +5865,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
||||||
const uint64_t ne00 = src0->ne[0];
|
const uint64_t ne00 = src0->ne[0];
|
||||||
const uint64_t ne01 = src0->ne[1];
|
const uint64_t ne01 = src0->ne[1];
|
||||||
const uint64_t ne02 = src0->ne[2];
|
const uint64_t ne02 = src0->ne[2];
|
||||||
// const uint64_t ne03 = src0->ne[3];
|
const uint64_t ne03 = src0->ne[3];
|
||||||
|
|
||||||
const uint64_t nb01 = src0->nb[1];
|
const uint64_t nb01 = src0->nb[1];
|
||||||
const uint64_t nb02 = src0->nb[2];
|
const uint64_t nb02 = src0->nb[2];
|
||||||
|
|
@ -5754,7 +5877,12 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
||||||
const uint64_t ne12 = src1->ne[2];
|
const uint64_t ne12 = src1->ne[2];
|
||||||
// const uint64_t ne13 = src1->ne[3];
|
// const uint64_t ne13 = src1->ne[3];
|
||||||
|
|
||||||
|
const uint32_t nb03 = (uint32_t)(src0->nb[3] / sizeof(ggml_fp16_t));
|
||||||
|
const uint32_t nb13 = (uint32_t)(src1->nb[3] / sizeof(float));
|
||||||
|
const uint32_t nb23 = (uint32_t)(dst->nb[3] / sizeof(float));
|
||||||
|
|
||||||
GGML_ASSERT(ne11 == 1);
|
GGML_ASSERT(ne11 == 1);
|
||||||
|
GGML_ASSERT(src0->ne[3] == src1->ne[3]); // checked in supports_op
|
||||||
|
|
||||||
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
|
||||||
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
||||||
|
|
@ -5770,7 +5898,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
||||||
src1_uma = d_Qy != nullptr;
|
src1_uma = d_Qy != nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint64_t d_ne = ne01 * ne11 * ne12;
|
const uint64_t d_ne = ne01 * ne11 * ne12 * ne03;
|
||||||
|
|
||||||
const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
|
const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
|
||||||
const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);
|
const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);
|
||||||
|
|
@ -5805,10 +5933,10 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
||||||
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
||||||
|
|
||||||
// compute
|
// compute
|
||||||
const std::array<uint32_t, 9> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
|
const std::array<uint32_t, 12> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)), nb03, nb13, nb23 };
|
||||||
ggml_vk_sync_buffers(subctx);
|
ggml_vk_sync_buffers(subctx);
|
||||||
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
|
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
|
||||||
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
|
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 });
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||||
|
|
@ -6641,6 +6769,34 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::array<uint32_t, 3> ggml_vk_get_conv_elements(const ggml_tensor *dst) {
|
||||||
|
const ggml_tensor *src0 = dst->src[0];
|
||||||
|
const ggml_tensor *src1 = dst->src[1];
|
||||||
|
|
||||||
|
// src0 - kernel: [KW, KH, Cin, Cout]
|
||||||
|
// src1 - input: [W, H, Cin, N]
|
||||||
|
// dst - result: [OW, OH, Cout, N]
|
||||||
|
|
||||||
|
// Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
|
||||||
|
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
|
||||||
|
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
|
||||||
|
};
|
||||||
|
// parallelize in {OW/BS_K, OH/BS_NPQ, 1}
|
||||||
|
int64_t W = src1->ne[0];
|
||||||
|
int64_t H = src1->ne[1];
|
||||||
|
int64_t KW = src0->ne[0];
|
||||||
|
int64_t KH = src0->ne[1];
|
||||||
|
int64_t Cout = src0->ne[3];
|
||||||
|
int64_t N = src1->ne[3];
|
||||||
|
int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
|
||||||
|
int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
|
||||||
|
int64_t NPQ = N * OW * OH;
|
||||||
|
|
||||||
|
// Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
|
||||||
|
std::array<uint32_t, 3> elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
|
||||||
|
return elements;
|
||||||
|
}
|
||||||
|
|
||||||
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
|
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
|
||||||
switch (op) {
|
switch (op) {
|
||||||
case GGML_OP_GET_ROWS:
|
case GGML_OP_GET_ROWS:
|
||||||
|
|
@ -6970,10 +7126,30 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||||
case GGML_OP_CONV_2D:
|
case GGML_OP_CONV_2D:
|
||||||
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
|
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
|
||||||
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
|
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
|
||||||
|
auto elements = ggml_vk_get_conv_elements(dst);
|
||||||
|
vk_conv_shapes shape;
|
||||||
|
|
||||||
|
uint32_t tiles[CONV_SHAPE_COUNT];
|
||||||
|
for (uint32_t i = 0; i < CONV_SHAPE_COUNT; ++i) {
|
||||||
|
tiles[i] = CEIL_DIV(elements[0], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[0]) * CEIL_DIV(elements[1], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// We can't query number of shader cores on Intel, use 32 as a placeholder
|
||||||
|
// so small convolutions will still choose a smaller tile.
|
||||||
|
const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
|
||||||
|
|
||||||
|
if (elements[0] > 64 && tiles[CONV_SHAPE_128x128] >= shader_core_count * 2) {
|
||||||
|
shape = CONV_SHAPE_128x128;
|
||||||
|
} else if (elements[0] <= 32 && tiles[CONV_SHAPE_32x256] >= shader_core_count * 2) {
|
||||||
|
shape = CONV_SHAPE_32x256;
|
||||||
|
} else {
|
||||||
|
shape = CONV_SHAPE_64x32;
|
||||||
|
}
|
||||||
|
|
||||||
if (src0->type == GGML_TYPE_F32) {
|
if (src0->type == GGML_TYPE_F32) {
|
||||||
return ctx->device->pipeline_conv2d_f32;
|
return ctx->device->pipeline_conv2d_f32[shape];
|
||||||
} else if (src0->type == GGML_TYPE_F16) {
|
} else if (src0->type == GGML_TYPE_F16) {
|
||||||
return ctx->device->pipeline_conv2d_f16_f32;
|
return ctx->device->pipeline_conv2d_f16_f32[shape];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
@ -7301,29 +7477,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_CONV_2D:
|
case GGML_OP_CONV_2D:
|
||||||
{
|
{
|
||||||
// src0 - kernel: [KW, KH, Cin, Cout]
|
elements = ggml_vk_get_conv_elements(dst);
|
||||||
// src1 - input: [W, H, Cin, N]
|
} break;
|
||||||
// dst - result: [OW, OH, Cout, N]
|
|
||||||
|
|
||||||
// Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
|
|
||||||
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
|
|
||||||
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
|
|
||||||
};
|
|
||||||
// parallelize in {OW/BS_K, OH/BS_NPQ, 1}
|
|
||||||
int64_t W = src1->ne[0];
|
|
||||||
int64_t H = src1->ne[1];
|
|
||||||
int64_t KW = src0->ne[0];
|
|
||||||
int64_t KH = src0->ne[1];
|
|
||||||
int64_t Cout = src0->ne[3];
|
|
||||||
int64_t N = src1->ne[3];
|
|
||||||
int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
|
|
||||||
int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
|
|
||||||
int64_t NPQ = N * OW * OH;
|
|
||||||
|
|
||||||
// Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
|
|
||||||
elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
case GGML_OP_SUB:
|
case GGML_OP_SUB:
|
||||||
case GGML_OP_DIV:
|
case GGML_OP_DIV:
|
||||||
|
|
|
||||||
|
|
@ -1,14 +1,18 @@
|
||||||
#version 450
|
#version 450
|
||||||
|
|
||||||
|
#extension GL_EXT_control_flow_attributes : enable
|
||||||
|
#ifdef COOPMAT2
|
||||||
|
#extension GL_NV_cooperative_matrix2 : enable
|
||||||
|
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
||||||
|
#extension GL_KHR_memory_scope_semantics : enable
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef USE_COLLECTIVES
|
#ifdef USE_COLLECTIVES
|
||||||
# extension GL_KHR_shader_subgroup_shuffle : enable
|
# extension GL_KHR_shader_subgroup_shuffle : enable
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "types.comp"
|
#include "types.comp"
|
||||||
|
|
||||||
// Make spec constant
|
|
||||||
#define SHMEM_PAD 0
|
|
||||||
|
|
||||||
// shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j
|
// shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j
|
||||||
layout(binding = 0) readonly buffer A {
|
layout(binding = 0) readonly buffer A {
|
||||||
A_TYPE knl_data[];
|
A_TYPE knl_data[];
|
||||||
|
|
@ -56,6 +60,12 @@ layout(push_constant) uniform parameter {
|
||||||
uint32_t nb1;
|
uint32_t nb1;
|
||||||
uint32_t nb2;
|
uint32_t nb2;
|
||||||
uint32_t nb3;
|
uint32_t nb3;
|
||||||
|
|
||||||
|
// fastdiv helper values
|
||||||
|
uint32_t KWmp; uint32_t KWL;
|
||||||
|
uint32_t KWKHmp; uint32_t KWKHL;
|
||||||
|
uint32_t OWmp; uint32_t OWL;
|
||||||
|
uint32_t OWOHmp; uint32_t OWOHL;
|
||||||
}
|
}
|
||||||
|
|
||||||
p;
|
p;
|
||||||
|
|
@ -68,6 +78,7 @@ layout(constant_id = 3) const uint BS_NPQ = 128;
|
||||||
// Thread-tile sizes
|
// Thread-tile sizes
|
||||||
layout(constant_id = 4) const uint TS_K = 8;
|
layout(constant_id = 4) const uint TS_K = 8;
|
||||||
layout(constant_id = 5) const uint use_collectives = 1;
|
layout(constant_id = 5) const uint use_collectives = 1;
|
||||||
|
layout(constant_id = 6) const uint SHMEM_PAD = 4;
|
||||||
|
|
||||||
uint32_t tid = gl_LocalInvocationID.x;
|
uint32_t tid = gl_LocalInvocationID.x;
|
||||||
const uint32_t WG_SIZE = gl_WorkGroupSize.x;
|
const uint32_t WG_SIZE = gl_WorkGroupSize.x;
|
||||||
|
|
@ -85,6 +96,12 @@ uint32_t n_elems_out = K * NPQ;
|
||||||
// Number of blocktiles per input
|
// Number of blocktiles per input
|
||||||
uint32_t NB_CRS = splitWork(CRS, BS_CRS);
|
uint32_t NB_CRS = splitWork(CRS, BS_CRS);
|
||||||
|
|
||||||
|
#ifdef COOPMAT2
|
||||||
|
#define SHMEM_TYPE float16_t
|
||||||
|
#else
|
||||||
|
#define SHMEM_TYPE float
|
||||||
|
#endif
|
||||||
|
|
||||||
const uint32_t Ash_stride = BS_CRS + SHMEM_PAD;
|
const uint32_t Ash_stride = BS_CRS + SHMEM_PAD;
|
||||||
const uint32_t Bsh_stride = BS_NPQ + SHMEM_PAD;
|
const uint32_t Bsh_stride = BS_NPQ + SHMEM_PAD;
|
||||||
|
|
||||||
|
|
@ -94,8 +111,8 @@ const uint32_t Bsh_numel = BS_CRS * BS_NPQ;
|
||||||
const uint32_t Ash_len = BS_K * Ash_stride;
|
const uint32_t Ash_len = BS_K * Ash_stride;
|
||||||
const uint32_t Bsh_len = BS_CRS * Bsh_stride;
|
const uint32_t Bsh_len = BS_CRS * Bsh_stride;
|
||||||
|
|
||||||
shared float Ash[Ash_len]; // K x CRS
|
shared SHMEM_TYPE Ash[Ash_len]; // K x CRS
|
||||||
shared float Bsh[Bsh_len]; // CRS x NPQ
|
shared SHMEM_TYPE Bsh[Bsh_len]; // CRS x NPQ
|
||||||
|
|
||||||
// Threadtile sizes
|
// Threadtile sizes
|
||||||
const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
|
const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
|
||||||
|
|
@ -104,10 +121,6 @@ const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
|
||||||
const uint32_t NT_K = BS_K / TS_K;
|
const uint32_t NT_K = BS_K / TS_K;
|
||||||
const uint32_t NT_NPQ = BS_NPQ / TS_NPQ;
|
const uint32_t NT_NPQ = BS_NPQ / TS_NPQ;
|
||||||
|
|
||||||
float regA[TS_K];
|
|
||||||
float regB[TS_NPQ];
|
|
||||||
float regC[TS_K][TS_NPQ];
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Compute
|
Compute
|
||||||
KxCRS @ CRSxNPQ = K x NPQ
|
KxCRS @ CRSxNPQ = K x NPQ
|
||||||
|
|
@ -131,12 +144,44 @@ uint32_t Br = tid / BS_NPQ;
|
||||||
uint32_t Bc = tid % BS_NPQ;
|
uint32_t Bc = tid % BS_NPQ;
|
||||||
const uint32_t BrpWg = WG_SIZE / BS_NPQ;
|
const uint32_t BrpWg = WG_SIZE / BS_NPQ;
|
||||||
|
|
||||||
|
// see init_fastdiv_values in ggml-vulkan.cpp
|
||||||
|
// Division by a fixed divisor using a precomputed (mp, L) pair instead of a
// hardware divide: returns (mulhi(n, mp) + n) >> L.
// The pairs are supplied through the push constants (e.g. p.OWmp / p.OWL).
uint fastdiv(uint n, uint mp, uint L) {
    uint hi;
    uint lo; // low 32 bits of n * mp; required by umulExtended but not used
    umulExtended(n, mp, hi, lo);
    uint biased = hi + n;
    return biased >> L;
}
|
||||||
|
|
||||||
|
#ifdef COOPMAT2
|
||||||
|
#define ACC_TYPE float16_t
|
||||||
|
|
||||||
|
// Writes one accumulator element of the COOPMAT2 path to dst_data.
//
// r / c are offset by this workgroup's block indices (B_idx_K * BS_K and
// B_idx_NPQ * BS_NPQ) to obtain the global K and NPQ coordinates. The flat NPQ
// coordinate is split into (N, OH, OW) with fastdiv, and the value is stored at
// OW + OH * nb1 + K * nb2 + N * nb3 unless the element lies outside the
// K x NPQ output. elem is returned unchanged.
// NOTE(review): the call site is not visible here; presumably this is passed
// to a per-element cooperative-matrix operation - confirm.
ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem)
{
    uint32_t K_idx   = B_idx_K * BS_K + r;
    uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + c;

    uint32_t N_idx   = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // NPQ_idx / (p.OH * p.OW)
    uint32_t NPQ_rem = NPQ_idx - N_idx * p.OH * p.OW;
    uint32_t OH_idx  = fastdiv(NPQ_rem, p.OWmp, p.OWL);     // NPQ_rem / p.OW
    uint32_t OW_idx  = NPQ_rem - OH_idx * p.OW;

    bool in_bounds = K_idx < K && NPQ_idx < NPQ;
    if (in_bounds) {
        uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
        dst_data[dst_idx] = D_TYPE(elem);
    }
    return elem;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
|
#ifdef COOPMAT2
|
||||||
|
coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
|
||||||
|
matC = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator>(0.0);
|
||||||
|
#else
|
||||||
|
float regC[TS_K][TS_NPQ];
|
||||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||||
regC[T_ly][T_lx] = 0.0;
|
regC[T_ly][T_lx] = 0.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
/* Advance block in CRS dim */
|
/* Advance block in CRS dim */
|
||||||
for (uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++) {
|
for (uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++) {
|
||||||
uint32_t CRS_idx_a;
|
uint32_t CRS_idx_a;
|
||||||
|
|
@ -151,9 +196,9 @@ void main() {
|
||||||
uint32_t cached_KW_idx;
|
uint32_t cached_KW_idx;
|
||||||
if (use_collectives == 1) {
|
if (use_collectives == 1) {
|
||||||
cached_CRS_idx = B_idx_CRS * BS_CRS + gl_SubgroupInvocationID;
|
cached_CRS_idx = B_idx_CRS * BS_CRS + gl_SubgroupInvocationID;
|
||||||
cached_Cin_idx = cached_CRS_idx / (p.KW * p.KH);
|
cached_Cin_idx = fastdiv(cached_CRS_idx, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
|
||||||
uint32_t cached_CRS_remainder = (cached_CRS_idx - cached_Cin_idx * p.KW * p.KH);
|
uint32_t cached_CRS_remainder = (cached_CRS_idx - cached_Cin_idx * p.KW * p.KH);
|
||||||
cached_KH_idx = cached_CRS_remainder / p.KW;
|
cached_KH_idx = fastdiv(cached_CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
|
||||||
cached_KW_idx = cached_CRS_remainder - cached_KH_idx * p.KW;
|
cached_KW_idx = cached_CRS_remainder - cached_KH_idx * p.KW;
|
||||||
|
|
||||||
CRS_idx_a = subgroupShuffle(cached_CRS_idx, Ac);
|
CRS_idx_a = subgroupShuffle(cached_CRS_idx, Ac);
|
||||||
|
|
@ -162,16 +207,16 @@ void main() {
|
||||||
KW_idx_a = subgroupShuffle(cached_KW_idx, Ac);
|
KW_idx_a = subgroupShuffle(cached_KW_idx, Ac);
|
||||||
} else {
|
} else {
|
||||||
CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A)
|
CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A)
|
||||||
Cin_idx_a = CRS_idx_a / (p.KW * p.KH);
|
Cin_idx_a = fastdiv(CRS_idx_a, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
|
||||||
uint32_t CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
|
uint32_t CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
|
||||||
KH_idx_a = CRS_remainder / p.KW;
|
KH_idx_a = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
|
||||||
KW_idx_a = CRS_remainder - KH_idx_a * p.KW;
|
KW_idx_a = CRS_remainder - KH_idx_a * p.KW;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A)
|
CRS_idx_a = B_idx_CRS * BS_CRS + Ac; // Global CRS_idx_a (column index of A)
|
||||||
Cin_idx_a = CRS_idx_a / (p.KW * p.KH);
|
Cin_idx_a = fastdiv(CRS_idx_a, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
|
||||||
CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
|
CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
|
||||||
KH_idx_a = CRS_remainder / p.KW;
|
KH_idx_a = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
|
||||||
KW_idx_a = CRS_remainder - KH_idx_a * p.KW;
|
KW_idx_a = CRS_remainder - KH_idx_a * p.KW;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
@ -185,16 +230,16 @@ void main() {
|
||||||
if (K_idx >= K || CRS_idx_a >= CRS) {
|
if (K_idx >= K || CRS_idx_a >= CRS) {
|
||||||
val = 0.0;
|
val = 0.0;
|
||||||
}
|
}
|
||||||
Ash[B_ly * Ash_stride + B_lx] = val;
|
Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val);
|
||||||
}
|
}
|
||||||
/* Load input to B_block: (BS_CRS x BS_NPQ) */
|
/* Load input to B_block: (BS_CRS x BS_NPQ) */
|
||||||
for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
|
UNROLL for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
|
||||||
uint32_t B_ly = r_offset + Br; /* Row index of B block */
|
uint32_t B_ly = r_offset + Br; /* Row index of B block */
|
||||||
uint32_t B_lx = Bc;
|
uint32_t B_lx = Bc;
|
||||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + B_lx; /* Global NPQ index (column index of B) */
|
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + B_lx; /* Global NPQ index (column index of B) */
|
||||||
uint32_t N_idx = NPQ_idx / (p.OH * p.OW);
|
uint32_t N_idx = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // divide by p.OH * p.OW;
|
||||||
uint32_t NPQ_remainder = NPQ_idx - N_idx * p.OH * p.OW;
|
uint32_t NPQ_remainder = NPQ_idx - N_idx * p.OH * p.OW;
|
||||||
uint32_t OH_idx = NPQ_remainder / p.OW;
|
uint32_t OH_idx = fastdiv(NPQ_remainder, p.OWmp, p.OWL); // divide by p.OW;
|
||||||
uint32_t OW_idx = NPQ_remainder - OH_idx * p.OW;
|
uint32_t OW_idx = NPQ_remainder - OH_idx * p.OW;
|
||||||
|
|
||||||
uint32_t CRS_idx_b;
|
uint32_t CRS_idx_b;
|
||||||
|
|
@ -209,16 +254,16 @@ void main() {
|
||||||
KW_idx_b = subgroupShuffle(cached_KW_idx, r_offset + Br);
|
KW_idx_b = subgroupShuffle(cached_KW_idx, r_offset + Br);
|
||||||
} else {
|
} else {
|
||||||
CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
|
CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
|
||||||
Cin_idx_b = CRS_idx_b / (p.KW * p.KH);
|
Cin_idx_b = fastdiv(CRS_idx_b, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
|
||||||
uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
|
uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
|
||||||
KH_idx_b = CRS_remainder / p.KW;
|
KH_idx_b = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
|
||||||
KW_idx_b = CRS_remainder - KH_idx_b * p.KW;
|
KW_idx_b = CRS_remainder - KH_idx_b * p.KW;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
|
CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
|
||||||
Cin_idx_b = CRS_idx_b / (p.KW * p.KH);
|
Cin_idx_b = fastdiv(CRS_idx_b, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
|
||||||
uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
|
uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
|
||||||
KH_idx_b = CRS_remainder / p.KW;
|
KH_idx_b = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
|
||||||
KW_idx_b = CRS_remainder - KH_idx_b * p.KW;
|
KW_idx_b = CRS_remainder - KH_idx_b * p.KW;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
@ -230,36 +275,55 @@ void main() {
|
||||||
if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) {
|
if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) {
|
||||||
val = 0.0;
|
val = 0.0;
|
||||||
}
|
}
|
||||||
Bsh[B_ly * Bsh_stride + B_lx] = val;
|
Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val);
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
|
#ifdef COOPMAT2
|
||||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
coopmat<float16_t, gl_ScopeWorkgroup, BS_K, BS_CRS, gl_MatrixUseA> matA;
|
||||||
regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
|
coopmat<float16_t, gl_ScopeWorkgroup, BS_CRS, BS_NPQ, gl_MatrixUseB> matB;
|
||||||
}
|
|
||||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
coopMatLoad(matA, Ash, 0, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||||
regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx];
|
coopMatLoad(matB, Bsh, 0, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||||
}
|
matC = coopMatMulAdd(matA, matB, matC);
|
||||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
#else
|
||||||
|
if (T_y * TS_K < K) {
|
||||||
|
UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
|
||||||
|
float regA[TS_K];
|
||||||
|
float regB[TS_NPQ];
|
||||||
|
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||||
|
regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
|
||||||
|
}
|
||||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||||
regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
|
regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx];
|
||||||
|
}
|
||||||
|
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||||
|
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||||
|
regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
/* Save C* */
|
/* Save C* */
|
||||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
#ifdef COOPMAT2
|
||||||
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
coopMatPerElementNV(matC, matC, perElemOpStore);
|
||||||
uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly;
|
#else
|
||||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
|
if (T_y * TS_K < K) {
|
||||||
uint32_t N_idx = NPQ_idx / (p.OH * p.OW);
|
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||||
uint32_t OH_idx = (NPQ_idx - N_idx * p.OH * p.OW) / p.OW;
|
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
|
||||||
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
|
uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly;
|
||||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
|
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
|
||||||
if (K_idx < K && NPQ_idx < NPQ) {
|
uint32_t N_idx = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // divide by p.OH * p.OW;
|
||||||
dst_data[dst_idx] = regC[T_ly][T_lx];
|
uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
|
||||||
|
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
|
||||||
|
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
|
||||||
|
if (K_idx < K && NPQ_idx < NPQ) {
|
||||||
|
dst_data[dst_idx] = regC[T_ly][T_lx];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,9 @@ layout (push_constant) uniform parameter
|
||||||
uint ne12;
|
uint ne12;
|
||||||
uint b_offset;
|
uint b_offset;
|
||||||
uint d_offset;
|
uint d_offset;
|
||||||
|
uint nb03;
|
||||||
|
uint nb13;
|
||||||
|
uint nb23;
|
||||||
} p;
|
} p;
|
||||||
|
|
||||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
||||||
|
|
@ -34,6 +37,7 @@ void main() {
|
||||||
const uint tid = gl_LocalInvocationID.x;
|
const uint tid = gl_LocalInvocationID.x;
|
||||||
const uint row_x = gl_GlobalInvocationID.y;
|
const uint row_x = gl_GlobalInvocationID.y;
|
||||||
const uint channel = gl_GlobalInvocationID.z;
|
const uint channel = gl_GlobalInvocationID.z;
|
||||||
|
const uint i3 = gl_WorkGroupID.x;
|
||||||
const uint channel_x = channel / p.channel_x_divisor;
|
const uint channel_x = channel / p.channel_x_divisor;
|
||||||
const uint channel_y = channel % p.ne12;
|
const uint channel_y = channel % p.ne12;
|
||||||
|
|
||||||
|
|
@ -41,7 +45,7 @@ void main() {
|
||||||
const uint nrows_dst = p.nrows_x;
|
const uint nrows_dst = p.nrows_x;
|
||||||
const uint row_dst = row_x;
|
const uint row_dst = row_x;
|
||||||
|
|
||||||
const uint idst = channel*nrows_dst + row_dst;
|
const uint idst = i3*p.nb23 + channel*nrows_dst + row_dst;
|
||||||
|
|
||||||
FLOAT_TYPE temp = 0.0f;
|
FLOAT_TYPE temp = 0.0f;
|
||||||
|
|
||||||
|
|
@ -58,8 +62,8 @@ void main() {
|
||||||
|
|
||||||
const uint row_y = col_x;
|
const uint row_y = col_x;
|
||||||
|
|
||||||
const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
|
const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
|
||||||
const uint iy = channel_y*p.channel_stride_y + row_y;
|
const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
|
||||||
|
|
||||||
const vec4 av4 = vec4(data_a_v4[ix / 4]);
|
const vec4 av4 = vec4(data_a_v4[ix / 4]);
|
||||||
const vec4 bv4 = vec4(data_b_v4[iy / 4]);
|
const vec4 bv4 = vec4(data_b_v4[iy / 4]);
|
||||||
|
|
@ -74,8 +78,8 @@ void main() {
|
||||||
|
|
||||||
const uint row_y = col_x;
|
const uint row_y = col_x;
|
||||||
|
|
||||||
const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
|
const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
|
||||||
const uint iy = channel_y*p.channel_stride_y + row_y;
|
const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
|
||||||
|
|
||||||
const vec4 av4 = vec4(data_a_v4[ix / 4]);
|
const vec4 av4 = vec4(data_a_v4[ix / 4]);
|
||||||
const vec4 bv4 = vec4(data_b_v4[iy / 4]);
|
const vec4 bv4 = vec4(data_b_v4[iy / 4]);
|
||||||
|
|
@ -91,8 +95,8 @@ void main() {
|
||||||
|
|
||||||
const uint row_y = col_x;
|
const uint row_y = col_x;
|
||||||
|
|
||||||
const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
|
const uint ix = i3*p.nb03 + channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
|
||||||
const uint iy = channel_y*p.channel_stride_y + row_y;
|
const uint iy = i3*p.nb13 + channel_y*p.channel_stride_y + row_y;
|
||||||
|
|
||||||
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
|
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -655,8 +655,16 @@ void process_shaders() {
|
||||||
|
|
||||||
string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||||
|
|
||||||
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});
|
string_to_spv("conv2d_f32_unroll", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}});
|
||||||
string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}});
|
string_to_spv("conv2d_f16_f32_unroll", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}});
|
||||||
|
|
||||||
|
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
|
||||||
|
string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
|
||||||
|
|
||||||
|
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||||
|
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}, {"COOPMAT2", "1"}}, true, false, true);
|
||||||
|
string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}, {"COOPMAT2", "1"}}, true, false, true);
|
||||||
|
#endif
|
||||||
|
|
||||||
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
|
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
|
||||||
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
|
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
|
||||||
|
|
|
||||||
|
|
@ -376,6 +376,7 @@ class MODEL_ARCH(IntEnum):
|
||||||
ERNIE4_5 = auto()
|
ERNIE4_5 = auto()
|
||||||
ERNIE4_5_MOE = auto()
|
ERNIE4_5_MOE = auto()
|
||||||
HUNYUAN_MOE = auto()
|
HUNYUAN_MOE = auto()
|
||||||
|
HUNYUAN_DENSE = auto()
|
||||||
SMOLLM3 = auto()
|
SMOLLM3 = auto()
|
||||||
LFM2 = auto()
|
LFM2 = auto()
|
||||||
DREAM = auto()
|
DREAM = auto()
|
||||||
|
|
@ -697,6 +698,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe",
|
MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe",
|
||||||
MODEL_ARCH.FALCON_H1: "falcon-h1",
|
MODEL_ARCH.FALCON_H1: "falcon-h1",
|
||||||
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
|
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
|
||||||
|
MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense",
|
||||||
MODEL_ARCH.SMOLLM3: "smollm3",
|
MODEL_ARCH.SMOLLM3: "smollm3",
|
||||||
MODEL_ARCH.LFM2: "lfm2",
|
MODEL_ARCH.LFM2: "lfm2",
|
||||||
MODEL_ARCH.DREAM: "dream",
|
MODEL_ARCH.DREAM: "dream",
|
||||||
|
|
@ -2471,6 +2473,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.HUNYUAN_DENSE: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_Q_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_K,
|
||||||
|
MODEL_TENSOR.ATTN_K_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_V,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
],
|
||||||
MODEL_ARCH.SMOLLM3: [
|
MODEL_ARCH.SMOLLM3: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
MODEL_TENSOR.OUTPUT_NORM,
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
|
|
||||||
|
|
@ -33,6 +33,7 @@ class TensorNameMap:
|
||||||
"language_model.model.embed_tokens", # llama4
|
"language_model.model.embed_tokens", # llama4
|
||||||
"encoder", # neobert
|
"encoder", # neobert
|
||||||
"model.transformer.wte", # llada
|
"model.transformer.wte", # llada
|
||||||
|
"embed_tokens", # qwen3-embedding
|
||||||
),
|
),
|
||||||
|
|
||||||
# Token type embeddings
|
# Token type embeddings
|
||||||
|
|
@ -143,6 +144,7 @@ class TensorNameMap:
|
||||||
"transformer_encoder.{bid}.attention_norm", # neobert
|
"transformer_encoder.{bid}.attention_norm", # neobert
|
||||||
"model.layers.{bid}.operator_norm", # lfm2
|
"model.layers.{bid}.operator_norm", # lfm2
|
||||||
"model.transformer.blocks.{bid}.attn_norm", # llada
|
"model.transformer.blocks.{bid}.attn_norm", # llada
|
||||||
|
"layers.{bid}.input_layernorm", # qwen3-embedding
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention norm 2
|
# Attention norm 2
|
||||||
|
|
@ -188,6 +190,7 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.attn.attention.q_proj", # exaone
|
"transformer.h.{bid}.attn.attention.q_proj", # exaone
|
||||||
"model.layers.{bid}.self_attn.q_proj", # llama4
|
"model.layers.{bid}.self_attn.q_proj", # llama4
|
||||||
"model.transformer.blocks.{bid}.q_proj", # llada
|
"model.transformer.blocks.{bid}.q_proj", # llada
|
||||||
|
"layers.{bid}.self_attn.q_proj", # qwen3-embedding
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention key
|
# Attention key
|
||||||
|
|
@ -205,6 +208,7 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.attn.attention.k_proj", # exaone
|
"transformer.h.{bid}.attn.attention.k_proj", # exaone
|
||||||
"model.layers.{bid}.self_attn.k_proj", # llama4
|
"model.layers.{bid}.self_attn.k_proj", # llama4
|
||||||
"model.transformer.blocks.{bid}.k_proj", # llada
|
"model.transformer.blocks.{bid}.k_proj", # llada
|
||||||
|
"layers.{bid}.self_attn.k_proj", # qwen3-embedding
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention value
|
# Attention value
|
||||||
|
|
@ -221,6 +225,7 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.attn.attention.v_proj", # exaone
|
"transformer.h.{bid}.attn.attention.v_proj", # exaone
|
||||||
"model.layers.{bid}.self_attn.v_proj", # llama4
|
"model.layers.{bid}.self_attn.v_proj", # llama4
|
||||||
"model.transformer.blocks.{bid}.v_proj", # llada
|
"model.transformer.blocks.{bid}.v_proj", # llada
|
||||||
|
"layers.{bid}.self_attn.v_proj", # qwen3-embedding
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention output
|
# Attention output
|
||||||
|
|
@ -254,6 +259,7 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.self_attn.o_proj", # llama4
|
"model.layers.{bid}.self_attn.o_proj", # llama4
|
||||||
"transformer_encoder.{bid}.wo", # neobert
|
"transformer_encoder.{bid}.wo", # neobert
|
||||||
"model.transformer.blocks.{bid}.attn_out", # llada
|
"model.transformer.blocks.{bid}.attn_out", # llada
|
||||||
|
"layers.{bid}.self_attn.o_proj", # qwen3-embedding
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention output norm
|
# Attention output norm
|
||||||
|
|
@ -300,6 +306,7 @@ class TensorNameMap:
|
||||||
"transformer_encoder.{bid}.ffn_norm", # neobert
|
"transformer_encoder.{bid}.ffn_norm", # neobert
|
||||||
"model.layers.layers.{bid}.pre_mlp_norm", # plamo2
|
"model.layers.layers.{bid}.pre_mlp_norm", # plamo2
|
||||||
"model.transformer.blocks.{bid}.ff_norm", # llada
|
"model.transformer.blocks.{bid}.ff_norm", # llada
|
||||||
|
"layers.{bid}.post_attention_layernorm", # qwen3-embedding
|
||||||
),
|
),
|
||||||
|
|
||||||
# Post feed-forward norm
|
# Post feed-forward norm
|
||||||
|
|
@ -373,7 +380,8 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.feed_forward.up_proj", # llama4 jamba granite-hybrid
|
"model.layers.{bid}.feed_forward.up_proj", # llama4 jamba granite-hybrid
|
||||||
"transformer_encoder.{bid}.ffn.w12", # neobert
|
"transformer_encoder.{bid}.ffn.w12", # neobert
|
||||||
"model.layers.{bid}.block_sparse_moe.up", # smallthinker
|
"model.layers.{bid}.block_sparse_moe.up", # smallthinker
|
||||||
"model.transformer.blocks.{bid}.up_proj", # llada
|
"model.transformer.blocks.{bid}.up_proj", # llada
|
||||||
|
"layers.{bid}.mlp.up_proj", # qwen3-embedding
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_UP_EXP: (
|
MODEL_TENSOR.FFN_UP_EXP: (
|
||||||
|
|
@ -416,6 +424,7 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid
|
"model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid
|
||||||
"model.layers.{bid}.block_sparse_moe.gate", # smallthinker
|
"model.layers.{bid}.block_sparse_moe.gate", # smallthinker
|
||||||
"model.transformer.blocks.{bid}.ff_proj", # llada
|
"model.transformer.blocks.{bid}.ff_proj", # llada
|
||||||
|
"layers.{bid}.mlp.gate_proj", # qwen3-embedding
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||||
|
|
@ -465,7 +474,8 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.feed_forward.down_proj", # llama4 jamba granite-hybrid
|
"model.layers.{bid}.feed_forward.down_proj", # llama4 jamba granite-hybrid
|
||||||
"transformer_encoder.{bid}.ffn.w3", # neobert
|
"transformer_encoder.{bid}.ffn.w3", # neobert
|
||||||
"model.layers.{bid}.block_sparse_moe.down", # smallthinker
|
"model.layers.{bid}.block_sparse_moe.down", # smallthinker
|
||||||
"model.transformer.blocks.{bid}.ff_out", # llada
|
"model.transformer.blocks.{bid}.ff_out", # llada
|
||||||
|
"layers.{bid}.mlp.down_proj", # qwen3-embedding
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||||
|
|
@ -497,6 +507,7 @@ class TensorNameMap:
|
||||||
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
|
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
|
||||||
"transformer.layers.{bid}.attn.q_norm", # openelm
|
"transformer.layers.{bid}.attn.q_norm", # openelm
|
||||||
"model.layers.layers.{bid}.mixer.q", # plamo2
|
"model.layers.layers.{bid}.mixer.q", # plamo2
|
||||||
|
"layers.{bid}.self_attn.q_norm", # qwen3-embedding
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.ATTN_K_NORM: (
|
MODEL_TENSOR.ATTN_K_NORM: (
|
||||||
|
|
@ -508,6 +519,7 @@ class TensorNameMap:
|
||||||
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
|
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
|
||||||
"transformer.layers.{bid}.attn.k_norm", # openelm
|
"transformer.layers.{bid}.attn.k_norm", # openelm
|
||||||
"model.layers.layers.{bid}.mixer.k", # plamo2
|
"model.layers.layers.{bid}.mixer.k", # plamo2
|
||||||
|
"layers.{bid}.self_attn.k_norm", # qwen3-embedding
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.ROPE_FREQS: (
|
MODEL_TENSOR.ROPE_FREQS: (
|
||||||
|
|
|
||||||
|
|
@ -312,7 +312,11 @@ class SpecialVocab:
|
||||||
with open(config_file, encoding = 'utf-8') as f:
|
with open(config_file, encoding = 'utf-8') as f:
|
||||||
config = json.load(f)
|
config = json.load(f)
|
||||||
for typ in self.special_token_types:
|
for typ in self.special_token_types:
|
||||||
self._set_special_token(typ, config.get(f'{typ}_token_id'))
|
token_id = config.get(f'{typ}_token_id')
|
||||||
|
# If not found at root, check in text_config (for multimodal models like Kimi-VL)
|
||||||
|
if token_id is None and 'text_config' in config:
|
||||||
|
token_id = config['text_config'].get(f'{typ}_token_id')
|
||||||
|
self._set_special_token(typ, token_id)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1 @@
|
||||||
-r ./requirements-convert_legacy_llama.txt
|
-r ./requirements-convert_legacy_llama.txt
|
||||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
|
||||||
torch~=2.2.1; platform_machine != "s390x"
|
|
||||||
|
|
||||||
# torch s390x packages can only be found from nightly builds
|
|
||||||
--extra-index-url https://download.pytorch.org/whl/nightly
|
|
||||||
torch>=0.0.0.dev0; platform_machine == "s390x"
|
|
||||||
|
|
|
||||||
|
|
@ -1,19 +1,41 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
if [ $# -lt 2 ]; then
|
if [ $# -lt 2 ]; then
|
||||||
echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [additional llama-bench arguments]"
|
echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [tool] [additional arguments]"
|
||||||
|
echo " tool: 'llama-bench' (default) or 'test-backend-ops'"
|
||||||
|
echo " additional arguments: passed to the selected tool"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
set -x
|
set -x
|
||||||
|
|
||||||
|
# Parse arguments
|
||||||
|
commit1=$1
|
||||||
|
commit2=$2
|
||||||
|
tool=${3:-llama-bench}
|
||||||
|
additional_args="${@:4}"
|
||||||
|
|
||||||
|
# Validate tool argument
|
||||||
|
if [ "$tool" != "llama-bench" ] && [ "$tool" != "test-backend-ops" ]; then
|
||||||
|
echo "Error: tool must be 'llama-bench' or 'test-backend-ops'"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
# verify at the start that the compare script has all the necessary dependencies installed
|
# verify at the start that the compare script has all the necessary dependencies installed
|
||||||
./scripts/compare-llama-bench.py --check
|
./scripts/compare-llama-bench.py --check
|
||||||
|
|
||||||
bench_args="${@:3}"
|
if [ "$tool" = "llama-bench" ]; then
|
||||||
|
db_file="llama-bench.sqlite"
|
||||||
|
target="llama-bench"
|
||||||
|
run_args="-o sql -oe md $additional_args"
|
||||||
|
else # test-backend-ops
|
||||||
|
db_file="test-backend-ops.sqlite"
|
||||||
|
target="test-backend-ops"
|
||||||
|
run_args="perf --output sql $additional_args"
|
||||||
|
fi
|
||||||
|
|
||||||
rm -f llama-bench.sqlite > /dev/null
|
rm -f "$db_file" > /dev/null
|
||||||
|
|
||||||
# to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...)
|
# to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...)
|
||||||
if [ -n "$GGML_CUDA" ]; then
|
if [ -n "$GGML_CUDA" ]; then
|
||||||
|
|
@ -25,14 +47,14 @@ dir="build-bench"
|
||||||
function run {
|
function run {
|
||||||
rm -fr ${dir} > /dev/null
|
rm -fr ${dir} > /dev/null
|
||||||
cmake -B ${dir} -S . ${CMAKE_OPTS} > /dev/null
|
cmake -B ${dir} -S . ${CMAKE_OPTS} > /dev/null
|
||||||
cmake --build ${dir} -t llama-bench > /dev/null
|
cmake --build ${dir} -t $target -j $(nproc) > /dev/null
|
||||||
${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
|
${dir}/bin/$target $run_args | sqlite3 "$db_file"
|
||||||
}
|
}
|
||||||
|
|
||||||
git checkout $1 > /dev/null
|
git checkout $commit1 > /dev/null
|
||||||
run
|
run
|
||||||
|
|
||||||
git checkout $2 > /dev/null
|
git checkout $commit2 > /dev/null
|
||||||
run
|
run
|
||||||
|
|
||||||
./scripts/compare-llama-bench.py -b $1 -c $2
|
./scripts/compare-llama-bench.py -b $commit1 -c $commit2 --tool $tool -i "$db_file"
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,16 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import logging
|
|
||||||
import argparse
|
import argparse
|
||||||
import heapq
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
from glob import glob
|
|
||||||
import sqlite3
|
|
||||||
import json
|
|
||||||
import csv
|
import csv
|
||||||
from typing import Optional, Union
|
import heapq
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
from collections.abc import Iterator, Sequence
|
from collections.abc import Iterator, Sequence
|
||||||
|
from glob import glob
|
||||||
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import git
|
import git
|
||||||
|
|
@ -23,7 +23,7 @@ except ImportError as e:
|
||||||
logger = logging.getLogger("compare-llama-bench")
|
logger = logging.getLogger("compare-llama-bench")
|
||||||
|
|
||||||
# All llama-bench SQL fields
|
# All llama-bench SQL fields
|
||||||
DB_FIELDS = [
|
LLAMA_BENCH_DB_FIELDS = [
|
||||||
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
|
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
|
||||||
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
|
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
|
||||||
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
||||||
|
|
@ -33,7 +33,7 @@ DB_FIELDS = [
|
||||||
"test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
|
"test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
|
||||||
]
|
]
|
||||||
|
|
||||||
DB_TYPES = [
|
LLAMA_BENCH_DB_TYPES = [
|
||||||
"TEXT", "INTEGER", "TEXT", "TEXT", "TEXT", "TEXT",
|
"TEXT", "INTEGER", "TEXT", "TEXT", "TEXT", "TEXT",
|
||||||
"TEXT", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
|
"TEXT", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
|
||||||
"TEXT", "INTEGER", "INTEGER", "TEXT", "TEXT", "INTEGER",
|
"TEXT", "INTEGER", "INTEGER", "TEXT", "TEXT", "INTEGER",
|
||||||
|
|
@ -42,20 +42,41 @@ DB_TYPES = [
|
||||||
"INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
|
"INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
|
||||||
"TEXT", "INTEGER", "INTEGER", "REAL", "REAL",
|
"TEXT", "INTEGER", "INTEGER", "REAL", "REAL",
|
||||||
]
|
]
|
||||||
assert len(DB_FIELDS) == len(DB_TYPES)
|
|
||||||
|
|
||||||
# Properties by which to differentiate results per commit:
|
# All test-backend-ops SQL fields
|
||||||
KEY_PROPERTIES = [
|
TEST_BACKEND_OPS_DB_FIELDS = [
|
||||||
|
"test_time", "build_commit", "backend_name", "op_name", "op_params", "test_mode",
|
||||||
|
"supported", "passed", "error_message", "time_us", "flops", "bandwidth_gb_s",
|
||||||
|
"memory_kb", "n_runs"
|
||||||
|
]
|
||||||
|
|
||||||
|
TEST_BACKEND_OPS_DB_TYPES = [
|
||||||
|
"TEXT", "TEXT", "TEXT", "TEXT", "TEXT", "TEXT",
|
||||||
|
"INTEGER", "INTEGER", "TEXT", "REAL", "REAL", "REAL",
|
||||||
|
"INTEGER", "INTEGER"
|
||||||
|
]
|
||||||
|
|
||||||
|
assert len(LLAMA_BENCH_DB_FIELDS) == len(LLAMA_BENCH_DB_TYPES)
|
||||||
|
assert len(TEST_BACKEND_OPS_DB_FIELDS) == len(TEST_BACKEND_OPS_DB_TYPES)
|
||||||
|
|
||||||
|
# Properties by which to differentiate results per commit for llama-bench:
|
||||||
|
LLAMA_BENCH_KEY_PROPERTIES = [
|
||||||
"cpu_info", "gpu_info", "backends", "n_gpu_layers", "tensor_buft_overrides", "model_filename", "model_type",
|
"cpu_info", "gpu_info", "backends", "n_gpu_layers", "tensor_buft_overrides", "model_filename", "model_type",
|
||||||
"n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v",
|
"n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v",
|
||||||
"use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth"
|
"use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth"
|
||||||
]
|
]
|
||||||
|
|
||||||
# Properties that are boolean and are converted to Yes/No for the table:
|
# Properties by which to differentiate results per commit for test-backend-ops:
|
||||||
BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
|
TEST_BACKEND_OPS_KEY_PROPERTIES = [
|
||||||
|
"backend_name", "op_name", "op_params", "test_mode"
|
||||||
|
]
|
||||||
|
|
||||||
# Header names for the table:
|
# Properties that are boolean and are converted to Yes/No for the table:
|
||||||
PRETTY_NAMES = {
|
LLAMA_BENCH_BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
|
||||||
|
TEST_BACKEND_OPS_BOOL_PROPERTIES = ["supported", "passed"]
|
||||||
|
|
||||||
|
# Header names for the table (llama-bench):
|
||||||
|
LLAMA_BENCH_PRETTY_NAMES = {
|
||||||
"cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
|
"cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
|
||||||
"tensor_buft_overrides": "Tensor overrides", "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
|
"tensor_buft_overrides": "Tensor overrides", "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
|
||||||
"model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size", "embeddings": "Embeddings",
|
"model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size", "embeddings": "Embeddings",
|
||||||
|
|
@ -64,21 +85,42 @@ PRETTY_NAMES = {
|
||||||
"flash_attn": "FlashAttention",
|
"flash_attn": "FlashAttention",
|
||||||
}
|
}
|
||||||
|
|
||||||
DEFAULT_SHOW = ["model_type"] # Always show these properties by default.
|
# Header names for the table (test-backend-ops):
|
||||||
DEFAULT_HIDE = ["model_filename"] # Always hide these properties by default.
|
TEST_BACKEND_OPS_PRETTY_NAMES = {
|
||||||
|
"backend_name": "Backend", "op_name": "GGML op", "op_params": "Op parameters", "test_mode": "Mode",
|
||||||
|
"supported": "Supported", "passed": "Passed", "error_message": "Error",
|
||||||
|
"flops": "FLOPS", "bandwidth_gb_s": "Bandwidth (GB/s)", "memory_kb": "Memory (KB)", "n_runs": "Runs"
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFAULT_SHOW_LLAMA_BENCH = ["model_type"] # Always show these properties by default.
|
||||||
|
DEFAULT_HIDE_LLAMA_BENCH = ["model_filename"] # Always hide these properties by default.
|
||||||
|
|
||||||
|
DEFAULT_SHOW_TEST_BACKEND_OPS = ["backend_name", "op_name"] # Always show these properties by default.
|
||||||
|
DEFAULT_HIDE_TEST_BACKEND_OPS = ["error_message"] # Always hide these properties by default.
|
||||||
|
|
||||||
GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon "] # Strip prefixes for smaller tables.
|
GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon "] # Strip prefixes for smaller tables.
|
||||||
MODEL_SUFFIX_REPLACE = {" - Small": "_S", " - Medium": "_M", " - Large": "_L"}
|
MODEL_SUFFIX_REPLACE = {" - Small": "_S", " - Medium": "_M", " - Large": "_L"}
|
||||||
|
|
||||||
DESCRIPTION = """Creates tables from llama-bench data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux):
|
DESCRIPTION = """Creates tables from llama-bench or test-backend-ops data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux):
|
||||||
|
|
||||||
|
For llama-bench:
|
||||||
$ git checkout master
|
$ git checkout master
|
||||||
$ make clean && make llama-bench
|
$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t llama-bench -j $(nproc)
|
||||||
$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
|
$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
|
||||||
$ git checkout some_branch
|
$ git checkout some_branch
|
||||||
$ make clean && make llama-bench
|
$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t llama-bench -j $(nproc)
|
||||||
$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
|
$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
|
||||||
$ ./scripts/compare-llama-bench.py
|
$ ./scripts/compare-llama-bench.py
|
||||||
|
|
||||||
|
For test-backend-ops:
|
||||||
|
$ git checkout master
|
||||||
|
$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t test-backend-ops -j $(nproc)
|
||||||
|
$ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite
|
||||||
|
$ git checkout some_branch
|
||||||
|
$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t test-backend-ops -j $(nproc)
|
||||||
|
$ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite
|
||||||
|
$ ./scripts/compare-llama-bench.py --tool test-backend-ops -i test-backend-ops.sqlite
|
||||||
|
|
||||||
Performance numbers from multiple runs per commit are averaged WITHOUT being weighted by the --repetitions parameter of llama-bench.
|
Performance numbers from multiple runs per commit are averaged WITHOUT being weighted by the --repetitions parameter of llama-bench.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -96,6 +138,13 @@ help_c = (
|
||||||
"Defaults to the non-master commit for which llama-bench was run most recently."
|
"Defaults to the non-master commit for which llama-bench was run most recently."
|
||||||
)
|
)
|
||||||
parser.add_argument("-c", "--compare", help=help_c)
|
parser.add_argument("-c", "--compare", help=help_c)
|
||||||
|
help_t = (
|
||||||
|
"The tool whose data is being compared. "
|
||||||
|
"Either 'llama-bench' or 'test-backend-ops'. "
|
||||||
|
"This determines the database schema and comparison logic used. "
|
||||||
|
"If left unspecified, try to determine from the input file."
|
||||||
|
)
|
||||||
|
parser.add_argument("-t", "--tool", help=help_t, default=None, choices=[None, "llama-bench", "test-backend-ops"])
|
||||||
help_i = (
|
help_i = (
|
||||||
"JSON/JSONL/SQLite/CSV files for comparing commits. "
|
"JSON/JSONL/SQLite/CSV files for comparing commits. "
|
||||||
"Specify multiple times to use multiple input files (JSON/CSV only). "
|
"Specify multiple times to use multiple input files (JSON/CSV only). "
|
||||||
|
|
@ -114,7 +163,8 @@ parser.add_argument("-o", "--output", help=help_o, default="pipe")
|
||||||
help_s = (
|
help_s = (
|
||||||
"Columns to add to the table. "
|
"Columns to add to the table. "
|
||||||
"Accepts a comma-separated list of values. "
|
"Accepts a comma-separated list of values. "
|
||||||
f"Legal values: {', '.join(KEY_PROPERTIES[:-3])}. "
|
f"Legal values for test-backend-ops: {', '.join(TEST_BACKEND_OPS_KEY_PROPERTIES)}. "
|
||||||
|
f"Legal values for llama-bench: {', '.join(LLAMA_BENCH_KEY_PROPERTIES[:-3])}. "
|
||||||
"Defaults to model name (model_type) and CPU and/or GPU name (cpu_info, gpu_info) "
|
"Defaults to model name (model_type) and CPU and/or GPU name (cpu_info, gpu_info) "
|
||||||
"plus any column where not all data points are the same. "
|
"plus any column where not all data points are the same. "
|
||||||
"If the columns are manually specified, then the results for each unique combination of the "
|
"If the columns are manually specified, then the results for each unique combination of the "
|
||||||
|
|
@ -142,8 +192,14 @@ if unknown_args:
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
input_file = known_args.input
|
input_file = known_args.input
|
||||||
if not input_file and os.path.exists("./llama-bench.sqlite"):
|
tool = known_args.tool
|
||||||
input_file = ["llama-bench.sqlite"]
|
|
||||||
|
if not input_file:
|
||||||
|
if tool == "llama-bench" and os.path.exists("./llama-bench.sqlite"):
|
||||||
|
input_file = ["llama-bench.sqlite"]
|
||||||
|
elif tool == "test-backend-ops" and os.path.exists("./test-backend-ops.sqlite"):
|
||||||
|
input_file = ["test-backend-ops.sqlite"]
|
||||||
|
|
||||||
if not input_file:
|
if not input_file:
|
||||||
sqlite_files = glob("*.sqlite")
|
sqlite_files = glob("*.sqlite")
|
||||||
if len(sqlite_files) == 1:
|
if len(sqlite_files) == 1:
|
||||||
|
|
@ -161,14 +217,23 @@ class LlamaBenchData:
|
||||||
build_len_max: int
|
build_len_max: int
|
||||||
build_len: int = 8
|
build_len: int = 8
|
||||||
builds: list[str] = []
|
builds: list[str] = []
|
||||||
check_keys = set(KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"])
|
tool: str = "llama-bench" # Tool type: "llama-bench" or "test-backend-ops"
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, tool: str = "llama-bench"):
|
||||||
|
self.tool = tool
|
||||||
try:
|
try:
|
||||||
self.repo = git.Repo(".", search_parent_directories=True)
|
self.repo = git.Repo(".", search_parent_directories=True)
|
||||||
except git.InvalidGitRepositoryError:
|
except git.InvalidGitRepositoryError:
|
||||||
self.repo = None
|
self.repo = None
|
||||||
|
|
||||||
|
# Set schema-specific properties based on tool
|
||||||
|
if self.tool == "llama-bench":
|
||||||
|
self.check_keys = set(LLAMA_BENCH_KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"])
|
||||||
|
elif self.tool == "test-backend-ops":
|
||||||
|
self.check_keys = set(TEST_BACKEND_OPS_KEY_PROPERTIES + ["build_commit", "test_time"])
|
||||||
|
else:
|
||||||
|
assert False
|
||||||
|
|
||||||
def _builds_init(self):
|
def _builds_init(self):
|
||||||
self.build_len = self.build_len_min
|
self.build_len = self.build_len_min
|
||||||
|
|
||||||
|
|
@ -252,52 +317,121 @@ class LlamaBenchData:
|
||||||
class LlamaBenchDataSQLite3(LlamaBenchData):
|
class LlamaBenchDataSQLite3(LlamaBenchData):
|
||||||
connection: sqlite3.Connection
|
connection: sqlite3.Connection
|
||||||
cursor: sqlite3.Cursor
|
cursor: sqlite3.Cursor
|
||||||
|
table_name: str
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, tool: str = "llama-bench"):
|
||||||
super().__init__()
|
super().__init__(tool)
|
||||||
self.connection = sqlite3.connect(":memory:")
|
self.connection = sqlite3.connect(":memory:")
|
||||||
self.cursor = self.connection.cursor()
|
self.cursor = self.connection.cursor()
|
||||||
self.cursor.execute(f"CREATE TABLE test({', '.join(' '.join(x) for x in zip(DB_FIELDS, DB_TYPES))});")
|
|
||||||
|
# Set table name and schema based on tool
|
||||||
|
if self.tool == "llama-bench":
|
||||||
|
self.table_name = "llama_bench"
|
||||||
|
db_fields = LLAMA_BENCH_DB_FIELDS
|
||||||
|
db_types = LLAMA_BENCH_DB_TYPES
|
||||||
|
elif self.tool == "test-backend-ops":
|
||||||
|
self.table_name = "test_backend_ops"
|
||||||
|
db_fields = TEST_BACKEND_OPS_DB_FIELDS
|
||||||
|
db_types = TEST_BACKEND_OPS_DB_TYPES
|
||||||
|
else:
|
||||||
|
assert False
|
||||||
|
|
||||||
|
self.cursor.execute(f"CREATE TABLE {self.table_name}({', '.join(' '.join(x) for x in zip(db_fields, db_types))});")
|
||||||
|
|
||||||
def _builds_init(self):
|
def _builds_init(self):
|
||||||
if self.connection:
|
if self.connection:
|
||||||
self.build_len_min = self.cursor.execute("SELECT MIN(LENGTH(build_commit)) from test;").fetchone()[0]
|
self.build_len_min = self.cursor.execute(f"SELECT MIN(LENGTH(build_commit)) from {self.table_name};").fetchone()[0]
|
||||||
self.build_len_max = self.cursor.execute("SELECT MAX(LENGTH(build_commit)) from test;").fetchone()[0]
|
self.build_len_max = self.cursor.execute(f"SELECT MAX(LENGTH(build_commit)) from {self.table_name};").fetchone()[0]
|
||||||
|
|
||||||
if self.build_len_min != self.build_len_max:
|
if self.build_len_min != self.build_len_max:
|
||||||
logger.warning("Data contains commit hashes of differing lengths. It's possible that the wrong commits will be compared. "
|
logger.warning("Data contains commit hashes of differing lengths. It's possible that the wrong commits will be compared. "
|
||||||
"Try purging the the database of old commits.")
|
"Try purging the the database of old commits.")
|
||||||
self.cursor.execute(f"UPDATE test SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});")
|
self.cursor.execute(f"UPDATE {self.table_name} SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});")
|
||||||
|
|
||||||
builds = self.cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
|
builds = self.cursor.execute(f"SELECT DISTINCT build_commit FROM {self.table_name};").fetchall()
|
||||||
self.builds = list(map(lambda b: b[0], builds)) # list[tuple[str]] -> list[str]
|
self.builds = list(map(lambda b: b[0], builds)) # list[tuple[str]] -> list[str]
|
||||||
super()._builds_init()
|
super()._builds_init()
|
||||||
|
|
||||||
def builds_timestamp(self, reverse: bool = False) -> Union[Iterator[tuple], Sequence[tuple]]:
|
def builds_timestamp(self, reverse: bool = False) -> Union[Iterator[tuple], Sequence[tuple]]:
|
||||||
data = self.cursor.execute(
|
data = self.cursor.execute(
|
||||||
"SELECT build_commit, test_time FROM test ORDER BY test_time;").fetchall()
|
f"SELECT build_commit, test_time FROM {self.table_name} ORDER BY test_time;").fetchall()
|
||||||
return reversed(data) if reverse else data
|
return reversed(data) if reverse else data
|
||||||
|
|
||||||
def get_rows(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
|
def get_rows(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
|
||||||
|
if self.tool == "llama-bench":
|
||||||
|
return self._get_rows_llama_bench(properties, hexsha8_baseline, hexsha8_compare)
|
||||||
|
elif self.tool == "test-backend-ops":
|
||||||
|
return self._get_rows_test_backend_ops(properties, hexsha8_baseline, hexsha8_compare)
|
||||||
|
else:
|
||||||
|
assert False
|
||||||
|
|
||||||
|
def _get_rows_llama_bench(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
|
||||||
select_string = ", ".join(
|
select_string = ", ".join(
|
||||||
[f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "tb.n_depth", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"])
|
[f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "tb.n_depth", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"])
|
||||||
equal_string = " AND ".join(
|
equal_string = " AND ".join(
|
||||||
[f"tb.{p} = tc.{p}" for p in KEY_PROPERTIES] + [
|
[f"tb.{p} = tc.{p}" for p in LLAMA_BENCH_KEY_PROPERTIES] + [
|
||||||
f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"]
|
f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"]
|
||||||
)
|
)
|
||||||
group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt", "tb.n_depth"])
|
group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt", "tb.n_depth"])
|
||||||
query = (f"SELECT {select_string} FROM test tb JOIN test tc ON {equal_string} "
|
query = (f"SELECT {select_string} FROM {self.table_name} tb JOIN {self.table_name} tc ON {equal_string} "
|
||||||
|
f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
|
||||||
|
return self.cursor.execute(query).fetchall()
|
||||||
|
|
||||||
|
def _get_rows_test_backend_ops(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
|
||||||
|
# For test-backend-ops, we compare FLOPS and bandwidth metrics (prioritizing FLOPS over bandwidth)
|
||||||
|
select_string = ", ".join(
|
||||||
|
[f"tb.{p}" for p in properties] + [
|
||||||
|
"AVG(tb.flops)", "AVG(tc.flops)",
|
||||||
|
"AVG(tb.bandwidth_gb_s)", "AVG(tc.bandwidth_gb_s)"
|
||||||
|
])
|
||||||
|
equal_string = " AND ".join(
|
||||||
|
[f"tb.{p} = tc.{p}" for p in TEST_BACKEND_OPS_KEY_PROPERTIES] + [
|
||||||
|
f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'",
|
||||||
|
"tb.supported = 1", "tc.supported = 1", "tb.passed = 1", "tc.passed = 1"] # Only compare successful tests
|
||||||
|
)
|
||||||
|
group_order_string = ", ".join([f"tb.{p}" for p in properties])
|
||||||
|
query = (f"SELECT {select_string} FROM {self.table_name} tb JOIN {self.table_name} tc ON {equal_string} "
|
||||||
f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
|
f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
|
||||||
return self.cursor.execute(query).fetchall()
|
return self.cursor.execute(query).fetchall()
|
||||||
|
|
||||||
|
|
||||||
class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
|
class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
|
||||||
def __init__(self, data_file: str):
|
def __init__(self, data_file: str, tool: Any):
|
||||||
super().__init__()
|
super().__init__(tool)
|
||||||
|
|
||||||
self.connection.close()
|
self.connection.close()
|
||||||
self.connection = sqlite3.connect(data_file)
|
self.connection = sqlite3.connect(data_file)
|
||||||
self.cursor = self.connection.cursor()
|
self.cursor = self.connection.cursor()
|
||||||
|
|
||||||
|
# Check which table exists in the database
|
||||||
|
tables = self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
|
||||||
|
table_names = [table[0] for table in tables]
|
||||||
|
|
||||||
|
# Tool selection logic
|
||||||
|
if tool is None:
|
||||||
|
if "llama_bench" in table_names:
|
||||||
|
self.table_name = "llama_bench"
|
||||||
|
self.tool = "llama-bench"
|
||||||
|
elif "test_backend_ops" in table_names:
|
||||||
|
self.table_name = "test_backend_ops"
|
||||||
|
self.tool = "test-backend-ops"
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"No suitable table found in database. Available tables: {table_names}")
|
||||||
|
elif tool == "llama-bench":
|
||||||
|
if "llama_bench" in table_names:
|
||||||
|
self.table_name = "llama_bench"
|
||||||
|
self.tool = "llama-bench"
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"Table 'test' not found for tool 'llama-bench'. Available tables: {table_names}")
|
||||||
|
elif tool == "test-backend-ops":
|
||||||
|
if "test_backend_ops" in table_names:
|
||||||
|
self.table_name = "test_backend_ops"
|
||||||
|
self.tool = "test-backend-ops"
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"Table 'test_backend_ops' not found for tool 'test-backend-ops'. Available tables: {table_names}")
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"Unknown tool: {tool}")
|
||||||
|
|
||||||
self._builds_init()
|
self._builds_init()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
@ -317,20 +451,23 @@ class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
|
||||||
|
|
||||||
|
|
||||||
class LlamaBenchDataJSONL(LlamaBenchDataSQLite3):
|
class LlamaBenchDataJSONL(LlamaBenchDataSQLite3):
|
||||||
def __init__(self, data_file: str):
|
def __init__(self, data_file: str, tool: str = "llama-bench"):
|
||||||
super().__init__()
|
super().__init__(tool)
|
||||||
|
|
||||||
|
# Get the appropriate field list based on tool
|
||||||
|
db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS
|
||||||
|
|
||||||
with open(data_file, "r", encoding="utf-8") as fp:
|
with open(data_file, "r", encoding="utf-8") as fp:
|
||||||
for i, line in enumerate(fp):
|
for i, line in enumerate(fp):
|
||||||
parsed = json.loads(line)
|
parsed = json.loads(line)
|
||||||
|
|
||||||
for k in parsed.keys() - set(DB_FIELDS):
|
for k in parsed.keys() - set(db_fields):
|
||||||
del parsed[k]
|
del parsed[k]
|
||||||
|
|
||||||
if (missing_keys := self._check_keys(parsed.keys())):
|
if (missing_keys := self._check_keys(parsed.keys())):
|
||||||
raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")
|
raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")
|
||||||
|
|
||||||
self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
|
self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
|
||||||
|
|
||||||
self._builds_init()
|
self._builds_init()
|
||||||
|
|
||||||
|
|
@ -349,21 +486,24 @@ class LlamaBenchDataJSONL(LlamaBenchDataSQLite3):
|
||||||
|
|
||||||
|
|
||||||
class LlamaBenchDataJSON(LlamaBenchDataSQLite3):
|
class LlamaBenchDataJSON(LlamaBenchDataSQLite3):
|
||||||
def __init__(self, data_files: list[str]):
|
def __init__(self, data_files: list[str], tool: str = "llama-bench"):
|
||||||
super().__init__()
|
super().__init__(tool)
|
||||||
|
|
||||||
|
# Get the appropriate field list based on tool
|
||||||
|
db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS
|
||||||
|
|
||||||
for data_file in data_files:
|
for data_file in data_files:
|
||||||
with open(data_file, "r", encoding="utf-8") as fp:
|
with open(data_file, "r", encoding="utf-8") as fp:
|
||||||
parsed = json.load(fp)
|
parsed = json.load(fp)
|
||||||
|
|
||||||
for i, entry in enumerate(parsed):
|
for i, entry in enumerate(parsed):
|
||||||
for k in entry.keys() - set(DB_FIELDS):
|
for k in entry.keys() - set(db_fields):
|
||||||
del entry[k]
|
del entry[k]
|
||||||
|
|
||||||
if (missing_keys := self._check_keys(entry.keys())):
|
if (missing_keys := self._check_keys(entry.keys())):
|
||||||
raise RuntimeError(f"Missing required data key(s) at entry {i + 1}: {', '.join(missing_keys)}")
|
raise RuntimeError(f"Missing required data key(s) at entry {i + 1}: {', '.join(missing_keys)}")
|
||||||
|
|
||||||
self.cursor.execute(f"INSERT INTO test({', '.join(entry.keys())}) VALUES({', '.join('?' * len(entry))});", tuple(entry.values()))
|
self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(entry.keys())}) VALUES({', '.join('?' * len(entry))});", tuple(entry.values()))
|
||||||
|
|
||||||
self._builds_init()
|
self._builds_init()
|
||||||
|
|
||||||
|
|
@ -384,21 +524,24 @@ class LlamaBenchDataJSON(LlamaBenchDataSQLite3):
|
||||||
|
|
||||||
|
|
||||||
class LlamaBenchDataCSV(LlamaBenchDataSQLite3):
|
class LlamaBenchDataCSV(LlamaBenchDataSQLite3):
|
||||||
def __init__(self, data_files: list[str]):
|
def __init__(self, data_files: list[str], tool: str = "llama-bench"):
|
||||||
super().__init__()
|
super().__init__(tool)
|
||||||
|
|
||||||
|
# Get the appropriate field list based on tool
|
||||||
|
db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS
|
||||||
|
|
||||||
for data_file in data_files:
|
for data_file in data_files:
|
||||||
with open(data_file, "r", encoding="utf-8") as fp:
|
with open(data_file, "r", encoding="utf-8") as fp:
|
||||||
for i, parsed in enumerate(csv.DictReader(fp)):
|
for i, parsed in enumerate(csv.DictReader(fp)):
|
||||||
keys = set(parsed.keys())
|
keys = set(parsed.keys())
|
||||||
|
|
||||||
for k in keys - set(DB_FIELDS):
|
for k in keys - set(db_fields):
|
||||||
del parsed[k]
|
del parsed[k]
|
||||||
|
|
||||||
if (missing_keys := self._check_keys(keys)):
|
if (missing_keys := self._check_keys(keys)):
|
||||||
raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")
|
raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")
|
||||||
|
|
||||||
self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
|
self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
|
||||||
|
|
||||||
self._builds_init()
|
self._builds_init()
|
||||||
|
|
||||||
|
|
@ -419,21 +562,90 @@ class LlamaBenchDataCSV(LlamaBenchDataSQLite3):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def format_flops(flops_value: float) -> str:
|
||||||
|
"""Format FLOPS values with appropriate units for better readability."""
|
||||||
|
if flops_value == 0:
|
||||||
|
return "0.00"
|
||||||
|
|
||||||
|
# Define unit thresholds and names
|
||||||
|
units = [
|
||||||
|
(1e12, "T"), # TeraFLOPS
|
||||||
|
(1e9, "G"), # GigaFLOPS
|
||||||
|
(1e6, "M"), # MegaFLOPS
|
||||||
|
(1e3, "k"), # kiloFLOPS
|
||||||
|
(1, "") # FLOPS
|
||||||
|
]
|
||||||
|
|
||||||
|
for threshold, unit in units:
|
||||||
|
if abs(flops_value) >= threshold:
|
||||||
|
formatted_value = flops_value / threshold
|
||||||
|
if formatted_value >= 100:
|
||||||
|
return f"{formatted_value:.1f}{unit}"
|
||||||
|
else:
|
||||||
|
return f"{formatted_value:.2f}{unit}"
|
||||||
|
|
||||||
|
# Fallback for very small values
|
||||||
|
return f"{flops_value:.2f}"
|
||||||
|
|
||||||
|
|
||||||
|
def format_flops_for_table(flops_value: float, target_unit: str) -> str:
|
||||||
|
"""Format FLOPS values for table display without unit suffix (since unit is in header)."""
|
||||||
|
if flops_value == 0:
|
||||||
|
return "0.00"
|
||||||
|
|
||||||
|
# Define unit thresholds based on target unit
|
||||||
|
unit_divisors = {
|
||||||
|
"TFLOPS": 1e12,
|
||||||
|
"GFLOPS": 1e9,
|
||||||
|
"MFLOPS": 1e6,
|
||||||
|
"kFLOPS": 1e3,
|
||||||
|
"FLOPS": 1
|
||||||
|
}
|
||||||
|
|
||||||
|
divisor = unit_divisors.get(target_unit, 1)
|
||||||
|
formatted_value = flops_value / divisor
|
||||||
|
|
||||||
|
if formatted_value >= 100:
|
||||||
|
return f"{formatted_value:.1f}"
|
||||||
|
else:
|
||||||
|
return f"{formatted_value:.2f}"
|
||||||
|
|
||||||
|
|
||||||
|
def get_flops_unit_name(flops_values: list) -> str:
|
||||||
|
"""Determine the best FLOPS unit name based on the magnitude of values."""
|
||||||
|
if not flops_values or all(v == 0 for v in flops_values):
|
||||||
|
return "FLOPS"
|
||||||
|
|
||||||
|
# Find the maximum absolute value to determine appropriate unit
|
||||||
|
max_flops = max(abs(v) for v in flops_values if v != 0)
|
||||||
|
|
||||||
|
if max_flops >= 1e12:
|
||||||
|
return "TFLOPS"
|
||||||
|
elif max_flops >= 1e9:
|
||||||
|
return "GFLOPS"
|
||||||
|
elif max_flops >= 1e6:
|
||||||
|
return "MFLOPS"
|
||||||
|
elif max_flops >= 1e3:
|
||||||
|
return "kFLOPS"
|
||||||
|
else:
|
||||||
|
return "FLOPS"
|
||||||
|
|
||||||
|
|
||||||
bench_data = None
|
bench_data = None
|
||||||
if len(input_file) == 1:
|
if len(input_file) == 1:
|
||||||
if LlamaBenchDataSQLite3File.valid_format(input_file[0]):
|
if LlamaBenchDataSQLite3File.valid_format(input_file[0]):
|
||||||
bench_data = LlamaBenchDataSQLite3File(input_file[0])
|
bench_data = LlamaBenchDataSQLite3File(input_file[0], tool)
|
||||||
elif LlamaBenchDataJSON.valid_format(input_file):
|
elif LlamaBenchDataJSON.valid_format(input_file):
|
||||||
bench_data = LlamaBenchDataJSON(input_file)
|
bench_data = LlamaBenchDataJSON(input_file, tool)
|
||||||
elif LlamaBenchDataJSONL.valid_format(input_file[0]):
|
elif LlamaBenchDataJSONL.valid_format(input_file[0]):
|
||||||
bench_data = LlamaBenchDataJSONL(input_file[0])
|
bench_data = LlamaBenchDataJSONL(input_file[0], tool)
|
||||||
elif LlamaBenchDataCSV.valid_format(input_file):
|
elif LlamaBenchDataCSV.valid_format(input_file):
|
||||||
bench_data = LlamaBenchDataCSV(input_file)
|
bench_data = LlamaBenchDataCSV(input_file, tool)
|
||||||
else:
|
else:
|
||||||
if LlamaBenchDataJSON.valid_format(input_file):
|
if LlamaBenchDataJSON.valid_format(input_file):
|
||||||
bench_data = LlamaBenchDataJSON(input_file)
|
bench_data = LlamaBenchDataJSON(input_file, tool)
|
||||||
elif LlamaBenchDataCSV.valid_format(input_file):
|
elif LlamaBenchDataCSV.valid_format(input_file):
|
||||||
bench_data = LlamaBenchDataCSV(input_file)
|
bench_data = LlamaBenchDataCSV(input_file, tool)
|
||||||
|
|
||||||
if not bench_data:
|
if not bench_data:
|
||||||
raise RuntimeError("No valid (or some invalid) input files found.")
|
raise RuntimeError("No valid (or some invalid) input files found.")
|
||||||
|
|
@ -504,12 +716,29 @@ else:
|
||||||
|
|
||||||
name_compare = bench_data.get_commit_name(hexsha8_compare)
|
name_compare = bench_data.get_commit_name(hexsha8_compare)
|
||||||
|
|
||||||
|
# Get tool-specific configuration
|
||||||
|
if tool == "llama-bench":
|
||||||
|
key_properties = LLAMA_BENCH_KEY_PROPERTIES
|
||||||
|
bool_properties = LLAMA_BENCH_BOOL_PROPERTIES
|
||||||
|
pretty_names = LLAMA_BENCH_PRETTY_NAMES
|
||||||
|
default_show = DEFAULT_SHOW_LLAMA_BENCH
|
||||||
|
default_hide = DEFAULT_HIDE_LLAMA_BENCH
|
||||||
|
elif tool == "test-backend-ops":
|
||||||
|
key_properties = TEST_BACKEND_OPS_KEY_PROPERTIES
|
||||||
|
bool_properties = TEST_BACKEND_OPS_BOOL_PROPERTIES
|
||||||
|
pretty_names = TEST_BACKEND_OPS_PRETTY_NAMES
|
||||||
|
default_show = DEFAULT_SHOW_TEST_BACKEND_OPS
|
||||||
|
default_hide = DEFAULT_HIDE_TEST_BACKEND_OPS
|
||||||
|
else:
|
||||||
|
assert False
|
||||||
|
|
||||||
# If the user provided columns to group the results by, use them:
|
# If the user provided columns to group the results by, use them:
|
||||||
if known_args.show is not None:
|
if known_args.show is not None:
|
||||||
show = known_args.show.split(",")
|
show = known_args.show.split(",")
|
||||||
unknown_cols = []
|
unknown_cols = []
|
||||||
for prop in show:
|
for prop in show:
|
||||||
if prop not in KEY_PROPERTIES[:-3]: # Last three values are n_prompt, n_gen, n_depth.
|
valid_props = key_properties if tool == "test-backend-ops" else key_properties[:-3] # Exclude n_prompt, n_gen, n_depth for llama-bench
|
||||||
|
if prop not in valid_props:
|
||||||
unknown_cols.append(prop)
|
unknown_cols.append(prop)
|
||||||
if unknown_cols:
|
if unknown_cols:
|
||||||
logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}")
|
logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}")
|
||||||
|
|
@ -518,32 +747,54 @@ if known_args.show is not None:
|
||||||
rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare)
|
rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare)
|
||||||
# Otherwise, select those columns where the values are not all the same:
|
# Otherwise, select those columns where the values are not all the same:
|
||||||
else:
|
else:
|
||||||
rows_full = bench_data.get_rows(KEY_PROPERTIES, hexsha8_baseline, hexsha8_compare)
|
rows_full = bench_data.get_rows(key_properties, hexsha8_baseline, hexsha8_compare)
|
||||||
properties_different = []
|
properties_different = []
|
||||||
for i, kp_i in enumerate(KEY_PROPERTIES):
|
|
||||||
if kp_i in DEFAULT_SHOW or kp_i in ["n_prompt", "n_gen", "n_depth"]:
|
if tool == "llama-bench":
|
||||||
continue
|
# For llama-bench, skip n_prompt, n_gen, n_depth from differentiation logic
|
||||||
for row_full in rows_full:
|
check_properties = [kp for kp in key_properties if kp not in ["n_prompt", "n_gen", "n_depth"]]
|
||||||
if row_full[i] != rows_full[0][i]:
|
for i, kp_i in enumerate(key_properties):
|
||||||
properties_different.append(kp_i)
|
if kp_i in default_show or kp_i in ["n_prompt", "n_gen", "n_depth"]:
|
||||||
break
|
continue
|
||||||
|
for row_full in rows_full:
|
||||||
|
if row_full[i] != rows_full[0][i]:
|
||||||
|
properties_different.append(kp_i)
|
||||||
|
break
|
||||||
|
elif tool == "test-backend-ops":
|
||||||
|
# For test-backend-ops, check all key properties
|
||||||
|
for i, kp_i in enumerate(key_properties):
|
||||||
|
if kp_i in default_show:
|
||||||
|
continue
|
||||||
|
for row_full in rows_full:
|
||||||
|
if row_full[i] != rows_full[0][i]:
|
||||||
|
properties_different.append(kp_i)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
assert False
|
||||||
|
|
||||||
show = []
|
show = []
|
||||||
# Show CPU and/or GPU by default even if the hardware for all results is the same:
|
|
||||||
if rows_full and "n_gpu_layers" not in properties_different:
|
|
||||||
ngl = int(rows_full[0][KEY_PROPERTIES.index("n_gpu_layers")])
|
|
||||||
|
|
||||||
if ngl != 99 and "cpu_info" not in properties_different:
|
if tool == "llama-bench":
|
||||||
show.append("cpu_info")
|
# Show CPU and/or GPU by default even if the hardware for all results is the same:
|
||||||
|
if rows_full and "n_gpu_layers" not in properties_different:
|
||||||
|
ngl = int(rows_full[0][key_properties.index("n_gpu_layers")])
|
||||||
|
|
||||||
show += properties_different
|
if ngl != 99 and "cpu_info" not in properties_different:
|
||||||
|
show.append("cpu_info")
|
||||||
|
|
||||||
index_default = 0
|
show += properties_different
|
||||||
for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]:
|
|
||||||
if prop in show:
|
index_default = 0
|
||||||
index_default += 1
|
for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]:
|
||||||
show = show[:index_default] + DEFAULT_SHOW + show[index_default:]
|
if prop in show:
|
||||||
for prop in DEFAULT_HIDE:
|
index_default += 1
|
||||||
|
show = show[:index_default] + default_show + show[index_default:]
|
||||||
|
elif tool == "test-backend-ops":
|
||||||
|
show = default_show + properties_different
|
||||||
|
else:
|
||||||
|
assert False
|
||||||
|
|
||||||
|
for prop in default_hide:
|
||||||
try:
|
try:
|
||||||
show.remove(prop)
|
show.remove(prop)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
|
@ -551,7 +802,7 @@ else:
|
||||||
|
|
||||||
# Add plot_x parameter to parameters to show if it's not already present:
|
# Add plot_x parameter to parameters to show if it's not already present:
|
||||||
if known_args.plot:
|
if known_args.plot:
|
||||||
for k, v in PRETTY_NAMES.items():
|
for k, v in pretty_names.items():
|
||||||
if v == known_args.plot_x and k not in show:
|
if v == known_args.plot_x and k not in show:
|
||||||
show.append(k)
|
show.append(k)
|
||||||
break
|
break
|
||||||
|
|
@ -563,60 +814,120 @@ if not rows_show:
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
table = []
|
table = []
|
||||||
for row in rows_show:
|
primary_metric = "FLOPS" # Default to FLOPS for test-backend-ops
|
||||||
n_prompt = int(row[-5])
|
|
||||||
n_gen = int(row[-4])
|
if tool == "llama-bench":
|
||||||
n_depth = int(row[-3])
|
# For llama-bench, create test names and compare avg_ts values
|
||||||
if n_prompt != 0 and n_gen == 0:
|
for row in rows_show:
|
||||||
test_name = f"pp{n_prompt}"
|
n_prompt = int(row[-5])
|
||||||
elif n_prompt == 0 and n_gen != 0:
|
n_gen = int(row[-4])
|
||||||
test_name = f"tg{n_gen}"
|
n_depth = int(row[-3])
|
||||||
else:
|
if n_prompt != 0 and n_gen == 0:
|
||||||
test_name = f"pp{n_prompt}+tg{n_gen}"
|
test_name = f"pp{n_prompt}"
|
||||||
if n_depth != 0:
|
elif n_prompt == 0 and n_gen != 0:
|
||||||
test_name = f"{test_name}@d{n_depth}"
|
test_name = f"tg{n_gen}"
|
||||||
# Regular columns test name avg t/s values Speedup
|
else:
|
||||||
# VVVVVVVVVVVVV VVVVVVVVV VVVVVVVVVVVVVV VVVVVVV
|
test_name = f"pp{n_prompt}+tg{n_gen}"
|
||||||
table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
|
if n_depth != 0:
|
||||||
|
test_name = f"{test_name}@d{n_depth}"
|
||||||
|
# Regular columns test name avg t/s values Speedup
|
||||||
|
# VVVVVVVVVVVVV VVVVVVVVV VVVVVVVVVVVVVV VVVVVVV
|
||||||
|
table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
|
||||||
|
elif tool == "test-backend-ops":
|
||||||
|
# Determine the primary metric by checking rows until we find one with valid data
|
||||||
|
if rows_show:
|
||||||
|
primary_metric = "FLOPS" # Default to FLOPS
|
||||||
|
flops_values = []
|
||||||
|
|
||||||
|
# Collect all FLOPS values to determine the best unit
|
||||||
|
for sample_row in rows_show:
|
||||||
|
baseline_flops = float(sample_row[-4])
|
||||||
|
compare_flops = float(sample_row[-3])
|
||||||
|
baseline_bandwidth = float(sample_row[-2])
|
||||||
|
|
||||||
|
if baseline_flops > 0:
|
||||||
|
flops_values.extend([baseline_flops, compare_flops])
|
||||||
|
elif baseline_bandwidth > 0 and not flops_values:
|
||||||
|
primary_metric = "Bandwidth (GB/s)"
|
||||||
|
|
||||||
|
# If we have FLOPS data, determine the appropriate unit
|
||||||
|
if flops_values:
|
||||||
|
primary_metric = get_flops_unit_name(flops_values)
|
||||||
|
|
||||||
|
# For test-backend-ops, prioritize FLOPS > bandwidth for comparison
|
||||||
|
for row in rows_show:
|
||||||
|
# Extract metrics: flops, bandwidth_gb_s (baseline and compare)
|
||||||
|
baseline_flops = float(row[-4])
|
||||||
|
compare_flops = float(row[-3])
|
||||||
|
baseline_bandwidth = float(row[-2])
|
||||||
|
compare_bandwidth = float(row[-1])
|
||||||
|
|
||||||
|
# Determine which metric to use for comparison (prioritize FLOPS > bandwidth)
|
||||||
|
if baseline_flops > 0 and compare_flops > 0:
|
||||||
|
# Use FLOPS comparison (higher is better)
|
||||||
|
speedup = compare_flops / baseline_flops
|
||||||
|
baseline_str = format_flops_for_table(baseline_flops, primary_metric)
|
||||||
|
compare_str = format_flops_for_table(compare_flops, primary_metric)
|
||||||
|
elif baseline_bandwidth > 0 and compare_bandwidth > 0:
|
||||||
|
# Use bandwidth comparison (higher is better)
|
||||||
|
speedup = compare_bandwidth / baseline_bandwidth
|
||||||
|
baseline_str = f"{baseline_bandwidth:.2f}"
|
||||||
|
compare_str = f"{compare_bandwidth:.2f}"
|
||||||
|
else:
|
||||||
|
# Fallback if no valid data is available
|
||||||
|
baseline_str = "N/A"
|
||||||
|
compare_str = "N/A"
|
||||||
|
from math import nan
|
||||||
|
speedup = nan
|
||||||
|
|
||||||
|
table.append(list(row[:-4]) + [baseline_str, compare_str, speedup])
|
||||||
|
else:
|
||||||
|
assert False
|
||||||
|
|
||||||
# Some a-posteriori fixes to make the table contents prettier:
|
# Some a-posteriori fixes to make the table contents prettier:
|
||||||
for bool_property in BOOL_PROPERTIES:
|
for bool_property in bool_properties:
|
||||||
if bool_property in show:
|
if bool_property in show:
|
||||||
ip = show.index(bool_property)
|
ip = show.index(bool_property)
|
||||||
for row_table in table:
|
for row_table in table:
|
||||||
row_table[ip] = "Yes" if int(row_table[ip]) == 1 else "No"
|
row_table[ip] = "Yes" if int(row_table[ip]) == 1 else "No"
|
||||||
|
|
||||||
if "model_type" in show:
|
if tool == "llama-bench":
|
||||||
ip = show.index("model_type")
|
if "model_type" in show:
|
||||||
for (old, new) in MODEL_SUFFIX_REPLACE.items():
|
ip = show.index("model_type")
|
||||||
|
for (old, new) in MODEL_SUFFIX_REPLACE.items():
|
||||||
|
for row_table in table:
|
||||||
|
row_table[ip] = row_table[ip].replace(old, new)
|
||||||
|
|
||||||
|
if "model_size" in show:
|
||||||
|
ip = show.index("model_size")
|
||||||
for row_table in table:
|
for row_table in table:
|
||||||
row_table[ip] = row_table[ip].replace(old, new)
|
row_table[ip] = float(row_table[ip]) / 1024 ** 3
|
||||||
|
|
||||||
if "model_size" in show:
|
if "gpu_info" in show:
|
||||||
ip = show.index("model_size")
|
ip = show.index("gpu_info")
|
||||||
for row_table in table:
|
for row_table in table:
|
||||||
row_table[ip] = float(row_table[ip]) / 1024 ** 3
|
for gns in GPU_NAME_STRIP:
|
||||||
|
row_table[ip] = row_table[ip].replace(gns, "")
|
||||||
|
|
||||||
if "gpu_info" in show:
|
gpu_names = row_table[ip].split(", ")
|
||||||
ip = show.index("gpu_info")
|
num_gpus = len(gpu_names)
|
||||||
for row_table in table:
|
all_names_the_same = len(set(gpu_names)) == 1
|
||||||
for gns in GPU_NAME_STRIP:
|
if len(gpu_names) >= 2 and all_names_the_same:
|
||||||
row_table[ip] = row_table[ip].replace(gns, "")
|
row_table[ip] = f"{num_gpus}x {gpu_names[0]}"
|
||||||
|
|
||||||
gpu_names = row_table[ip].split(", ")
|
headers = [pretty_names.get(p, p) for p in show]
|
||||||
num_gpus = len(gpu_names)
|
if tool == "llama-bench":
|
||||||
all_names_the_same = len(set(gpu_names)) == 1
|
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
|
||||||
if len(gpu_names) >= 2 and all_names_the_same:
|
elif tool == "test-backend-ops":
|
||||||
row_table[ip] = f"{num_gpus}x {gpu_names[0]}"
|
headers += [f"{primary_metric} {name_baseline}", f"{primary_metric} {name_compare}", "Speedup"]
|
||||||
|
else:
|
||||||
headers = [PRETTY_NAMES[p] for p in show]
|
assert False
|
||||||
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
|
|
||||||
|
|
||||||
if known_args.plot:
|
if known_args.plot:
|
||||||
def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False):
|
def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False, tool_type: str = "llama-bench", metric_name: str = "t/s"):
|
||||||
try:
|
try:
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import matplotlib
|
import matplotlib
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
matplotlib.use('Agg')
|
matplotlib.use('Agg')
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
logger.error("matplotlib is required for --plot.")
|
logger.error("matplotlib is required for --plot.")
|
||||||
|
|
@ -627,7 +938,7 @@ if known_args.plot:
|
||||||
plot_x_label = plot_x_param
|
plot_x_label = plot_x_param
|
||||||
|
|
||||||
if plot_x_param not in ["n_prompt", "n_gen", "n_depth"]:
|
if plot_x_param not in ["n_prompt", "n_gen", "n_depth"]:
|
||||||
pretty_name = PRETTY_NAMES.get(plot_x_param, plot_x_param)
|
pretty_name = LLAMA_BENCH_PRETTY_NAMES.get(plot_x_param, plot_x_param)
|
||||||
if pretty_name in data_headers:
|
if pretty_name in data_headers:
|
||||||
plot_x_index = data_headers.index(pretty_name)
|
plot_x_index = data_headers.index(pretty_name)
|
||||||
plot_x_label = pretty_name
|
plot_x_label = pretty_name
|
||||||
|
|
@ -746,8 +1057,16 @@ if known_args.plot:
|
||||||
|
|
||||||
title = ', '.join(title_parts) if title_parts else "Performance comparison"
|
title = ', '.join(title_parts) if title_parts else "Performance comparison"
|
||||||
|
|
||||||
|
# Determine y-axis label based on tool type
|
||||||
|
if tool_type == "llama-bench":
|
||||||
|
y_label = "Tokens per second (t/s)"
|
||||||
|
elif tool_type == "test-backend-ops":
|
||||||
|
y_label = metric_name
|
||||||
|
else:
|
||||||
|
assert False
|
||||||
|
|
||||||
ax.set_xlabel(plot_x_label, fontsize=12, fontweight='bold')
|
ax.set_xlabel(plot_x_label, fontsize=12, fontweight='bold')
|
||||||
ax.set_ylabel('Tokens per second (t/s)', fontsize=12, fontweight='bold')
|
ax.set_ylabel(y_label, fontsize=12, fontweight='bold')
|
||||||
ax.set_title(title, fontsize=12, fontweight='bold')
|
ax.set_title(title, fontsize=12, fontweight='bold')
|
||||||
ax.legend(loc='best', fontsize=10)
|
ax.legend(loc='best', fontsize=10)
|
||||||
ax.grid(True, alpha=0.3)
|
ax.grid(True, alpha=0.3)
|
||||||
|
|
@ -765,7 +1084,7 @@ if known_args.plot:
|
||||||
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
||||||
plt.close()
|
plt.close()
|
||||||
|
|
||||||
create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale)
|
create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale, tool, primary_metric)
|
||||||
|
|
||||||
print(tabulate( # noqa: NP100
|
print(tabulate( # noqa: NP100
|
||||||
table,
|
table,
|
||||||
|
|
|
||||||
|
|
@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
|
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
|
||||||
{ LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
|
{ LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
|
||||||
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
|
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
|
||||||
|
{ LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
|
||||||
{ LLM_ARCH_SMOLLM3, "smollm3" },
|
{ LLM_ARCH_SMOLLM3, "smollm3" },
|
||||||
{ LLM_ARCH_LFM2, "lfm2" },
|
{ LLM_ARCH_LFM2, "lfm2" },
|
||||||
{ LLM_ARCH_DREAM, "dream" },
|
{ LLM_ARCH_DREAM, "dream" },
|
||||||
|
|
@ -1897,6 +1898,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_HUNYUAN_DENSE,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||||
|
{ LLM_TENSOR_OUTPUT, "output" },
|
||||||
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||||
|
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||||
|
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
LLM_ARCH_SMOLLM3,
|
LLM_ARCH_SMOLLM3,
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -89,6 +89,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_ERNIE4_5,
|
LLM_ARCH_ERNIE4_5,
|
||||||
LLM_ARCH_ERNIE4_5_MOE,
|
LLM_ARCH_ERNIE4_5_MOE,
|
||||||
LLM_ARCH_HUNYUAN_MOE,
|
LLM_ARCH_HUNYUAN_MOE,
|
||||||
|
LLM_ARCH_HUNYUAN_DENSE,
|
||||||
LLM_ARCH_SMOLLM3,
|
LLM_ARCH_SMOLLM3,
|
||||||
LLM_ARCH_LFM2,
|
LLM_ARCH_LFM2,
|
||||||
LLM_ARCH_DREAM,
|
LLM_ARCH_DREAM,
|
||||||
|
|
|
||||||
|
|
@ -66,6 +66,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||||
{ "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
|
{ "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
|
||||||
{ "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
|
{ "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
|
||||||
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
|
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
|
||||||
|
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
|
||||||
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
|
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -193,6 +194,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||||
return LLM_CHAT_TEMPLATE_DOTS1;
|
return LLM_CHAT_TEMPLATE_DOTS1;
|
||||||
} else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
|
} else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
|
||||||
return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
|
return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
|
||||||
|
} else if (tmpl_contains("<|hy_place▁holder▁no▁2|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
|
||||||
|
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
|
||||||
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
|
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
|
||||||
return LLM_CHAT_TEMPLATE_KIMI_K2;
|
return LLM_CHAT_TEMPLATE_KIMI_K2;
|
||||||
}
|
}
|
||||||
|
|
@ -698,11 +701,27 @@ int32_t llm_chat_apply_template(
|
||||||
if (role == "system") {
|
if (role == "system") {
|
||||||
ss << "<|startoftext|>" << message->content << "<|extra_4|>";
|
ss << "<|startoftext|>" << message->content << "<|extra_4|>";
|
||||||
} else if (role == "assistant") {
|
} else if (role == "assistant") {
|
||||||
ss << "<|startoftext|>" << message->content << "<|eos|>";
|
ss << message->content << "<|eos|>";
|
||||||
} else {
|
} else {
|
||||||
ss << "<|startoftext|>" << message->content << "<|extra_0|>";
|
ss << "<|startoftext|>" << message->content << "<|extra_0|>";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
|
||||||
|
// tencent/Hunyuan-4B-Instruct
|
||||||
|
for (size_t i = 0; i < chat.size(); i++) {
|
||||||
|
std::string role(chat[i]->role);
|
||||||
|
if (i == 0) {
|
||||||
|
if (role == "system") {
|
||||||
|
ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (role == "assistant") {
|
||||||
|
ss << "<|hy_Assistant|>" << chat[i]->content << "<|hy_place▁holder▁no▁2|>";
|
||||||
|
} else if (role == "user") {
|
||||||
|
ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
|
||||||
|
}
|
||||||
|
}
|
||||||
} else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
|
} else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
|
||||||
// moonshotai/Kimi-K2-Instruct
|
// moonshotai/Kimi-K2-Instruct
|
||||||
for (auto message : chat) {
|
for (auto message : chat) {
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,7 @@ enum llm_chat_template {
|
||||||
LLM_CHAT_TEMPLATE_SMOLVLM,
|
LLM_CHAT_TEMPLATE_SMOLVLM,
|
||||||
LLM_CHAT_TEMPLATE_DOTS1,
|
LLM_CHAT_TEMPLATE_DOTS1,
|
||||||
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
|
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
|
||||||
|
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
|
||||||
LLM_CHAT_TEMPLATE_KIMI_K2,
|
LLM_CHAT_TEMPLATE_KIMI_K2,
|
||||||
LLM_CHAT_TEMPLATE_UNKNOWN,
|
LLM_CHAT_TEMPLATE_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -105,7 +105,7 @@ llama_context::llama_context(
|
||||||
|
|
||||||
{
|
{
|
||||||
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
|
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
|
||||||
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
|
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
|
||||||
|
|
||||||
if (!supports_set_rows && !cparams.kv_unified) {
|
if (!supports_set_rows && !cparams.kv_unified) {
|
||||||
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
|
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
|
||||||
|
|
@ -113,6 +113,15 @@ llama_context::llama_context(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
|
||||||
|
graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
|
||||||
|
|
||||||
|
if (graph_reuse_disable) {
|
||||||
|
LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
|
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
|
||||||
|
|
||||||
LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
|
LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
|
||||||
|
|
@ -716,7 +725,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
|
||||||
// in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
|
// in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
|
||||||
const auto gparams = graph_params(res, ubatch, mctx, gtype);
|
const auto gparams = graph_params(res, ubatch, mctx, gtype);
|
||||||
|
|
||||||
if (res->can_reuse(gparams)) {
|
if (!graph_reuse_disable && res->can_reuse(gparams)) {
|
||||||
//LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
|
//LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
|
||||||
|
|
||||||
n_reused++;
|
n_reused++;
|
||||||
|
|
|
||||||
|
|
@ -289,7 +289,10 @@ private:
|
||||||
|
|
||||||
// env: LLAMA_SET_ROWS (temporary)
|
// env: LLAMA_SET_ROWS (temporary)
|
||||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
|
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
|
||||||
bool supports_set_rows = false;
|
bool supports_set_rows = true;
|
||||||
|
|
||||||
|
// env: LLAMA_GRAPH_REUSE_DISABLE
|
||||||
|
bool graph_reuse_disable = false;
|
||||||
|
|
||||||
// perf
|
// perf
|
||||||
mutable int64_t t_start_us = 0;
|
mutable int64_t t_start_us = 0;
|
||||||
|
|
|
||||||
|
|
@ -423,7 +423,9 @@ struct llm_graph_params {
|
||||||
(!ubatch.embd && !other.ubatch.embd)
|
(!ubatch.embd && !other.ubatch.embd)
|
||||||
);
|
);
|
||||||
|
|
||||||
if (can_reuse_ubatch && !ubatch.equal_seqs()) {
|
// when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
|
||||||
|
// the reason is because the set of attention streams would be different for different sequences
|
||||||
|
if (can_reuse_ubatch && ubatch.equal_seqs()) {
|
||||||
if (!ubatch.data) {
|
if (!ubatch.data) {
|
||||||
// if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
|
// if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
|
||||||
// therefore we cannot perform the sequence id check. normally should never happen
|
// therefore we cannot perform the sequence id check. normally should never happen
|
||||||
|
|
|
||||||
|
|
@ -183,7 +183,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
||||||
const size_t memory_size_k = size_k_bytes();
|
const size_t memory_size_k = size_k_bytes();
|
||||||
const size_t memory_size_v = size_v_bytes();
|
const size_t memory_size_v = size_v_bytes();
|
||||||
|
|
||||||
LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
|
LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
|
||||||
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
|
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
|
||||||
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
|
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
|
||||||
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
||||||
|
|
@ -193,7 +193,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
||||||
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
|
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
|
||||||
|
|
||||||
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
|
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
|
||||||
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : 0;
|
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
|
||||||
|
|
||||||
if (!supports_set_rows) {
|
if (!supports_set_rows) {
|
||||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14363
|
// ref: https://github.com/ggml-org/llama.cpp/pull/14363
|
||||||
|
|
|
||||||
|
|
@ -230,7 +230,7 @@ private:
|
||||||
|
|
||||||
// env: LLAMA_SET_ROWS (temporary)
|
// env: LLAMA_SET_ROWS (temporary)
|
||||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
|
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
|
||||||
bool supports_set_rows = false;
|
bool supports_set_rows = true;
|
||||||
|
|
||||||
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
|
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,7 @@ llama_memory_hybrid::llama_memory_hybrid(
|
||||||
/* common */
|
/* common */
|
||||||
uint32_t n_seq_max,
|
uint32_t n_seq_max,
|
||||||
bool offload,
|
bool offload,
|
||||||
|
bool unified,
|
||||||
/* layer filters */
|
/* layer filters */
|
||||||
layer_filter_cb && filter_attn,
|
layer_filter_cb && filter_attn,
|
||||||
layer_filter_cb && filter_recr) :
|
layer_filter_cb && filter_recr) :
|
||||||
|
|
@ -38,7 +39,7 @@ llama_memory_hybrid::llama_memory_hybrid(
|
||||||
type_v,
|
type_v,
|
||||||
v_trans,
|
v_trans,
|
||||||
offload,
|
offload,
|
||||||
1,
|
unified,
|
||||||
kv_size,
|
kv_size,
|
||||||
n_seq_max,
|
n_seq_max,
|
||||||
n_pad,
|
n_pad,
|
||||||
|
|
|
||||||
|
|
@ -39,6 +39,7 @@ public:
|
||||||
/* common */
|
/* common */
|
||||||
uint32_t n_seq_max,
|
uint32_t n_seq_max,
|
||||||
bool offload,
|
bool offload,
|
||||||
|
bool unified,
|
||||||
/* layer filters */
|
/* layer filters */
|
||||||
layer_filter_cb && filter_attn = nullptr,
|
layer_filter_cb && filter_attn = nullptr,
|
||||||
layer_filter_cb && filter_recr = nullptr);
|
layer_filter_cb && filter_recr = nullptr);
|
||||||
|
|
|
||||||
|
|
@ -899,6 +899,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_QWEN3:
|
case LLM_ARCH_QWEN3:
|
||||||
{
|
{
|
||||||
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
switch (hparams.n_layer) {
|
switch (hparams.n_layer) {
|
||||||
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
|
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
|
||||||
|
|
@ -1760,6 +1761,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||||
default: type = LLM_TYPE_UNKNOWN;
|
default: type = LLM_TYPE_UNKNOWN;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_HUNYUAN_DENSE:
|
||||||
|
{
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
|
|
||||||
|
switch (hparams.n_embd) {
|
||||||
|
case 1024: type = LLM_TYPE_0_5B; break;
|
||||||
|
case 2048: type = LLM_TYPE_1_8B; break;
|
||||||
|
case 3072: type = LLM_TYPE_4B; break;
|
||||||
|
case 4096: type = LLM_TYPE_7B; break;
|
||||||
|
default: type = LLM_TYPE_UNKNOWN;
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_SMOLLM3:
|
case LLM_ARCH_SMOLLM3:
|
||||||
{
|
{
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
|
|
@ -5195,6 +5208,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
|
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_HUNYUAN_DENSE:
|
||||||
|
{
|
||||||
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||||
|
|
||||||
|
// output
|
||||||
|
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||||
|
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||||
|
// if output is NULL, init from the input tok embed
|
||||||
|
if (output == NULL) {
|
||||||
|
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < n_layer; ++i) {
|
||||||
|
auto & layer = layers[i];
|
||||||
|
|
||||||
|
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||||
|
|
||||||
|
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||||
|
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||||
|
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||||
|
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||||
|
|
||||||
|
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||||
|
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||||
|
|
||||||
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||||
|
|
||||||
|
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||||
|
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||||
|
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_SMOLLM3:
|
case LLM_ARCH_SMOLLM3:
|
||||||
{
|
{
|
||||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||||
|
|
@ -16930,6 +16976,144 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct llm_build_hunyuan_dense : public llm_graph_context {
|
||||||
|
llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
|
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
|
||||||
|
ggml_tensor * cur;
|
||||||
|
ggml_tensor * inpL;
|
||||||
|
|
||||||
|
inpL = build_inp_embd(model.tok_embd);
|
||||||
|
|
||||||
|
// inp_pos - contains the positions
|
||||||
|
ggml_tensor * inp_pos = build_inp_pos();
|
||||||
|
|
||||||
|
auto * inp_attn = build_attn_inp_kv_unified();
|
||||||
|
|
||||||
|
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
|
||||||
|
|
||||||
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||||
|
|
||||||
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
ggml_tensor * inpSA = inpL;
|
||||||
|
|
||||||
|
// norm
|
||||||
|
cur = build_norm(inpL,
|
||||||
|
model.layers[il].attn_norm, NULL,
|
||||||
|
LLM_NORM_RMS, il);
|
||||||
|
cb(cur, "attn_norm", il);
|
||||||
|
// self-attention
|
||||||
|
{
|
||||||
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||||
|
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||||
|
|
||||||
|
// compute Q and K and RoPE them
|
||||||
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
if (model.layers[il].bq) {
|
||||||
|
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
if (model.layers[il].bk) {
|
||||||
|
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
if (model.layers[il].bv) {
|
||||||
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||||
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||||
|
|
||||||
|
Qcur = ggml_rope_ext(
|
||||||
|
ctx0, Qcur, inp_pos, rope_factors,
|
||||||
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
|
Kcur = ggml_rope_ext(
|
||||||
|
ctx0, Kcur, inp_pos, rope_factors,
|
||||||
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
|
||||||
|
Kcur = build_norm(Kcur,
|
||||||
|
model.layers[il].attn_k_norm, nullptr,
|
||||||
|
LLM_NORM_RMS, il);
|
||||||
|
cb(Kcur, "Kcur_norm", il);
|
||||||
|
|
||||||
|
Qcur = build_norm(Qcur,
|
||||||
|
model.layers[il].attn_q_norm, nullptr,
|
||||||
|
LLM_NORM_RMS, il);
|
||||||
|
cb(Qcur, "Qcur_norm", il);
|
||||||
|
|
||||||
|
cur = build_attn(inp_attn,
|
||||||
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
||||||
|
cb(cur, "attn_out", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (il == n_layer - 1 && inp_out_ids) {
|
||||||
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||||
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||||
|
cb(ffn_inp, "ffn_inp", il);
|
||||||
|
|
||||||
|
cur = build_norm(ffn_inp,
|
||||||
|
model.layers[il].ffn_norm, NULL,
|
||||||
|
LLM_NORM_RMS, il);
|
||||||
|
cb(cur, "ffn_norm", il);
|
||||||
|
// feed-forward network (non-MoE)
|
||||||
|
ggml_tensor * cur_mlp = build_ffn(cur,
|
||||||
|
model.layers[il].ffn_up, NULL, NULL,
|
||||||
|
model.layers[il].ffn_gate, NULL, NULL,
|
||||||
|
model.layers[il].ffn_down, NULL, NULL,
|
||||||
|
NULL,
|
||||||
|
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||||
|
cb(cur_mlp, "ffn_out", il);
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0, cur_mlp, ffn_inp);
|
||||||
|
|
||||||
|
cur = build_cvec(cur, il);
|
||||||
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
|
// input for next layer
|
||||||
|
inpL = cur;
|
||||||
|
}
|
||||||
|
cur = inpL;
|
||||||
|
|
||||||
|
cur = build_norm(cur,
|
||||||
|
model.output_norm, NULL,
|
||||||
|
LLM_NORM_RMS, -1);
|
||||||
|
|
||||||
|
cb(cur, "result_norm", -1);
|
||||||
|
res->t_embd = cur;
|
||||||
|
// lm_head
|
||||||
|
cur = build_lora_mm(model.output, cur);
|
||||||
|
cb(cur, "result_output", -1);
|
||||||
|
res->t_logits = cur;
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
struct llm_build_smollm3 : public llm_graph_context {
|
struct llm_build_smollm3 : public llm_graph_context {
|
||||||
llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
|
|
@ -17414,6 +17598,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||||
/* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
|
/* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
|
||||||
/* n_seq_max */ cparams.n_seq_max,
|
/* n_seq_max */ cparams.n_seq_max,
|
||||||
/* offload */ cparams.offload_kqv,
|
/* offload */ cparams.offload_kqv,
|
||||||
|
/* unified */ cparams.kv_unified,
|
||||||
/* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
|
/* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
|
||||||
/* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
|
/* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -17797,6 +17982,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||||
{
|
{
|
||||||
llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
|
llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_HUNYUAN_DENSE:
|
||||||
|
{
|
||||||
|
llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
|
||||||
|
} break;
|
||||||
case LLM_ARCH_SMOLLM3:
|
case LLM_ARCH_SMOLLM3:
|
||||||
{
|
{
|
||||||
llm = std::make_unique<llm_build_smollm3>(*this, params);
|
llm = std::make_unique<llm_build_smollm3>(*this, params);
|
||||||
|
|
@ -18016,6 +18205,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||||
case LLM_ARCH_MINICPM3:
|
case LLM_ARCH_MINICPM3:
|
||||||
case LLM_ARCH_DOTS1:
|
case LLM_ARCH_DOTS1:
|
||||||
case LLM_ARCH_HUNYUAN_MOE:
|
case LLM_ARCH_HUNYUAN_MOE:
|
||||||
|
case LLM_ARCH_HUNYUAN_DENSE:
|
||||||
case LLM_ARCH_LFM2:
|
case LLM_ARCH_LFM2:
|
||||||
case LLM_ARCH_SMALLTHINKER:
|
case LLM_ARCH_SMALLTHINKER:
|
||||||
return LLAMA_ROPE_TYPE_NEOX;
|
return LLAMA_ROPE_TYPE_NEOX;
|
||||||
|
|
|
||||||
|
|
@ -307,6 +307,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||||
};
|
};
|
||||||
break;
|
break;
|
||||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
|
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
|
||||||
regex_exprs = {
|
regex_exprs = {
|
||||||
"\\p{N}{1,3}",
|
"\\p{N}{1,3}",
|
||||||
"[一-龥-ゟ゠-ヿ]+",
|
"[一-龥-ゟ゠-ヿ]+",
|
||||||
|
|
@ -1855,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
tokenizer_pre == "gigachat" ||
|
tokenizer_pre == "gigachat" ||
|
||||||
tokenizer_pre == "jina-v2-es" ||
|
tokenizer_pre == "jina-v2-es" ||
|
||||||
tokenizer_pre == "jina-v2-de" ||
|
tokenizer_pre == "jina-v2-de" ||
|
||||||
tokenizer_pre == "a.x-4.0") {
|
tokenizer_pre == "a.x-4.0" ||
|
||||||
|
tokenizer_pre == "mellum") {
|
||||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "jina-v1-en" ||
|
tokenizer_pre == "jina-v1-en" ||
|
||||||
|
|
@ -1964,6 +1966,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
tokenizer_pre == "hunyuan") {
|
tokenizer_pre == "hunyuan") {
|
||||||
pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
|
pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
|
||||||
clean_spaces = false;
|
clean_spaces = false;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "hunyuan-dense") {
|
||||||
|
pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
|
||||||
|
clean_spaces = false;
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "kimi-k2") {
|
tokenizer_pre == "kimi-k2") {
|
||||||
pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
|
pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,7 @@ enum llama_vocab_pre_type {
|
||||||
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
||||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
|
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
|
||||||
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
|
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LLM_KV;
|
struct LLM_KV;
|
||||||
|
|
|
||||||
|
|
@ -5592,13 +5592,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3}));
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3}));
|
||||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
|
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
|
||||||
|
|
||||||
for (auto bs : {1,2,4,8}) {
|
for (auto bs2 : {1,3}) {
|
||||||
for (auto nr : {1,4}) {
|
for (auto bs : {1,2,4,8}) {
|
||||||
for (uint32_t m = 0; m < 2; ++m) {
|
for (auto nr : {1,4}) {
|
||||||
for (uint32_t k = 0; k < 2; ++k) {
|
for (uint32_t m = 0; m < 2; ++m) {
|
||||||
for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
|
for (uint32_t k = 0; k < 2; ++k) {
|
||||||
test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, 1}, {nr, 1}, {0, 2, 1, 3}));
|
for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
|
||||||
test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, 1}, {nr, 1}, {0, 1, 2, 3}, true));
|
test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, bs2}, {nr, 1}, {0, 2, 1, 3}));
|
||||||
|
test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, bs2}, {nr, 1}, {0, 1, 2, 3}, true));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -953,6 +953,33 @@ static void test_template_output_parsers() {
|
||||||
/* is_partial= */ false,
|
/* is_partial= */ false,
|
||||||
{COMMON_CHAT_FORMAT_HERMES_2_PRO}));
|
{COMMON_CHAT_FORMAT_HERMES_2_PRO}));
|
||||||
|
|
||||||
|
// Test multiple tool calls
|
||||||
|
common_chat_msg message_assist_multiple_calls;
|
||||||
|
message_assist_multiple_calls.role = "assistant";
|
||||||
|
message_assist_multiple_calls.content = "";
|
||||||
|
message_assist_multiple_calls.tool_calls.push_back({"special_function", "{\"arg1\": 1}", ""});
|
||||||
|
message_assist_multiple_calls.tool_calls.push_back({"python", "{\"code\":\"print('hello')\"}", ""});
|
||||||
|
|
||||||
|
assert_msg_equals(
|
||||||
|
message_assist_multiple_calls,
|
||||||
|
common_chat_parse(
|
||||||
|
"<tool_call>\n"
|
||||||
|
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
|
||||||
|
"</tool_call>\n"
|
||||||
|
"<tool_call>\n"
|
||||||
|
"{\"name\": \"python\", \"arguments\": {\"code\":\"print('hello')\"}}\n"
|
||||||
|
"</tool_call>",
|
||||||
|
/* is_partial= */ false,
|
||||||
|
{COMMON_CHAT_FORMAT_HERMES_2_PRO}));
|
||||||
|
|
||||||
|
assert_msg_equals(
|
||||||
|
message_assist_multiple_calls,
|
||||||
|
common_chat_parse(
|
||||||
|
"<function=special_function>{\"arg1\": 1}</function>\n"
|
||||||
|
"<function=python>{\"code\":\"print('hello')\"}</function>",
|
||||||
|
/* is_partial= */ false,
|
||||||
|
{COMMON_CHAT_FORMAT_HERMES_2_PRO}));
|
||||||
|
|
||||||
assert_msg_equals(
|
assert_msg_equals(
|
||||||
simple_assist_msg(
|
simple_assist_msg(
|
||||||
"This is not a tool call:",
|
"This is not a tool call:",
|
||||||
|
|
@ -1039,6 +1066,22 @@ static void test_template_output_parsers() {
|
||||||
"<tool_call>\n"
|
"<tool_call>\n"
|
||||||
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
|
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
|
||||||
"</tool_call>");
|
"</tool_call>");
|
||||||
|
|
||||||
|
// Test multiple tool calls with template
|
||||||
|
common_chat_msg message_assist_multiple_calls_template;
|
||||||
|
message_assist_multiple_calls_template.role = "assistant";
|
||||||
|
message_assist_multiple_calls_template.content = "";
|
||||||
|
message_assist_multiple_calls_template.tool_calls.push_back({"special_function", "{\"arg1\": 1}", ""});
|
||||||
|
message_assist_multiple_calls_template.tool_calls.push_back({"python", "{\"code\":\"print('test')\"}", ""});
|
||||||
|
|
||||||
|
test_templates(tmpls.get(), end_tokens, message_assist_multiple_calls_template, tools,
|
||||||
|
"<tool_call>\n"
|
||||||
|
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
|
||||||
|
"</tool_call>\n"
|
||||||
|
"<tool_call>\n"
|
||||||
|
"{\"name\": \"python\", \"arguments\": {\"code\":\"print('test')\"}}\n"
|
||||||
|
"</tool_call>");
|
||||||
|
|
||||||
test_templates(tmpls.get(), end_tokens, message_assist_call_python_lines, tools,
|
test_templates(tmpls.get(), end_tokens, message_assist_call_python_lines, tools,
|
||||||
"<tool_call>\n"
|
"<tool_call>\n"
|
||||||
"{\"name\": \"python\", \"arguments\": {\"code\":\"# This is a program:\\nprint('hey')\"}}\n"
|
"{\"name\": \"python\", \"arguments\": {\"code\":\"# This is a program:\\nprint('hey')\"}}\n"
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ More information is available in <https://github.com/ggml-org/llama.cpp/pull/486
|
||||||
|
|
||||||
```
|
```
|
||||||
./llama-imatrix \
|
./llama-imatrix \
|
||||||
-m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \
|
-m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \
|
||||||
[--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \
|
[--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \
|
||||||
[--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \
|
[--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \
|
||||||
[--show-statistics] [...]
|
[--show-statistics] [...]
|
||||||
|
|
@ -20,6 +20,7 @@ The parameters in square brackets are optional and have the following meaning:
|
||||||
* `-lv | --verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
|
* `-lv | --verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
|
||||||
* `-o | --output-file` specifies the name of the file where the computed data will be stored. If missing `imatrix.gguf` is used.
|
* `-o | --output-file` specifies the name of the file where the computed data will be stored. If missing `imatrix.gguf` is used.
|
||||||
* `-ofreq | --output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
|
* `-ofreq | --output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
|
||||||
|
* `--output-format` specifies the output format of the generated imatrix file. Either "gguf", or "dat" (the legacy format). Defaults to "gguf".
|
||||||
* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
|
* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
|
||||||
* `--process-output` specifies if data will be collected for the `output.weight` tensor. Typically, it is better not to utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
|
* `--process-output` specifies if data will be collected for the `output.weight` tensor. Typically, it is better not to utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
|
||||||
* `--in-file` one or more existing imatrix files to load and combine. Useful for merging files from multiple runs/datasets.
|
* `--in-file` one or more existing imatrix files to load and combine. Useful for merging files from multiple runs/datasets.
|
||||||
|
|
@ -45,14 +46,19 @@ Recent versions of `llama-imatrix` store data in GGUF format by default. For the
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# generate and save the imatrix using legacy format
|
# generate and save the imatrix using legacy format
|
||||||
./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt -o imatrix-legcy-format.dat -ngl 99
|
./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt --output-format dat -o imatrix-legcy-format.dat -ngl 99
|
||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# covert legacy (binary) imatrix format to new (GGUF) format
|
# convert legacy (binary) imatrix format to new (GGUF) format
|
||||||
./llama-imatrix --in-file imatrix-legacy-format.dat -o imatrix-new-format.gguf
|
./llama-imatrix --in-file imatrix-legacy-format.dat -o imatrix-new-format.gguf
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# convert new (GGUF) imatrix format to legacy (binary) format
|
||||||
|
./llama-imatrix --in-file imatrix-new-format.gguf --output-format dat -o imatrix-legacy-format.dat
|
||||||
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# combine existing imatrices
|
# combine existing imatrices
|
||||||
./llama-imatrix --in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf -o imatrix-combined.gguf
|
./llama-imatrix --in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf -o imatrix-combined.gguf
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@
|
||||||
static void print_usage(int, char ** argv) {
|
static void print_usage(int, char ** argv) {
|
||||||
LOG("\nexample usage:\n");
|
LOG("\nexample usage:\n");
|
||||||
LOG("\n %s \\\n"
|
LOG("\n %s \\\n"
|
||||||
" -m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \\\n"
|
" -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \\\n"
|
||||||
" [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n"
|
" [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n"
|
||||||
" [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n"
|
" [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n"
|
||||||
" [--show-statistics] [...]\n" , argv[0]);
|
" [--show-statistics] [...]\n" , argv[0]);
|
||||||
|
|
@ -250,13 +250,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||||
const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
|
const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
|
||||||
GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
|
GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
|
||||||
|
|
||||||
// TODO: 4d? (is that even used in practice?)
|
|
||||||
// the extra dimension would need to be stored somewhere to be reflected in the imatrix file
|
|
||||||
if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
|
|
||||||
LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
// this has been adapted to the new format of storing merged experts in a single 3d tensor
|
// this has been adapted to the new format of storing merged experts in a single 3d tensor
|
||||||
// ref: https://github.com/ggml-org/llama.cpp/pull/6387
|
// ref: https://github.com/ggml-org/llama.cpp/pull/6387
|
||||||
if (t->op == GGML_OP_MUL_MAT_ID) {
|
if (t->op == GGML_OP_MUL_MAT_ID) {
|
||||||
|
|
@ -272,6 +265,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||||
|
|
||||||
GGML_ASSERT(ids->ne[1] == src1->ne[2]);
|
GGML_ASSERT(ids->ne[1] == src1->ne[2]);
|
||||||
|
|
||||||
|
// the extra dimension would need to be stored somewhere to be reflected in the imatrix file
|
||||||
|
if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
|
||||||
|
LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
|
||||||
m_ids.resize(ggml_nbytes(ids));
|
m_ids.resize(ggml_nbytes(ids));
|
||||||
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
|
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
|
||||||
|
|
||||||
|
|
@ -335,29 +334,40 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
auto & e = m_stats[wname];
|
auto & e = m_stats[wname];
|
||||||
const int64_t n_mat = src1->ne[2] * src1->ne[3];
|
const int64_t n_mat = src0->ne[2] * src0->ne[3];
|
||||||
|
|
||||||
|
// use a single count per dense tensor
|
||||||
|
// (necessary when merging older GGUF-imatrix files with 3d tensors)
|
||||||
|
if (e.counts.size() > 1) {
|
||||||
|
bool all_equal = true;
|
||||||
|
for (size_t i = 1; i < e.counts.size(); ++i) {
|
||||||
|
if (e.counts[0] != e.counts[i]) {
|
||||||
|
all_equal = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (all_equal) {
|
||||||
|
e.counts.resize(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
if (e.values.empty()) {
|
if (e.values.empty()) {
|
||||||
e.values.resize(src1->ne[0] * n_mat, 0);
|
e.values.resize(src1->ne[0] * n_mat, 0);
|
||||||
e.counts.resize(n_mat, 0);
|
e.counts.resize(1, 0);
|
||||||
}
|
}
|
||||||
else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
|
else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
|
||||||
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
|
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
|
||||||
exit(1); //GGML_ABORT("fatal error");
|
exit(1); //GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
else if (e.counts.size() != (size_t)n_mat) {
|
|
||||||
LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat);
|
|
||||||
exit(1); //GGML_ABORT("fatal error");
|
|
||||||
}
|
|
||||||
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);
|
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);
|
||||||
|
|
||||||
for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
|
for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
|
||||||
for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
|
for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
|
||||||
const int64_t mat_id = i3 * src1->ne[2] + i2;
|
// handle 3D+ tensors, but flatten 3D+ activations when model tensor is 2D
|
||||||
|
const int64_t mat_id = (i3 % src0->ne[3]) * src0->ne[2] + (i2 % src0->ne[2]);
|
||||||
const int64_t mat_start = mat_id * src1->ne[0];
|
const int64_t mat_start = mat_id * src1->ne[0];
|
||||||
|
|
||||||
for (int64_t row = 0; row < src1->ne[1]; ++row) {
|
for (int64_t row = 0; row < src1->ne[1]; ++row) {
|
||||||
const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]);
|
const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]);
|
||||||
e.counts[mat_id]++;
|
|
||||||
for (int64_t j = 0; j < src1->ne[0]; ++j) {
|
for (int64_t j = 0; j < src1->ne[0]; ++j) {
|
||||||
e.values[mat_start + j] += x[j] * x[j];
|
e.values[mat_start + j] += x[j] * x[j];
|
||||||
if (!std::isfinite((float)e.values[j])) {
|
if (!std::isfinite((float)e.values[j])) {
|
||||||
|
|
@ -366,16 +376,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const int32_t n_chunk = e.counts[mat_id] / chunk_size;
|
}
|
||||||
if (n_chunk > m_last_chunk) {
|
}
|
||||||
const int32_t chunk_step = n_chunk - m_last_chunk;
|
// only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT
|
||||||
m_last_chunk = n_chunk;
|
for (size_t i = 0; i < e.counts.size(); ++i) {
|
||||||
if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
|
e.counts[i] += ggml_nrows(src1) / n_mat;
|
||||||
save_imatrix();
|
const int32_t n_chunk = e.counts[i] / chunk_size;
|
||||||
}
|
if (n_chunk > m_last_chunk) {
|
||||||
if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
|
const int32_t chunk_step = n_chunk - m_last_chunk;
|
||||||
save_imatrix(m_last_chunk);
|
m_last_chunk = n_chunk;
|
||||||
}
|
if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
|
||||||
|
save_imatrix();
|
||||||
|
}
|
||||||
|
if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
|
||||||
|
save_imatrix(m_last_chunk);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -492,13 +506,13 @@ void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
|
||||||
|
|
||||||
void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
|
void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
|
||||||
auto fname = m_params.out_file;
|
auto fname = m_params.out_file;
|
||||||
|
bool use_legacy_format = m_params.imat_dat;
|
||||||
|
|
||||||
// TODO: use the new format in more cases
|
if (use_legacy_format) {
|
||||||
if (!string_ends_with(fname, ".gguf")) {
|
|
||||||
LOG_WRN("\n%s: saving to legacy imatrix format because output suffix is not .gguf\n", __func__);
|
|
||||||
this->save_imatrix_legacy(n_chunk);
|
this->save_imatrix_legacy(n_chunk);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
// else, default to GGUF imatrix
|
||||||
|
|
||||||
if (n_chunk > 0) {
|
if (n_chunk > 0) {
|
||||||
fname += ".at_";
|
fname += ".at_";
|
||||||
|
|
|
||||||
|
|
@ -1738,7 +1738,7 @@ struct sql_printer : public printer {
|
||||||
|
|
||||||
void print_header(const cmd_params & params) override {
|
void print_header(const cmd_params & params) override {
|
||||||
std::vector<std::string> fields = test::get_fields();
|
std::vector<std::string> fields = test::get_fields();
|
||||||
fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
|
fprintf(fout, "CREATE TABLE IF NOT EXISTS llama_bench (\n");
|
||||||
for (size_t i = 0; i < fields.size(); i++) {
|
for (size_t i = 0; i < fields.size(); i++) {
|
||||||
fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
|
fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
|
||||||
i < fields.size() - 1 ? "," : "");
|
i < fields.size() - 1 ? "," : "");
|
||||||
|
|
@ -1749,7 +1749,7 @@ struct sql_printer : public printer {
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_test(const test & t) override {
|
void print_test(const test & t) override {
|
||||||
fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str());
|
fprintf(fout, "INSERT INTO llama_bench (%s) ", join(test::get_fields(), ", ").c_str());
|
||||||
fprintf(fout, "VALUES (");
|
fprintf(fout, "VALUES (");
|
||||||
std::vector<std::string> values = t.get_values();
|
std::vector<std::string> values = t.get_values();
|
||||||
for (size_t i = 0; i < values.size(); i++) {
|
for (size_t i = 0; i < values.size(); i++) {
|
||||||
|
|
|
||||||
|
|
@ -611,7 +611,7 @@ int main(int argc, char ** argv) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
||||||
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
|
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[arg_idx]);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
if (ftype_str == "COPY") {
|
if (ftype_str == "COPY") {
|
||||||
|
|
|
||||||
|
|
@ -4249,9 +4249,6 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// process prompt
|
// process prompt
|
||||||
std::vector<server_tokens> inputs;
|
std::vector<server_tokens> inputs;
|
||||||
if (oaicompat && !prompt.is_string()) {
|
|
||||||
throw std::runtime_error("prompt must be a string");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (oaicompat && has_mtmd) {
|
if (oaicompat && has_mtmd) {
|
||||||
// multimodal
|
// multimodal
|
||||||
|
|
|
||||||
|
|
@ -162,10 +162,15 @@ class chat_template {
|
||||||
}), false);
|
}), false);
|
||||||
caps_.supports_tools = contains(out, "some_tool");
|
caps_.supports_tools = contains(out, "some_tool");
|
||||||
|
|
||||||
|
auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
|
||||||
|
auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
|
||||||
|
caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
|
||||||
|
|
||||||
|
json j_null;
|
||||||
auto make_tool_calls_msg = [&](const json & tool_calls) {
|
auto make_tool_calls_msg = [&](const json & tool_calls) {
|
||||||
return json {
|
return json {
|
||||||
{"role", "assistant"},
|
{"role", "assistant"},
|
||||||
{"content", nullptr},
|
{"content", caps_.requires_non_null_content? "" : j_null},
|
||||||
{"tool_calls", tool_calls},
|
{"tool_calls", tool_calls},
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
@ -195,9 +200,6 @@ class chat_template {
|
||||||
|
|
||||||
caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
|
caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
|
||||||
caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
|
caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
|
||||||
auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
|
|
||||||
auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
|
|
||||||
caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
|
|
||||||
|
|
||||||
if (caps_.supports_tool_calls) {
|
if (caps_.supports_tool_calls) {
|
||||||
auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump());
|
auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump());
|
||||||
|
|
@ -234,7 +236,7 @@ class chat_template {
|
||||||
};
|
};
|
||||||
const json tool_call_msg {
|
const json tool_call_msg {
|
||||||
{"role", "assistant"},
|
{"role", "assistant"},
|
||||||
{"content", nullptr},
|
{"content", caps_.requires_non_null_content ? "" : j_null},
|
||||||
{"tool_calls", json::array({
|
{"tool_calls", json::array({
|
||||||
{
|
{
|
||||||
// TODO: detect if requires numerical id or fixed length == 6 like Nemo
|
// TODO: detect if requires numerical id or fixed length == 6 like Nemo
|
||||||
|
|
|
||||||
|
|
@ -1355,8 +1355,13 @@ public:
|
||||||
case Op::Gt: return l > r;
|
case Op::Gt: return l > r;
|
||||||
case Op::Le: return l <= r;
|
case Op::Le: return l <= r;
|
||||||
case Op::Ge: return l >= r;
|
case Op::Ge: return l >= r;
|
||||||
case Op::In: return (r.is_array() || r.is_object()) && r.contains(l);
|
case Op::In: return (((r.is_array() || r.is_object()) && r.contains(l)) ||
|
||||||
case Op::NotIn: return !(r.is_array() && r.contains(l));
|
(l.is_string() && r.is_string() &&
|
||||||
|
r.to_str().find(l.to_str()) != std::string::npos));
|
||||||
|
case Op::NotIn:
|
||||||
|
return !(((r.is_array() || r.is_object()) && r.contains(l)) ||
|
||||||
|
(l.is_string() && r.is_string() &&
|
||||||
|
r.to_str().find(l.to_str()) != std::string::npos));
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
throw std::runtime_error("Unknown binary operator");
|
throw std::runtime_error("Unknown binary operator");
|
||||||
|
|
@ -1552,6 +1557,19 @@ public:
|
||||||
else res[i] = std::tolower(res[i]);
|
else res[i] = std::tolower(res[i]);
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
|
} else if (method->get_name() == "replace") {
|
||||||
|
vargs.expectArgs("replace method", {2, 3}, {0, 0});
|
||||||
|
auto before = vargs.args[0].get<std::string>();
|
||||||
|
auto after = vargs.args[1].get<std::string>();
|
||||||
|
auto count = vargs.args.size() == 3 ? vargs.args[2].get<int64_t>()
|
||||||
|
: str.length();
|
||||||
|
size_t start_pos = 0;
|
||||||
|
while ((start_pos = str.find(before, start_pos)) != std::string::npos &&
|
||||||
|
count-- > 0) {
|
||||||
|
str.replace(start_pos, before.length(), after);
|
||||||
|
start_pos += after.length();
|
||||||
|
}
|
||||||
|
return str;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
throw std::runtime_error("Unknown method: " + method->get_name());
|
throw std::runtime_error("Unknown method: " + method->get_name());
|
||||||
|
|
@ -2128,7 +2146,7 @@ private:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((has_first_colon || has_second_colon) && (start || end || step)) {
|
if ((has_first_colon || has_second_colon)) {
|
||||||
index = std::make_shared<SliceExpr>(slice_loc, std::move(start), std::move(end), std::move(step));
|
index = std::make_shared<SliceExpr>(slice_loc, std::move(start), std::move(end), std::move(step));
|
||||||
} else {
|
} else {
|
||||||
index = std::move(start);
|
index = std::move(start);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue