Merge branch 'master' into imatrix

This commit is contained in:
Ed Addario 2025-12-23 16:58:05 +00:00
commit c52cd09238
No known key found for this signature in database
GPG Key ID: E7875815A3230993
73 changed files with 3110 additions and 1518 deletions

View File

@ -688,13 +688,15 @@ jobs:
- name: Pack artifacts - name: Pack artifacts
id: pack_artifacts id: pack_artifacts
run: | run: |
tar -czvf llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz -C build-apple llama.xcframework # Zip file is required for Swift Package Manager, which does not support tar.gz for binary targets.
# For more details, see https://developer.apple.com/documentation/xcode/distributing-binary-frameworks-as-swift-packages
zip -r -y llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
- name: Upload artifacts - name: Upload artifacts
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
openEuler-cann: openEuler-cann:
@ -863,7 +865,7 @@ jobs:
**macOS/iOS:** **macOS/iOS:**
- [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz) - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
- [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz) - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz) - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
**Linux:** **Linux:**
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz) - [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)

View File

@ -1078,6 +1078,8 @@ struct common_init_result::impl {
impl() = default; impl() = default;
~impl() = default; ~impl() = default;
// note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
llama_model_ptr model; llama_model_ptr model;
llama_context_ptr context; llama_context_ptr context;

View File

@ -141,16 +141,24 @@ class ModelBase:
self.model_name = model_name self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
if self.ftype == gguf.LlamaFileType.GUESSED: if self.ftype == gguf.LlamaFileType.GUESSED:
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. for _, tensor in self.get_tensors():
_, first_tensor = next(self.get_tensors()) if tensor.dim() < 2:
if first_tensor.dtype == torch.float16: continue
logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_F16 if tensor.dtype == torch.bfloat16:
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
logger.info("heuristics detected bfloat16 tensor dtype, setting --outtype bf16")
break
elif tensor.dtype == torch.float16:
self.ftype = gguf.LlamaFileType.MOSTLY_F16
logger.info("heuristics detected float16 tensor dtype, setting --outtype f16")
break
else: else:
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") self.ftype = gguf.LlamaFileType.MOSTLY_F16
self.ftype = gguf.LlamaFileType.MOSTLY_BF16 logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16")
self.dequant_model() self.dequant_model()
@ -1204,6 +1212,9 @@ class TextModel(ModelBase):
if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756": if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
# ref: https://huggingface.co/JetBrains/Mellum-4b-base # ref: https://huggingface.co/JetBrains/Mellum-4b-base
res = "mellum" res = "mellum"
if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152":
# ref: https://huggingface.co/answerdotai/ModernBERT-base
res = "modern-bert"
if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df": if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df":
# ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer # ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer
res = "afmoe" res = "afmoe"
@ -9991,6 +10002,36 @@ class SmallThinkerModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}") raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification")
class ModernBertModel(BertModel):
model_arch = gguf.MODEL_ARCH.MODERN_BERT
def set_vocab(self):
self.gguf_writer.add_add_bos_token(True)
self.gguf_writer.add_add_eos_token(True)
self.gguf_writer.add_add_sep_token(True)
self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None:
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
self.gguf_writer.add_rope_freq_base_swa(self.rope_parameters.get("sliding_attention", {"rope_theta": self.hparams.get("local_rope_theta")})["rope_theta"])
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# these layers act as MLM head, so we don't need them
if name.startswith("decoder."):
return []
if name.startswith("model."):
name = name[6:]
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("ApertusForCausalLM") @ModelBase.register("ApertusForCausalLM")
class ApertusModel(LlamaModel): class ApertusModel(LlamaModel):
model_arch = gguf.MODEL_ARCH.APERTUS model_arch = gguf.MODEL_ARCH.APERTUS
@ -10557,8 +10598,8 @@ def parse_args() -> argparse.Namespace:
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
) )
parser.add_argument( parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16", "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="auto",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type",
) )
parser.add_argument( parser.add_argument(
"--bigendian", action="store_true", "--bigendian", action="store_true",

View File

@ -139,6 +139,7 @@ models = [
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"}, {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", }, {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", }, {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
{"name": "modern-bert", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/answerdotai/ModernBERT-base", },
{"name": "afmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/arcee-ai/Trinity-Tokenizer", }, {"name": "afmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/arcee-ai/Trinity-Tokenizer", },
{"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", }, {"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
{"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", }, {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },

View File

@ -106,7 +106,7 @@ Here are some examples of running various llama.cpp tools via ADB.
Simple question for Llama-3.2-1B Simple question for Llama-3.2-1B
``` ```
~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?" ~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-completion.sh -p "what is the most popular cookie in the world?"
... ...
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1 ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
ggml-hex: Hexagon Arch version v79 ggml-hex: Hexagon Arch version v79
@ -136,7 +136,7 @@ llama_memory_breakdown_print: | - HTP0-REPACK | 504 =
Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices
``` ```
~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-cli.sh -f surfing.txt -no-cnv ~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-completion.sh -f surfing.txt
... ...
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1 ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
ggml-hex: Hexagon Arch version v81 ggml-hex: Hexagon Arch version v81
@ -234,6 +234,6 @@ build: 6a8cf8914 (6733)
Examples: Examples:
`GGML_HEXAGON_OPMASK=0x1 llama-cli ...` - Ops are enqueued but NPU-side processing is stubbed out `GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
`GGML_HEXAGON_OPMASK=0x3 llama-cli ...` - NPU performs dynamic quantization and skips the rest `GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - NPU performs dynamic quantization and skips the rest
`GGML_HEXAGON_OPMASK=0x7 llama-cli ...` - Full queuing and processing of Ops (default) `GGML_HEXAGON_OPMASK=0x7 llama-completion ...` - Full queuing and processing of Ops (default)

View File

@ -49,7 +49,7 @@ Each Hexagon device behaves like a GPU from the offload and model splitting pers
Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR. Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR.
``` ```
M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-cli.sh -no-cnv -f surfing.txt -n 32 M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-completion.sh -f surfing.txt -n 32
... ...
LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib

View File

@ -2,57 +2,74 @@
#include "common.h" #include "common.h"
#include <fstream> #include <fstream>
#include <sstream>
#include <string> #include <string>
// Export usage message (-h) to markdown format // Export usage message (-h) to markdown format
// Automatically update the markdown docs
static void write_table_header(std::ofstream & file) { #define HELP_START_MARKER "<!-- HELP_START -->"
file << "| Argument | Explanation |\n"; #define HELP_END_MARKER "<!-- HELP_END -->"
file << "| -------- | ----------- |\n"; #define NOTE_MESSAGE "<!-- IMPORTANT: The list below is auto-generated by llama-gen-docs; do NOT modify it manually -->"
struct md_file {
llama_example ex;
std::string fname;
std::string specific_section_header;
};
std::vector<md_file> md_files = {
{LLAMA_EXAMPLE_CLI, "tools/cli/README.md", "CLI-specific params"},
{LLAMA_EXAMPLE_COMPLETION, "tools/completion/README.md", "Completion-specific params"},
{LLAMA_EXAMPLE_SERVER, "tools/server/README.md", "Server-specific params"},
};
static void write_table_header(std::ostringstream & ss) {
ss << "| Argument | Explanation |\n";
ss << "| -------- | ----------- |\n";
} }
static void write_table_entry(std::ofstream & file, const common_arg & opt) { static void write_table_entry(std::ostringstream & ss, const common_arg & opt) {
file << "| `"; ss << "| `";
// args // args
auto all_args = opt.get_args(); auto all_args = opt.get_args();
for (const auto & arg : all_args) { for (const auto & arg : all_args) {
if (arg == all_args.front()) { if (arg == all_args.front()) {
file << arg; ss << arg;
if (all_args.size() > 1) file << ", "; if (all_args.size() > 1) ss << ", ";
} else { } else {
file << arg << (arg != all_args.back() ? ", " : ""); ss << arg << (arg != all_args.back() ? ", " : "");
} }
} }
// value hint // value hint
if (opt.value_hint) { if (opt.value_hint) {
std::string md_value_hint(opt.value_hint); std::string md_value_hint(opt.value_hint);
string_replace_all(md_value_hint, "|", "\\|"); string_replace_all(md_value_hint, "|", "\\|");
file << " " << md_value_hint; ss << " " << md_value_hint;
} }
if (opt.value_hint_2) { if (opt.value_hint_2) {
std::string md_value_hint_2(opt.value_hint_2); std::string md_value_hint_2(opt.value_hint_2);
string_replace_all(md_value_hint_2, "|", "\\|"); string_replace_all(md_value_hint_2, "|", "\\|");
file << " " << md_value_hint_2; ss << " " << md_value_hint_2;
} }
// help text // help text
std::string md_help(opt.help); std::string md_help(opt.help);
md_help = string_strip(md_help);
string_replace_all(md_help, "\n", "<br/>"); string_replace_all(md_help, "\n", "<br/>");
string_replace_all(md_help, "|", "\\|"); string_replace_all(md_help, "|", "\\|");
file << "` | " << md_help << " |\n"; ss << "` | " << md_help << " |\n";
} }
static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) { static void write_table(std::ostringstream & ss, std::vector<common_arg *> & opts) {
write_table_header(file); write_table_header(ss);
for (const auto & opt : opts) { for (const auto & opt : opts) {
write_table_entry(file, *opt); write_table_entry(ss, *opt);
} }
} }
static void export_md(std::string fname, llama_example ex, std::string name) { static void write_help(std::ostringstream & ss, const md_file & md) {
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
common_params params; common_params params;
auto ctx_arg = common_params_parser_init(params, ex); auto ctx_arg = common_params_parser_init(params, md.ex);
std::vector<common_arg *> common_options; std::vector<common_arg *> common_options;
std::vector<common_arg *> sparam_options; std::vector<common_arg *> sparam_options;
@ -68,18 +85,58 @@ static void export_md(std::string fname, llama_example ex, std::string name) {
} }
} }
file << "**Common params**\n\n"; ss << HELP_START_MARKER << "\n\n";
write_table(file, common_options);
file << "\n\n**Sampling params**\n\n"; ss << NOTE_MESSAGE << "\n\n";
write_table(file, sparam_options);
file << "\n\n**" << name << "-specific params**\n\n"; ss << "### Common params\n\n";
write_table(file, specific_options); write_table(ss, common_options);
ss << "\n\n### Sampling params\n\n";
write_table(ss, sparam_options);
ss << "\n\n### " << md.specific_section_header << "\n\n";
write_table(ss, specific_options);
ss << "\n" << HELP_END_MARKER;
} }
int main(int, char **) { int main(int, char **) {
// TODO: add CLI for (const auto & md : md_files) {
export_md("autogen-completion.md", LLAMA_EXAMPLE_COMPLETION, "Tool"); std::ifstream infile(md.fname);
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER, "Server"); if (!infile.is_open()) {
fprintf(stderr, "failed to open file '%s' for reading\n", md.fname.c_str());
return 1;
}
std::ostringstream ss;
ss << infile.rdbuf();
infile.close();
std::string content = ss.str();
size_t help_start = content.find(HELP_START_MARKER);
size_t help_end = content.find(HELP_END_MARKER);
if (help_start == std::string::npos || help_end == std::string::npos || help_end <= help_start) {
fprintf(stderr, "failed to find help markers in file '%s'\n", md.fname.c_str());
return 1;
}
std::ostringstream new_help_ss;
write_help(new_help_ss, md);
std::string new_help = new_help_ss.str();
content = content.substr(0, help_start) + new_help + content.substr(help_end + strlen(HELP_END_MARKER));
std::ofstream outfile(md.fname);
if (!outfile.is_open()) {
fprintf(stderr, "failed to open file '%s' for writing\n", md.fname.c_str());
return 1;
}
outfile << content;
outfile.close();
printf("Updated help in '%s'\n", md.fname.c_str());
}
return 0; return 0;
} }

View File

@ -25,6 +25,8 @@ define quantize_model
@echo "Export the quantized model path to $(2) variable in your environment" @echo "Export the quantized model path to $(2) variable in your environment"
endef endef
DEVICE ?= auto
### ###
### Casual Model targets/recipes ### Casual Model targets/recipes
### ###
@ -53,7 +55,7 @@ causal-convert-mm-model:
causal-run-original-model: causal-run-original-model:
$(call validate_model_path,causal-run-original-model) $(call validate_model_path,causal-run-original-model)
@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py @MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py --device "$(DEVICE)"
causal-run-converted-model: causal-run-converted-model:
@CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh @CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh

View File

@ -4,149 +4,179 @@ import argparse
import os import os
import sys import sys
import importlib import importlib
import torch
import numpy as np
from pathlib import Path from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
# Add parent directory to path for imports # Add parent directory to path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
import torch
import numpy as np
from utils.common import debug_hook from utils.common import debug_hook
parser = argparse.ArgumentParser(description="Process model with specified path") def parse_arguments():
parser.add_argument("--model-path", "-m", help="Path to the model") parser = argparse.ArgumentParser(description="Process model with specified path")
parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False) parser.add_argument("--model-path", "-m", help="Path to the model")
parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose debug output") parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False)
args = parser.parse_args() parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose debug output")
parser.add_argument("--device", "-d", help="Device to use (cpu, cuda, mps, auto)", default="auto")
return parser.parse_args()
model_path = os.environ.get("MODEL_PATH", args.model_path) def load_model_and_tokenizer(model_path, device="auto"):
if model_path is None: print("Loading model and tokenizer using AutoTokenizer:", model_path)
parser.error( tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
"Model path must be specified either via --model-path argument or MODEL_PATH environment variable" config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
) multimodal = False
full_config = config
### If you want to dump RoPE activations, uncomment the following lines: # Determine device_map based on device argument
### === START ROPE DEBUG === if device == "cpu":
# from utils.common import setup_rope_debug device_map = {"": "cpu"}
# setup_rope_debug("transformers.models.apertus.modeling_apertus") print("Forcing CPU usage")
### == END ROPE DEBUG === elif device == "auto":
device_map = "auto"
print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
multimodal = False
full_config = config
print("Model type: ", config.model_type)
if "vocab_size" not in config and "text_config" in config:
config = config.text_config
multimodal = True
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = (
f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
)
class_name = f"{unreleased_model_name}ForCausalLM"
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(
importlib.import_module(unreleased_module_path), class_name
)
model = model_class.from_pretrained(
model_path
) # Note: from_pretrained, not fromPretrained
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
if multimodal:
model = AutoModelForImageTextToText.from_pretrained(
model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=full_config
)
else: else:
model = AutoModelForCausalLM.from_pretrained( device_map = {"": device}
model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=config
print("Model type: ", config.model_type)
if "vocab_size" not in config and "text_config" in config:
config = config.text_config
multimodal = True
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = (
f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
) )
class_name = f"{unreleased_model_name}ForCausalLM"
print(f"Importing unreleased model module: {unreleased_module_path}")
if args.verbose: try:
for name, module in model.named_modules(): model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
if len(list(module.children())) == 0: # only leaf modules model = model_class.from_pretrained(
module.register_forward_hook(debug_hook(name)) model_path,
device_map=device_map,
offload_folder="offload",
trust_remote_code=True,
config=config
)
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
if multimodal:
model = AutoModelForImageTextToText.from_pretrained(
model_path,
device_map=device_map,
offload_folder="offload",
trust_remote_code=True,
config=full_config
)
else:
model = AutoModelForCausalLM.from_pretrained(
model_path,
device_map=device_map,
offload_folder="offload",
trust_remote_code=True,
config=config
)
model_name = os.path.basename(model_path) print(f"Model class: {model.__class__.__name__}")
# Printing the Model class to allow for easier debugging. This can be useful
# when working with models that have not been publicly released yet and this
# migth require that the concrete class is imported and used directly instead
# of using AutoModelForCausalLM.
print(f"Model class: {model.__class__.__name__}")
device = next(model.parameters()).device return model, tokenizer, config
if args.prompt_file:
with open(args.prompt_file, encoding='utf-8') as f:
prompt = f.read()
elif os.getenv("MODEL_TESTING_PROMPT"):
prompt = os.getenv("MODEL_TESTING_PROMPT")
else:
prompt = "Hello, my name is"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
print(f"Input tokens: {input_ids}") def enable_torch_debugging(model):
print(f"Input text: {repr(prompt)}") for name, module in model.named_modules():
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}") if len(list(module.children())) == 0: # only leaf modules
module.register_forward_hook(debug_hook(name))
batch_size = 512 def get_prompt(args):
if args.prompt_file:
with open(args.prompt_file, encoding='utf-8') as f:
return f.read()
elif os.getenv("MODEL_TESTING_PROMPT"):
return os.getenv("MODEL_TESTING_PROMPT")
else:
return "Hello, my name is"
with torch.no_grad(): def main():
past = None args = parse_arguments()
outputs = None model_path = os.environ.get("MODEL_PATH", args.model_path)
for i in range(0, input_ids.size(1), batch_size): if model_path is None:
print(f"Processing chunk with tokens {i} to {i + batch_size}") print("Error: Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
chunk = input_ids[:, i:i + batch_size] sys.exit(1)
outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True)
past = outputs.past_key_values
logits = outputs.logits # type: ignore
# Extract logits for the last token (next token prediction) model, tokenizer, config = load_model_and_tokenizer(model_path, args.device)
last_logits = logits[0, -1, :].float().cpu().numpy()
print(f"Logits shape: {logits.shape}") if args.verbose:
print(f"Last token logits shape: {last_logits.shape}") enable_torch_debugging(model)
print(f"Vocab size: {len(last_logits)}")
data_dir = Path("data") model_name = os.path.basename(model_path)
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}.bin"
txt_filename = data_dir / f"pytorch-{model_name}.txt"
# Save to file for comparison # Iterate over the model parameters (the tensors) and get the first one
last_logits.astype(np.float32).tofile(bin_filename) # and use it to get the device the model is on.
device = next(model.parameters()).device
prompt = get_prompt(args)
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
# Also save as text file for easy inspection print(f"Input tokens: {input_ids}")
with open(txt_filename, "w") as f: print(f"Input text: {repr(prompt)}")
for i, logit in enumerate(last_logits): print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
f.write(f"{i}: {logit:.6f}\n")
# Print some sample logits for quick verification batch_size = 512
print(f"First 10 logits: {last_logits[:10]}")
print(f"Last 10 logits: {last_logits[-10:]}")
# Show top 5 predicted tokens with torch.no_grad():
top_indices = np.argsort(last_logits)[-5:][::-1] past = None
print("Top 5 predictions:") outputs = None
for idx in top_indices: for i in range(0, input_ids.size(1), batch_size):
token = tokenizer.decode([idx]) print(f"Processing chunk with tokens {i} to {i + batch_size}")
print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}") chunk = input_ids[:, i:i + batch_size]
outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True)
past = outputs.past_key_values
print(f"Saved bin logits to: {bin_filename}") logits = outputs.logits # type: ignore
print(f"Saved txt logist to: {txt_filename}")
# Extract logits for the last token (next token prediction)
last_logits = logits[0, -1, :].float().cpu().numpy()
print(f"Logits shape: {logits.shape}")
print(f"Last token logits shape: {last_logits.shape}")
print(f"Vocab size: {len(last_logits)}")
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}.bin"
txt_filename = data_dir / f"pytorch-{model_name}.txt"
# Save to file for comparison
last_logits.astype(np.float32).tofile(bin_filename)
# Also save as text file for easy inspection
with open(txt_filename, "w") as f:
for i, logit in enumerate(last_logits):
f.write(f"{i}: {logit:.6f}\n")
# Print some sample logits for quick verification
print(f"First 10 logits: {last_logits[:10]}")
print(f"Last 10 logits: {last_logits[-10:]}")
# Show top 5 predicted tokens
top_indices = np.argsort(last_logits)[-5:][::-1]
print("Top 5 predictions:")
for idx in top_indices:
token = tokenizer.decode([idx])
print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
print(f"Saved bin logits to: {bin_filename}")
print(f"Saved txt logist to: {txt_filename}")
if __name__ == "__main__":
main()

View File

@ -45,7 +45,7 @@ if use_sentence_transformers:
else: else:
tokenizer = AutoTokenizer.from_pretrained(model_path) tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path) config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
# This can be used to override the sliding window size for manual testing. This # This can be used to override the sliding window size for manual testing. This
# can be useful to verify the sliding window attention mask in the original model # can be useful to verify the sliding window attention mask in the original model
@ -64,12 +64,12 @@ else:
try: try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name) model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(model_path, config=config) model = model_class.from_pretrained(model_path, config=config, trust_remote_code=True)
except (ImportError, AttributeError) as e: except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}") print(f"Failed to import or load model: {e}")
exit(1) exit(1)
else: else:
model = AutoModel.from_pretrained(model_path, config=config) model = AutoModel.from_pretrained(model_path, config=config, trust_remote_code=True)
print(f"Model class: {type(model)}") print(f"Model class: {type(model)}")
print(f"Model file: {type(model).__module__}") print(f"Model file: {type(model).__module__}")
@ -123,7 +123,7 @@ with torch.no_grad():
outputs = model(**encoded) outputs = model(**encoded)
hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size] hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size]
all_embeddings = hidden_states[0].cpu().numpy() # Shape: [seq_len, hidden_size] all_embeddings = hidden_states[0].float().cpu().numpy() # Shape: [seq_len, hidden_size]
print(f"Hidden states shape: {hidden_states.shape}") print(f"Hidden states shape: {hidden_states.shape}")
print(f"All embeddings shape: {all_embeddings.shape}") print(f"All embeddings shape: {all_embeddings.shape}")

View File

@ -166,7 +166,7 @@ def main():
# Load the python model to get configuration information and also to load the tokenizer. # Load the python model to get configuration information and also to load the tokenizer.
print("Loading model and tokenizer using AutoTokenizer:", args.model_path) print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_path) tokenizer = AutoTokenizer.from_pretrained(args.model_path)
config = AutoConfig.from_pretrained(args.model_path) config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)
if unreleased_model_name: if unreleased_model_name:
model_name_lower = unreleased_model_name.lower() model_name_lower = unreleased_model_name.lower()
@ -186,9 +186,9 @@ def main():
exit(1) exit(1)
else: else:
if args.causal: if args.causal:
model = AutoModelForCausalLM.from_pretrained(args.model_path) model = AutoModelForCausalLM.from_pretrained(args.model_path, trust_remote_code=True)
else: else:
model = AutoModel.from_pretrained(args.model_path) model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
encoded = tokenizer(prompt, return_tensors="pt") encoded = tokenizer(prompt, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0]) tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])

View File

@ -22,9 +22,9 @@ if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1 GGML_SYCL_DEVICE=$1
echo "use $GGML_SYCL_DEVICE as main GPU" echo "use $GGML_SYCL_DEVICE as main GPU"
#use signle GPU only #use signle GPU only
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
else else
#use multiple GPUs with same max compute units #use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
fi fi

View File

@ -24,8 +24,8 @@ export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
if [ $# -gt 0 ]; then if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1 GGML_SYCL_DEVICE=$1
echo "Using $GGML_SYCL_DEVICE as the main GPU" echo "Using $GGML_SYCL_DEVICE as the main GPU"
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
else else
#use multiple GPUs with same max compute units #use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
fi fi

View File

@ -8,4 +8,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
:: support malloc device memory more than 4GB. :: support malloc device memory more than 4GB.
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1 set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0 .\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0

View File

@ -8,4 +8,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
:: support malloc device memory more than 4GB. :: support malloc device memory more than 4GB.
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1 set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -s 0 -e -ngl 99 .\build\bin\llama-completion.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -no-cnv -p %INPUT2% -n 400 -s 0 -e -ngl 99

View File

@ -69,6 +69,10 @@
#define VECTOR_REGISTERS 16 #define VECTOR_REGISTERS 16
#endif #endif
#if defined(__riscv_v_intrinsic)
#define LMUL 4
#endif
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
namespace { namespace {
@ -175,6 +179,46 @@ inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
} }
#endif #endif
#if defined(__riscv_zvfh)
template <>
inline vfloat32m1_t madd(vfloat16mf2_t a, vfloat16mf2_t b, vfloat32m1_t c) {
return __riscv_vfwmacc_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
}
inline vfloat32m2_t madd(vfloat16m1_t a, vfloat16m1_t b, vfloat32m2_t c) {
return __riscv_vfwmacc_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
}
inline vfloat32m4_t madd(vfloat16m2_t a, vfloat16m2_t b, vfloat32m4_t c) {
return __riscv_vfwmacc_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
}
inline vfloat32m8_t madd(vfloat16m4_t a, vfloat16m4_t b, vfloat32m8_t c) {
return __riscv_vfwmacc_vv_f32m8(c, a, b, __riscv_vsetvlmax_e32m8());
}
inline vfloat32m1_t madd(vfloat32m1_t a, vfloat32m1_t b, vfloat32m1_t c) {
return __riscv_vfmacc_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
}
inline vfloat32m2_t madd(vfloat32m2_t a, vfloat32m2_t b, vfloat32m2_t c) {
return __riscv_vfmacc_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
}
inline vfloat32m4_t madd(vfloat32m4_t a, vfloat32m4_t b, vfloat32m4_t c) {
return __riscv_vfmacc_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
}
inline vfloat32m8_t madd(vfloat32m8_t a, vfloat32m8_t b, vfloat32m8_t c) {
return __riscv_vfmacc_vv_f32m8(c, a, b, __riscv_vsetvlmax_e32m8());
}
#endif
#if defined(__riscv_zvfbfwma)
inline vfloat32m1_t madd(vbfloat16mf2_t a, vbfloat16mf2_t b, vfloat32m1_t c) {
return __riscv_vfwmaccbf16_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
}
inline vfloat32m2_t madd(vbfloat16m1_t a, vbfloat16m1_t b, vfloat32m2_t c) {
return __riscv_vfwmaccbf16_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
}
inline vfloat32m4_t madd(vbfloat16m2_t a, vbfloat16m2_t b, vfloat32m4_t c) {
return __riscv_vfwmaccbf16_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
}
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED HORIZONTAL SUM // VECTORIZED HORIZONTAL SUM
@ -227,6 +271,25 @@ inline float hsum(__m512 x) {
} }
#endif // __AVX512F__ #endif // __AVX512F__
#if defined(__riscv_zvfh)
inline float hsum(vfloat32m1_t x) {
return __riscv_vfmv_f_s_f32m1_f32(
__riscv_vfredusum_vs_f32m1_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m1()));
}
inline float hsum(vfloat32m2_t x) {
return __riscv_vfmv_f_s_f32m1_f32(
__riscv_vfredusum_vs_f32m2_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m2()));
}
inline float hsum(vfloat32m4_t x) {
return __riscv_vfmv_f_s_f32m1_f32(
__riscv_vfredusum_vs_f32m4_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m4()));
}
inline float hsum(vfloat32m8_t x) {
return __riscv_vfmv_f_s_f32m1_f32(
__riscv_vfredusum_vs_f32m8_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m8()));
}
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED MEMORY LOADING // VECTORIZED MEMORY LOADING
@ -315,6 +378,88 @@ template <> inline __m256bh load(const float *p) {
} }
#endif #endif
#if defined(__riscv_zvfh)
template <> inline vfloat16mf2_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16mf2(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16mf2());
}
template <> inline vfloat16m1_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16m1(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16m1());
}
template <> inline vfloat16m2_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16m2(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16m2());
}
template <> inline vfloat16m4_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16m4(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16m4());
}
template <> inline vfloat32m1_t load(const float *p) {
return __riscv_vle32_v_f32m1(p, __riscv_vsetvlmax_e32m1());
}
template <> inline vfloat32m2_t load(const float *p) {
return __riscv_vle32_v_f32m2(p, __riscv_vsetvlmax_e32m2());
}
template <> inline vfloat32m4_t load(const float *p) {
return __riscv_vle32_v_f32m4(p, __riscv_vsetvlmax_e32m4());
}
template <> inline vfloat32m8_t load(const float *p) {
return __riscv_vle32_v_f32m8(p, __riscv_vsetvlmax_e32m8());
}
#endif
#if defined(__riscv_zvfbfwma)
template <> inline vbfloat16mf2_t load(const ggml_bf16_t *p) {
return __riscv_vle16_v_bf16mf2(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16mf2());
}
template <> inline vbfloat16m1_t load(const ggml_bf16_t *p) {
return __riscv_vle16_v_bf16m1(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16m1());
}
template <> inline vbfloat16m2_t load(const ggml_bf16_t *p) {
return __riscv_vle16_v_bf16m2(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16m2());
}
#endif
#if defined(__riscv_zvfh)
template <typename T> T set_zero();
template <> inline vfloat16mf2_t set_zero() {
return __riscv_vfmv_v_f_f16mf2(0, __riscv_vsetvlmax_e16mf2());
}
template <> inline vfloat16m1_t set_zero() {
return __riscv_vfmv_v_f_f16m1(0, __riscv_vsetvlmax_e16m1());
}
template <> inline vfloat16m2_t set_zero() {
return __riscv_vfmv_v_f_f16m2(0, __riscv_vsetvlmax_e16m2());
}
template <> inline vfloat16m4_t set_zero() {
return __riscv_vfmv_v_f_f16m4(0, __riscv_vsetvlmax_e16m4());
}
template <> inline vfloat32m1_t set_zero() {
return __riscv_vfmv_v_f_f32m1(0.0f, __riscv_vsetvlmax_e32m1());
}
template <> inline vfloat32m2_t set_zero() {
return __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
}
template <> inline vfloat32m4_t set_zero() {
return __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
}
template <> inline vfloat32m8_t set_zero() {
return __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
}
#endif
#if defined(__riscv_v_intrinsic)
template <typename T> size_t vlmax() {
if constexpr (std::is_same_v<T, vfloat16mf2_t>) { return __riscv_vsetvlmax_e16mf2(); }
else if constexpr (std::is_same_v<T, vfloat16m1_t>) { return __riscv_vsetvlmax_e16m1(); }
else if constexpr (std::is_same_v<T, vfloat16m2_t>) { return __riscv_vsetvlmax_e16m2(); }
else if constexpr (std::is_same_v<T, vfloat16m4_t>) { return __riscv_vsetvlmax_e16m4(); }
else if constexpr (std::is_same_v<T, vfloat32m1_t>) { return __riscv_vsetvlmax_e32m1(); }
else if constexpr (std::is_same_v<T, vfloat32m2_t>) { return __riscv_vsetvlmax_e32m2(); }
else if constexpr (std::is_same_v<T, vfloat32m4_t>) { return __riscv_vsetvlmax_e32m4(); }
else if constexpr (std::is_same_v<T, vfloat32m8_t>) { return __riscv_vsetvlmax_e32m8(); }
return 0;
}
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION // FLOATING POINT MATRIX MULTIPLICATION
@ -488,6 +633,573 @@ class tinyBLAS {
const int64_t ldc; const int64_t ldc;
}; };
#if defined(__riscv_v_intrinsic)
template <typename D, typename V, typename TA, typename TB, typename TC>
class tinyBLAS_RVV {
public:
tinyBLAS_RVV(const ggml_compute_params * params, int64_t k,
const TA *A, int64_t lda,
const TB *B, int64_t ldb,
TC *C, int64_t ldc)
: params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
}
bool matmul(int64_t m, int64_t n) {
if (k % vlmax<V>() != 0) {
return false;
}
#if LMUL == 1
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 4>(m, n, SIZE_N, 12);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 2>(m, n, SIZE_N, 12);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 1>(m, n, SIZE_N, 12);
return true;
}
#elif LMUL == 2
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 4>(m, n, SIZE_N, 24);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 2>(m, n, SIZE_N, 24);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 1>(m, n, SIZE_N, 24);
return true;
}
#else // LMUL = 4
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<2>(n);
mnpack<2, 2, 8>(m, n, SIZE_N, 36);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<2>(n);
mnpack<2, 2, 4>(m, n, SIZE_N, 36);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<2>(n);
mnpack<2, 2, 2>(m, n, SIZE_N, 36);
return true;
}
#endif
return false;
}
private:
template<int RM, int RN, int BM>
inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
if (SIZE_N == RN) {
return gemm<RM, RN, BM>(m, n, BN);
}
if constexpr (RN > 1) {
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
} else {
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
GGML_ASSERT(false); // we have miss something.
}
}
inline void gemm_bloc_4x6(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
D Cv20 = set_zero<D>();
D Cv21 = set_zero<D>();
D Cv22 = set_zero<D>();
D Cv23 = set_zero<D>();
D Cv30 = set_zero<D>();
D Cv31 = set_zero<D>();
D Cv32 = set_zero<D>();
D Cv33 = set_zero<D>();
D Cv40 = set_zero<D>();
D Cv41 = set_zero<D>();
D Cv42 = set_zero<D>();
D Cv43 = set_zero<D>();
D Cv50 = set_zero<D>();
D Cv51 = set_zero<D>();
D Cv52 = set_zero<D>();
D Cv53 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
V Bv3 = load<V>(B + ldb * (jj + 3) + l);
V Bv4 = load<V>(B + ldb * (jj + 4) + l);
V Bv5 = load<V>(B + ldb * (jj + 5) + l);
V Av0 = load<V>(A + lda * (ii + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv10 = madd(Av0, Bv1, Cv10);
Cv20 = madd(Av0, Bv2, Cv20);
Cv30 = madd(Av0, Bv3, Cv30);
Cv40 = madd(Av0, Bv4, Cv40);
Cv50 = madd(Av0, Bv5, Cv50);
V Av1 = load<V>(A + lda * (ii + 1) + l);
Cv01 = madd(Av1, Bv0, Cv01);
Cv11 = madd(Av1, Bv1, Cv11);
Cv21 = madd(Av1, Bv2, Cv21);
Cv31 = madd(Av1, Bv3, Cv31);
Cv41 = madd(Av1, Bv4, Cv41);
Cv51 = madd(Av1, Bv5, Cv51);
V Av2 = load<V>(A + lda * (ii + 2) + l);
Cv02 = madd(Av2, Bv0, Cv02);
Cv12 = madd(Av2, Bv1, Cv12);
Cv22 = madd(Av2, Bv2, Cv22);
Cv32 = madd(Av2, Bv3, Cv32);
Cv42 = madd(Av2, Bv4, Cv42);
Cv52 = madd(Av2, Bv5, Cv52);
V Av3 = load<V>(A + lda * (ii + 3) + l);
Cv03 = madd(Av3, Bv0, Cv03);
Cv13 = madd(Av3, Bv1, Cv13);
Cv23 = madd(Av3, Bv2, Cv23);
Cv33 = madd(Av3, Bv3, Cv33);
Cv43 = madd(Av3, Bv4, Cv43);
Cv53 = madd(Av3, Bv5, Cv53);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
C[ldc * (jj + 3) + (ii + 0)] = hsum(Cv30);
C[ldc * (jj + 3) + (ii + 1)] = hsum(Cv31);
C[ldc * (jj + 3) + (ii + 2)] = hsum(Cv32);
C[ldc * (jj + 3) + (ii + 3)] = hsum(Cv33);
C[ldc * (jj + 4) + (ii + 0)] = hsum(Cv40);
C[ldc * (jj + 4) + (ii + 1)] = hsum(Cv41);
C[ldc * (jj + 4) + (ii + 2)] = hsum(Cv42);
C[ldc * (jj + 4) + (ii + 3)] = hsum(Cv43);
C[ldc * (jj + 5) + (ii + 0)] = hsum(Cv50);
C[ldc * (jj + 5) + (ii + 1)] = hsum(Cv51);
C[ldc * (jj + 5) + (ii + 2)] = hsum(Cv52);
C[ldc * (jj + 5) + (ii + 3)] = hsum(Cv53);
}
inline void gemm_bloc_4x5(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
D Cv20 = set_zero<D>();
D Cv21 = set_zero<D>();
D Cv22 = set_zero<D>();
D Cv23 = set_zero<D>();
D Cv30 = set_zero<D>();
D Cv31 = set_zero<D>();
D Cv32 = set_zero<D>();
D Cv33 = set_zero<D>();
D Cv40 = set_zero<D>();
D Cv41 = set_zero<D>();
D Cv42 = set_zero<D>();
D Cv43 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
V Bv3 = load<V>(B + ldb * (jj + 3) + l);
V Bv4 = load<V>(B + ldb * (jj + 4) + l);
V Av0 = load<V>(A + lda * (ii + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv10 = madd(Av0, Bv1, Cv10);
Cv20 = madd(Av0, Bv2, Cv20);
Cv30 = madd(Av0, Bv3, Cv30);
Cv40 = madd(Av0, Bv4, Cv40);
V Av1 = load<V>(A + lda * (ii + 1) + l);
Cv01 = madd(Av1, Bv0, Cv01);
Cv11 = madd(Av1, Bv1, Cv11);
Cv21 = madd(Av1, Bv2, Cv21);
Cv31 = madd(Av1, Bv3, Cv31);
Cv41 = madd(Av1, Bv4, Cv41);
V Av2 = load<V>(A + lda * (ii + 2) + l);
Cv02 = madd(Av2, Bv0, Cv02);
Cv12 = madd(Av2, Bv1, Cv12);
Cv22 = madd(Av2, Bv2, Cv22);
Cv32 = madd(Av2, Bv3, Cv32);
Cv42 = madd(Av2, Bv4, Cv42);
V Av3 = load<V>(A + lda * (ii + 3) + l);
Cv03 = madd(Av3, Bv0, Cv03);
Cv13 = madd(Av3, Bv1, Cv13);
Cv23 = madd(Av3, Bv2, Cv23);
Cv33 = madd(Av3, Bv3, Cv33);
Cv43 = madd(Av3, Bv4, Cv43);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
C[ldc * (jj + 3) + (ii + 0)] = hsum(Cv30);
C[ldc * (jj + 3) + (ii + 1)] = hsum(Cv31);
C[ldc * (jj + 3) + (ii + 2)] = hsum(Cv32);
C[ldc * (jj + 3) + (ii + 3)] = hsum(Cv33);
C[ldc * (jj + 4) + (ii + 0)] = hsum(Cv40);
C[ldc * (jj + 4) + (ii + 1)] = hsum(Cv41);
C[ldc * (jj + 4) + (ii + 2)] = hsum(Cv42);
C[ldc * (jj + 4) + (ii + 3)] = hsum(Cv43);
}
inline void gemm_bloc_4x4(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
D Cv20 = set_zero<D>();
D Cv21 = set_zero<D>();
D Cv22 = set_zero<D>();
D Cv23 = set_zero<D>();
D Cv30 = set_zero<D>();
D Cv31 = set_zero<D>();
D Cv32 = set_zero<D>();
D Cv33 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Av2 = load<V>(A + lda * (ii + 2) + l);
V Av3 = load<V>(A + lda * (ii + 3) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
Cv02 = madd(Av2, Bv0, Cv02);
Cv03 = madd(Av3, Bv0, Cv03);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
Cv10 = madd(Av0, Bv1, Cv10);
Cv11 = madd(Av1, Bv1, Cv11);
Cv12 = madd(Av2, Bv1, Cv12);
Cv13 = madd(Av3, Bv1, Cv13);
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
Cv20 = madd(Av0, Bv2, Cv20);
Cv21 = madd(Av1, Bv2, Cv21);
Cv22 = madd(Av2, Bv2, Cv22);
Cv23 = madd(Av3, Bv2, Cv23);
V Bv3 = load<V>(B + ldb * (jj + 3) + l);
Cv30 = madd(Av0, Bv3, Cv30);
Cv31 = madd(Av1, Bv3, Cv31);
Cv32 = madd(Av2, Bv3, Cv32);
Cv33 = madd(Av3, Bv3, Cv33);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
C[ldc * (jj + 3) + (ii + 0)] = hsum(Cv30);
C[ldc * (jj + 3) + (ii + 1)] = hsum(Cv31);
C[ldc * (jj + 3) + (ii + 2)] = hsum(Cv32);
C[ldc * (jj + 3) + (ii + 3)] = hsum(Cv33);
}
inline void gemm_bloc_4x3(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
D Cv20 = set_zero<D>();
D Cv21 = set_zero<D>();
D Cv22 = set_zero<D>();
D Cv23 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Av2 = load<V>(A + lda * (ii + 2) + l);
V Av3 = load<V>(A + lda * (ii + 3) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
Cv02 = madd(Av2, Bv0, Cv02);
Cv03 = madd(Av3, Bv0, Cv03);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
Cv10 = madd(Av0, Bv1, Cv10);
Cv11 = madd(Av1, Bv1, Cv11);
Cv12 = madd(Av2, Bv1, Cv12);
Cv13 = madd(Av3, Bv1, Cv13);
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
Cv20 = madd(Av0, Bv2, Cv20);
Cv21 = madd(Av1, Bv2, Cv21);
Cv22 = madd(Av2, Bv2, Cv22);
Cv23 = madd(Av3, Bv2, Cv23);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
}
inline void gemm_bloc_4x2(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Av2 = load<V>(A + lda * (ii + 2) + l);
V Av3 = load<V>(A + lda * (ii + 3) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
Cv02 = madd(Av2, Bv0, Cv02);
Cv03 = madd(Av3, Bv0, Cv03);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
Cv10 = madd(Av0, Bv1, Cv10);
Cv11 = madd(Av1, Bv1, Cv11);
Cv12 = madd(Av2, Bv1, Cv12);
Cv13 = madd(Av3, Bv1, Cv13);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
}
inline void gemm_bloc_4x1(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Av2 = load<V>(A + lda * (ii + 2) + l);
V Av3 = load<V>(A + lda * (ii + 3) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
Cv02 = madd(Av2, Bv0, Cv02);
Cv03 = madd(Av3, Bv0, Cv03);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
}
inline void gemm_bloc_2x2(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
Cv10 = madd(Av0, Bv1, Cv10);
Cv11 = madd(Av1, Bv1, Cv11);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
}
inline void gemm_bloc_2x1(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
}
template <int RM, int RN>
inline void gemm_bloc(int64_t ii, int64_t jj) {
if constexpr (RM == 4) {
if constexpr (RN == 6) { return gemm_bloc_4x6(ii, jj); }
if constexpr (RN == 5) { return gemm_bloc_4x5(ii, jj); }
if constexpr (RN == 4) { return gemm_bloc_4x4(ii, jj); }
if constexpr (RN == 3) { return gemm_bloc_4x3(ii, jj); }
if constexpr (RN == 2) { return gemm_bloc_4x2(ii, jj); }
if constexpr (RN == 1) { return gemm_bloc_4x1(ii, jj); }
} else if constexpr (RM == 2) {
if constexpr (RN == 2) { return gemm_bloc_2x2(ii, jj); }
if constexpr (RN == 1) { return gemm_bloc_2x1(ii, jj); }
}
}
template <int RM, int RN, int BM>
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
GGML_ASSERT(m % (RM * BM) == 0);
const int64_t ytiles = m / (RM * BM);
const int64_t xtiles = (n + RN -1) / RN;
const int64_t jj_RN = (xtiles - (xtiles * RN - n));
// "round" bloc_size to "nearest" BN
const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
const int64_t nb_job = ytiles * NB_BN;
if (params->ith == 0) {
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
ggml_threadpool_chunk_set(params->threadpool, params->nth);
}
ggml_barrier(params->threadpool);
int64_t job = params->ith;
while (job < nb_job) {
const int64_t ii = (job % ytiles) * RM * BM;
const int64_t jb = job / ytiles;
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
for (int64_t bi = 0; bi < BM * RM; bi += RM) {
int64_t jj = jj0;
for (; jj < jj1; jj += RN) {
gemm_bloc<RM, RN>(ii + bi, jj);
}
if constexpr (RN > 1) {
for (; jj < jj2; jj += RN - 1) {
gemm_bloc<RM, RN-1>(ii + bi, jj);
}
}
GGML_ASSERT(jj == jj2);
}
job = ggml_threadpool_chunk_add(params->threadpool, 1);
}
ggml_barrier(params->threadpool);
return;
}
const ggml_compute_params * params;
const TA *const A;
const TB *const B;
TC *const C;
const int64_t k;
const int64_t lda;
const int64_t ldb;
const int64_t ldc;
};
#endif
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
// QUANT ZERO MATRIX MULTIPLICATION // QUANT ZERO MATRIX MULTIPLICATION
@ -2657,6 +3369,24 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
params->ith, params->nth}; params->ith, params->nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#elif defined(__riscv_zvfh)
#if LMUL == 1
tinyBLAS_RVV<vfloat32m1_t, vfloat32m1_t, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc};
#elif LMUL == 2
tinyBLAS_RVV<vfloat32m2_t, vfloat32m2_t, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc};
#else // LMUL = 4
tinyBLAS_RVV<vfloat32m4_t, vfloat32m4_t, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc};
#endif
return tb.matmul(m, n);
#else #else
return false; return false;
#endif #endif
@ -2699,6 +3429,24 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
} }
#elif defined(__riscv_zvfbfwma)
#if LMUL == 1
tinyBLAS_RVV<vfloat32m1_t, vbfloat16mf2_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
k, (const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
#elif LMUL == 2
tinyBLAS_RVV<vfloat32m2_t, vbfloat16m1_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
k, (const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
#else // LMUL = 4
tinyBLAS_RVV<vfloat32m4_t, vbfloat16m2_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
k, (const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
#endif
return tb.matmul(m, n);
#endif #endif
return false; return false;
} }
@ -2748,6 +3496,26 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
(float *)C, ldc}; (float *)C, ldc};
return tb.matmul(m, n); return tb.matmul(m, n);
} }
#elif defined(__riscv_zvfh)
if (Btype == GGML_TYPE_F16) {
#if LMUL == 1
tinyBLAS_RVV<vfloat32m1_t, vfloat16mf2_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
#elif LMUL == 2
tinyBLAS_RVV<vfloat32m2_t, vfloat16m1_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
#else // LMUL = 4
tinyBLAS_RVV<vfloat32m4_t, vfloat16m2_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
#endif
return tb.matmul(m, n);
}
#endif #endif
return false; return false;
} }

File diff suppressed because it is too large Load Diff

View File

@ -8,6 +8,7 @@ extern "C" {
#include <AEEStdErr.h> #include <AEEStdErr.h>
#include <inttypes.h> #include <inttypes.h>
#include <remote.h> #include <remote.h>
#include <rpcmem.h>
#include <stdbool.h> #include <stdbool.h>
/* Offset to differentiate HLOS and Hexagon error codes. /* Offset to differentiate HLOS and Hexagon error codes.

View File

@ -263,7 +263,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
struct htp_spad * dst_spad, struct htp_spad * dst_spad,
uint32_t nth, uint32_t nth,
uint32_t ith, uint32_t ith,
uint32_t src0_nrows_per_thread) { uint32_t src0_nrows_per_thread,
dma_queue * dma_queue) {
htp_act_preamble2; htp_act_preamble2;
uint64_t t1, t2; uint64_t t1, t2;
@ -271,6 +272,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
const size_t src0_row_size = nb01; const size_t src0_row_size = nb01;
const size_t dst_row_size = nb1; const size_t dst_row_size = nb1;
const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);
const uint32_t src0_nrows = ne01 * ne02 * ne03; const uint32_t src0_nrows = ne01 * ne02 * ne03;
@ -282,60 +285,81 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
return; return;
} }
int is_aligned = 1; const uint8_t * data_src0 = (const uint8_t *) src0->data;
int opt_path = 0; uint8_t * data_dst = (uint8_t *) dst->data;
if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
is_aligned = 0; uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n"); uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
}
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
opt_path = 1; size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
// In gelu = x*sigmoid(x*1.702)
const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
if (BLOCK == 0) {
FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
src0_spad->size_per_thread, src0_row_size_aligned);
return;
} }
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
uint8_t * restrict data_dst = (uint8_t *) dst->data; for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); // Dummy DMA transation for sequencing (interleaving dst,src,dst,...)
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); dma_queue_push_vtcm_to_ddr(dma_queue,
dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
dst_row_size, dst_row_size_aligned, 0);
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
src0_row_size_aligned, src0_row_size, block_size);
}
const int BLOCK = 8;
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
// Prefetch next block float* dst_spad = (float *) dma_queue_pop(dma_queue).src;
if (block_end < src0_end_row) { float* src0_spad = (float *) dma_queue_pop(dma_queue).dst;
const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size));
htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size);
}
// Process rows in current block for (uint32_t ib = 0; ib < block_size; ib++) {
for (uint32_t ib = ir; ib < block_end; ib++) { const float* src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size)); float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
float * restrict dst = (float *) (data_dst + (ib * dst_row_size));
// gelu = x * sigmoid(1.702 * x) // current implementation // gelu = x * sigmoid(1.702 * x) // current implementation
if (1 == opt_path) { hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, (float) 1.702, (uint8_t *) dst_spad_ptr, ne0);
hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); }
} else {
hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); dma_queue_push_vtcm_to_ddr(dma_queue,
hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); dst_row_size, dst_row_size_aligned, block_size);
}
// prefetch N+2 loop iteration if any
const uint32_t pref_block = (ir + BLOCK * 2);
if (pref_block < src0_end_row) {
const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
src0_row_size_aligned, src0_row_size, pref_block_size);
} }
} }
dma_queue_flush(dma_queue);
t2 = HAP_perf_get_qtimer_count(); t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02, FARF(HIGH, "gelu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
} }
static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) { static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data; struct htp_ops_context * octx = (struct htp_ops_context *) data;
unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
octx->src0_nrows_per_thread); octx->src0_nrows_per_thread, octx->ctx->dma[i]);
} }
@ -468,21 +492,45 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
const uint32_t n_threads = octx->n_threads; const uint32_t n_threads = octx->n_threads;
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
const size_t src0_row_size = src0->nb[1]; size_t src0_row_size = src0->nb[1];
const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : src0->nb[1]; size_t src1_row_size = src1->nb[1]; // zero bytes if src1 is not used
const size_t dst_row_size = dst->nb[1]; size_t dst_row_size = dst->nb[1];
const bool src1_valid = src1->ne[0];
if (!src1_valid) {
src1_row_size = src0_row_size;
}
const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);
// VTCM scratchpads for all tensors // VTCM scratchpads for all tensors
// N rows per thread, padded to HVX vector size // N rows per thread, padded to HVX vector size
octx->dst_spad.size = htp_round_up(dst_row_size, 128) * octx->n_threads;
octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads;
octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads;
size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; size_t spad_size_per_row = (src0_row_size_aligned + src1_row_size_aligned) + dst_row_size_aligned;
size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads* spad_size_per_row);
// Make sure the reserved vtcm size is sufficient
if(vtcm_row_per_thread ==0){
FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size,
spad_size_per_row * n_threads);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread;
octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread;
octx->dst_spad.size_per_thread = dst_row_size_aligned * vtcm_row_per_thread;
octx->dst_spad.size = n_threads* octx->dst_spad.size_per_thread;
octx->src0_spad.size = n_threads* octx->src0_spad.size_per_thread;
octx->src1_spad.size = n_threads* octx->src1_spad.size_per_thread;
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
if (src1->ne[0]) { if (src1->ne[0]) {
FARF(HIGH, FARF(HIGH, "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
"%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
octx->dst_spad.size); octx->dst_spad.size);
@ -492,20 +540,8 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
} }
// Make sure the reserved vtcm size is sufficient
if (octx->ctx->vtcm_size < spad_size) {
FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
spad_size);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows); uint32_t n_jobs = MIN(n_threads, src0_nrows);
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs); worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs);
} }

View File

@ -34,12 +34,12 @@ dma_queue * dma_queue_create(size_t capacity) {
q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t)); q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t));
memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t)); memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t));
q->dst = (void **) memalign(4, capacity * sizeof(void *)); q->dptr = (dma_ptr *) memalign(4, capacity * sizeof(dma_ptr));
memset(q->dst, 0, capacity * sizeof(void *)); memset(q->dptr, 0, capacity * sizeof(dma_ptr));
q->tail = &q->desc[capacity - 1]; q->tail = &q->desc[capacity - 1];
if (!q->desc && !q->dst) { if (!q->desc && !q->dptr) {
FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__); FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__);
return NULL; return NULL;
} }
@ -54,16 +54,10 @@ void dma_queue_delete(dma_queue * q) {
return; return;
} }
free(q->desc); free(q->desc);
free(q->dst); free(q->dptr);
free(q); free(q);
} }
void dma_queue_flush(dma_queue * q) { void dma_queue_flush(dma_queue * q) {
while (1) { while (dma_queue_pop(q).dst != NULL) ;
uint32_t s = dmwait() & 0x3;
if (s == HEXAGON_UDMA_DM0_STATUS_IDLE) {
break;
}
}
q->tail = NULL;
} }

View File

@ -11,10 +11,15 @@
extern "C" { extern "C" {
#endif #endif
typedef struct {
void *dst;
const void *src;
} dma_ptr;
typedef struct { typedef struct {
hexagon_udma_descriptor_type1_t * desc; // descriptor pointers hexagon_udma_descriptor_type1_t * desc; // descriptor pointers
hexagon_udma_descriptor_type1_t * tail; // tail pointer hexagon_udma_descriptor_type1_t * tail; // tail pointer
void ** dst; // dst pointers dma_ptr * dptr; // dst/src pointers
uint32_t push_idx; uint32_t push_idx;
uint32_t pop_idx; uint32_t pop_idx;
uint32_t capacity; uint32_t capacity;
@ -49,13 +54,20 @@ static inline unsigned int dmwait(void) {
return ret; return ret;
} }
static inline bool dma_queue_push(dma_queue * q, static inline dma_ptr dma_make_ptr(void *dst, const void *src)
void * dst, {
const void * src, dma_ptr p = { dst, src };
size_t dst_row_size, return p;
size_t src_row_size, }
size_t nrows) {
static inline bool dma_queue_push(dma_queue * q,
dma_ptr dptr,
size_t dst_row_size,
size_t src_row_size,
size_t width, // width in bytes. number of bytes to transfer per row
size_t nrows) {
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) { if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
FARF(ERROR, "dma-push: queue full\n");
return false; return false;
} }
@ -75,18 +87,18 @@ static inline bool dma_queue_push(dma_queue * q,
#endif #endif
desc->order = 0; desc->order = 0;
desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE; desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
desc->src = (void *) src; desc->src = (void *) dptr.src;
desc->dst = (void *) dst; desc->dst = (void *) dptr.dst;
desc->allocation = 0; desc->allocation = 0;
desc->padding = 0; desc->padding = 0;
desc->roiwidth = src_row_size; desc->roiwidth = width;
desc->roiheight = nrows; desc->roiheight = nrows;
desc->srcstride = src_row_size; desc->srcstride = src_row_size;
desc->dststride = dst_row_size; desc->dststride = dst_row_size;
desc->srcwidthoffset = 0; desc->srcwidthoffset = 0;
desc->dstwidthoffset = 0; desc->dstwidthoffset = 0;
q->dst[q->push_idx] = dst; q->dptr[q->push_idx] = dptr;
dmlink(q->tail, desc); dmlink(q->tail, desc);
q->tail = desc; q->tail = desc;
@ -96,9 +108,28 @@ static inline bool dma_queue_push(dma_queue * q,
return true; return true;
} }
static inline uint8_t * dma_queue_pop(dma_queue * q) { static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q,
dma_ptr dptr,
size_t dst_row_size,
size_t src_row_size,
size_t nrows) {
return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows);
}
static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q,
dma_ptr dptr,
size_t dst_row_size,
size_t src_row_size,
size_t nrows) {
return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
}
static inline dma_ptr dma_queue_pop(dma_queue * q) {
dma_ptr dptr = { NULL };
if (q->push_idx == q->pop_idx) { if (q->push_idx == q->pop_idx) {
return NULL; return dptr;
} }
hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx]; hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx];
@ -112,11 +143,11 @@ static inline uint8_t * dma_queue_pop(dma_queue * q) {
// FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx); // FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
} }
uint8_t * dst = (uint8_t *) q->dst[q->pop_idx]; dptr = q->dptr[q->pop_idx];
// FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst); // FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst);
q->pop_idx = (q->pop_idx + 1) & q->idx_mask; q->pop_idx = (q->pop_idx + 1) & q->idx_mask;
return dst; return dptr;
} }
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -980,8 +980,6 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
int step_of_1 = num_elems >> 5; int step_of_1 = num_elems >> 5;
int remaining = num_elems - step_of_1 * VLEN_FP32; int remaining = num_elems - step_of_1 * VLEN_FP32;
assert(remaining == 0);
const HVX_Vector * restrict v_src = (HVX_Vector *) src; const HVX_Vector * restrict v_src = (HVX_Vector *) src;
HVX_Vector * restrict v_dst = (HVX_Vector *) dst; HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
@ -996,8 +994,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
for (int i = 0; i < step_of_1; i++) { for (int i = 0; i < step_of_1; i++) {
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp); v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
} }
}
if (remaining > 0) {
const float * srcf = ((const float *) src) + step_of_1* VLEN_FP32;
float * dstf = (float *) dst + step_of_1*VLEN_FP32;
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector out = hvx_vec_fast_sigmoid_fp32_guard(in, one, max_exp, min_exp);
hvx_vec_store_u((void *) dstf, remaining * SIZEOF_FP32, out);
}
}
static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){ static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){
int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector

View File

@ -299,7 +299,8 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
ctx->n_threads = n_hvx; ctx->n_threads = n_hvx;
for (int i = 0; i < ctx->n_threads; i++) { for (int i = 0; i < ctx->n_threads; i++) {
ctx->dma[i] = dma_queue_create(HTP_SPAD_SRC0_NROWS * 2); // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
ctx->dma[i] = dma_queue_create(64);
} }
// init worker pool // init worker pool

View File

@ -1127,13 +1127,13 @@ static void matmul(struct htp_matmul_type * mt,
if (is0 >= HTP_SPAD_SRC0_NROWS) { if (is0 >= HTP_SPAD_SRC0_NROWS) {
break; break;
} }
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2); src0_row_size_padded, src0_row_size, 2);
} }
// Process src0 rows // Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
const uint8_t * ss0 = dma_queue_pop(dma_queue); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
#pragma unroll(2) #pragma unroll(2)
for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) { for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
@ -1146,7 +1146,7 @@ static void matmul(struct htp_matmul_type * mt,
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) { if (pr0 < src0_end_row_x2) {
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2); src0_row_size_padded, src0_row_size, 2);
} }
} }
@ -1155,9 +1155,9 @@ static void matmul(struct htp_matmul_type * mt,
if (src0_end_row != src0_end_row_x2) { if (src0_end_row != src0_end_row_x2) {
uint32_t ir0 = src0_end_row_x2; uint32_t ir0 = src0_end_row_x2;
const int is0 = (ir0 - src0_start_row); const int is0 = (ir0 - src0_start_row);
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1); src0_row_size_padded, src0_row_size, 1);
const uint8_t * ss0 = dma_queue_pop(dma_queue); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
#pragma unroll(2) #pragma unroll(2)
for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) { for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
@ -1229,20 +1229,20 @@ static void matvec(struct htp_matmul_type * mt,
if (is0 >= HTP_SPAD_SRC0_NROWS) { if (is0 >= HTP_SPAD_SRC0_NROWS) {
break; break;
} }
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2); src0_row_size_padded, src0_row_size, 2);
} }
// Process src0 rows // Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
const uint8_t * ss0 = dma_queue_pop(dma_queue); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot_rx2(ne00, &tmp[ir0 - src0_start_row], ss0, src0_row_size_padded, src1_col); mt->vec_dot_rx2(ne00, &tmp[ir0 - src0_start_row], ss0, src0_row_size_padded, src1_col);
// Prefetch next (n + spad_nrows) row // Prefetch next (n + spad_nrows) row
const uint32_t pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); const uint32_t pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const uint32_t is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; const uint32_t is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) { if (pr0 < src0_end_row_x2) {
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2); src0_row_size_padded, src0_row_size, 2);
} }
} }
@ -1251,9 +1251,9 @@ static void matvec(struct htp_matmul_type * mt,
if (src0_end_row != src0_end_row_x2) { if (src0_end_row != src0_end_row_x2) {
const uint32_t ir0 = src0_end_row_x2; const uint32_t ir0 = src0_end_row_x2;
const uint32_t is0 = (ir0 - src0_start_row); const uint32_t is0 = (ir0 - src0_start_row);
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1); src0_row_size_padded, src0_row_size, 1);
const uint8_t * ss0 = dma_queue_pop(dma_queue); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
} }
@ -1343,13 +1343,13 @@ static void matmul_id(struct htp_matmul_type * mt,
if (is0 >= HTP_SPAD_SRC0_NROWS) { if (is0 >= HTP_SPAD_SRC0_NROWS) {
break; break;
} }
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2); src0_row_size_padded, src0_row_size, 2);
} }
// Process src0 rows // Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
const uint8_t * ss0 = dma_queue_pop(dma_queue); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
for (uint32_t cid = 0; cid < cne1; ++cid) { for (uint32_t cid = 0; cid < cne1; ++cid) {
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid); struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
@ -1368,7 +1368,7 @@ static void matmul_id(struct htp_matmul_type * mt,
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) { if (pr0 < src0_end_row_x2) {
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2); src0_row_size_padded, src0_row_size, 2);
} }
} }
@ -1377,9 +1377,9 @@ static void matmul_id(struct htp_matmul_type * mt,
if (src0_end_row != src0_end_row_x2) { if (src0_end_row != src0_end_row_x2) {
uint32_t ir0 = src0_end_row_x2; uint32_t ir0 = src0_end_row_x2;
const uint32_t is0 = (ir0 - src0_start_row); const uint32_t is0 = (ir0 - src0_start_row);
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1); src0_row_size_padded, src0_row_size, 1);
const uint8_t * ss0 = dma_queue_pop(dma_queue); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
for (uint32_t cid = 0; cid < cne1; ++cid) { for (uint32_t cid = 0; cid < cne1; ++cid) {
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid); struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
@ -1467,20 +1467,20 @@ static void matvec_id(struct htp_matmul_type * mt,
if (is0 >= HTP_SPAD_SRC0_NROWS) { if (is0 >= HTP_SPAD_SRC0_NROWS) {
break; break;
} }
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2); src0_row_size_padded, src0_row_size, 2);
} }
// Process src0 rows // Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
const uint8_t * ss0 = dma_queue_pop(dma_queue); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot_rx2(ne00, &dst_row[ir0], ss0, src0_row_size_padded, src1_col); mt->vec_dot_rx2(ne00, &dst_row[ir0], ss0, src0_row_size_padded, src1_col);
// Prefetch next (n + spad_nrows) row // Prefetch next (n + spad_nrows) row
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) { if (pr0 < src0_end_row_x2) {
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2); src0_row_size_padded, src0_row_size, 2);
} }
} }
@ -1489,9 +1489,9 @@ static void matvec_id(struct htp_matmul_type * mt,
if (src0_end_row != src0_end_row_x2) { if (src0_end_row != src0_end_row_x2) {
uint32_t ir0 = src0_end_row_x2; uint32_t ir0 = src0_end_row_x2;
const uint32_t is0 = (ir0 - src0_start_row); const uint32_t is0 = (ir0 - src0_start_row);
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1); src0_row_size_padded, src0_row_size, 1);
const uint8_t * ss0 = dma_queue_pop(dma_queue); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col); mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col);
} }
} }

View File

@ -0,0 +1,153 @@
#ifndef OP_DESC_H
#define OP_DESC_H
#define GGML_COMMON_IMPL_CPP
#include "ggml-backend-impl.h"
#include "ggml-common.h"
#include <string>
#include <stdio.h>
struct op_desc {
char strides[64 * GGML_MAX_SRC];
char dims[64 * GGML_MAX_SRC];
char types[16 * GGML_MAX_SRC];
char buffs[64 * GGML_MAX_SRC];
char names[64 * GGML_MAX_SRC];
int format_tensor_dims(char * str, const struct ggml_tensor * t) {
if (t->ne[2] == 1 && t->ne[3] == 1) {
return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
} else {
return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
}
}
void format_op_dims(char * str, const struct ggml_tensor * t) {
char * p = str;
// append src0 and src1 (if any)
if (t->src[0]) {
p += format_tensor_dims(p, t->src[0]);
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
p += sprintf(p, " x ");
p += format_tensor_dims(p, t->src[i]);
}
p += sprintf(p, " -> ");
}
// format self dims separately for better visual alignment
char self[64];
format_tensor_dims(self, t);
p += sprintf(p, "%s", self);
}
int format_tensor_strides(char * str, const struct ggml_tensor * t) {
const char * c = ggml_is_contiguous(t) ? "" : "!";
if (t->ne[2] == 1 && t->ne[3] == 1) {
return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
} else {
return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
}
}
void format_op_strides(char * str, const struct ggml_tensor * t) {
char * p = str;
// append src0 and src1 (if any)
if (t->src[0]) {
p += format_tensor_strides(p, t->src[0]);
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
p += sprintf(p, " x ");
p += format_tensor_strides(p, t->src[i]);
}
p += sprintf(p, " -> ");
}
// format self dims separately for better visual alignment
char self[64];
format_tensor_strides(self, t);
p += sprintf(p, "%s", self);
}
void format_op_types(char * str, const struct ggml_tensor * t) {
char * p = str;
// append src0 and src1 (if any)
if (t->src[0]) {
p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
p += sprintf(p, " x ");
p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
}
p += sprintf(p, " -> ");
}
p += sprintf(p, "%s", ggml_type_name(t->type));
}
const char * tensor_buff_name(const struct ggml_tensor * t) {
if (t->buffer) {
return ggml_backend_buffer_name(t->buffer);
}
return "NONE";
}
void format_op_buffs(char * str, const struct ggml_tensor * t) {
char * p = str;
// append src0 and src1 (if any)
if (t->src[0]) {
p += sprintf(p, "%s", tensor_buff_name(t->src[0]));
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
p += sprintf(p, " x ");
p += sprintf(p, "%s", tensor_buff_name(t->src[i]));
}
p += sprintf(p, " -> ");
}
p += sprintf(p, "%s", tensor_buff_name(t));
}
void format_op_names(char * str, const struct ggml_tensor * t) {
char * p = str;
// append src0 and src1 (if any)
if (t->src[0]) {
p += sprintf(p, "%s", t->src[0]->name);
for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
p += sprintf(p, " x ");
p += sprintf(p, "%s", t->src[i]->name);
}
p += sprintf(p, " -> ");
}
p += sprintf(p, "%s", t->name);
}
void format(const ggml_tensor * op) {
format_op_dims(dims, op);
format_op_strides(strides, op);
format_op_types(types, op);
format_op_buffs(buffs, op);
format_op_names(names, op);
}
op_desc() {}
op_desc(const ggml_tensor * op) { format(op); }
};
#endif // OP_DESC_H

View File

@ -494,6 +494,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0; cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat; cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
cl_kernel kernel_convert_block_q4_0_noshuffle; cl_kernel kernel_convert_block_q4_0_noshuffle;
cl_kernel kernel_restore_block_q4_0_noshuffle;
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat; cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
cl_kernel kernel_mul_mv_q6_K_f32; cl_kernel kernel_mul_mv_q6_K_f32;
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat; cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
@ -634,6 +635,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_transpose_32; cl_kernel kernel_transpose_32;
cl_kernel kernel_transpose_32_16; cl_kernel kernel_transpose_32_16;
cl_kernel kernel_transpose_16; cl_kernel kernel_transpose_16;
cl_kernel kernel_transpose_16_buf;
cl_kernel kernel_transpose_16_4x1; cl_kernel kernel_transpose_16_4x1;
cl_mem A_s_d_max; // max scale buffer size for transpose cl_mem A_s_d_max; // max scale buffer size for transpose
@ -806,6 +808,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
@ -2004,7 +2007,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err)); CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err)); CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err)); CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err)); CL_CHECK((backend_ctx->kernel_transpose_16_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_buf", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
GGML_LOG_CONT("."); GGML_LOG_CONT(".");
} }
@ -3933,6 +3937,91 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
if (tensor->type == GGML_TYPE_Q4_0) { if (tensor->type == GGML_TYPE_Q4_0) {
ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra; ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (use_adreno_kernels(backend_ctx, tensor)) {
cl_int err;
cl_kernel kernel;
cl_int M = tensor->ne[1]; // ne01
cl_int K = tensor->ne[0]; // ne00
GGML_ASSERT(K % 32 == 0);
GGML_ASSERT(M % 4 == 0);
size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
cl_mem buf_trans_q;
cl_mem buf_trans_d;
CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE,
size_q, NULL, &err), err));
CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE,
size_d, NULL, &err), err));
kernel = backend_ctx->kernel_transpose_16_buf;
// transpose q back
cl_int stride_k_q = K/4;
size_t local_size_q[3] = {64, 1, 1};
size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1};
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_size_q, local_size_q, 0, NULL, NULL));
// transpose scales back
cl_int stride_k_d = K/32;
size_t local_size_d[3] = {64, 1, 1};
size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1};
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_size_d, local_size_d, 0, NULL, NULL));
// unpack
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
ggml_nbytes(tensor), NULL, &err);
CL_CHECK(err);
cl_uchar mask_0F = 0x0F;
cl_uchar mask_F0 = 0xF0;
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
size_t local_work_size[] = {1, 1, 1};
kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_work_size, local_work_size, 0, NULL, NULL));
// read back to host
CL_CHECK(clEnqueueReadBuffer(
queue, data_device, CL_TRUE, offset,
size, data, 0, NULL, NULL));
CL_CHECK(clReleaseMemObject(data_device));
CL_CHECK(clReleaseMemObject(buf_trans_q));
CL_CHECK(clReleaseMemObject(buf_trans_d));
return;
}
#endif
cl_int err; cl_int err;
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
ggml_nbytes(tensor), NULL, &err); ggml_nbytes(tensor), NULL, &err);

View File

@ -117,6 +117,27 @@ kernel void kernel_convert_block_q4_0_noshuffle(
} }
} }
kernel void kernel_restore_block_q4_0_noshuffle(
global uchar * src_q,
global half * src_d,
global struct block_q4_0 * dst,
uchar mask_0F,
uchar mask_F0
) {
global struct block_q4_0 * b = (global struct block_q4_0 *) dst + get_global_id(0);
global uchar * q = (global uchar *) src_q + QK4_0/2*get_global_id(0);
global half * d = (global half *) src_d + get_global_id(0);
b->d = *d;
for (int i = 0; i < QK4_0/4; ++i) {
uchar x0 = q[i + 0 ] ;
uchar x1 = q[i + QK4_0/4];
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
}
}
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// block_mxfp4 // block_mxfp4
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------

View File

@ -44,6 +44,19 @@ kernel void kernel_transpose_16_4x1(
write_imageh(output, i * rows + j, (half4)(temp0, temp1, temp2, temp3)); write_imageh(output, i * rows + j, (half4)(temp0, temp1, temp2, temp3));
} }
// Transpose treating each element as 16-bit using buffer
kernel void kernel_transpose_16_buf(
global const ushort * input,
global ushort * output,
const int ldi,
const int ldo
) {
const int x = get_global_id(0);
const int y = get_global_id(1);
output[x*ldo + y] = input[y*ldi + x];
}
// 32-bit transpose, loading/storing a 4x4 tile of elements // 32-bit transpose, loading/storing a 4x4 tile of elements
kernel void kernel_transpose_32( kernel void kernel_transpose_32(
__read_only image1d_buffer_t input, __read_only image1d_buffer_t input,

View File

@ -571,6 +571,10 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
return ctx->base_ptr; return ctx->base_ptr;
} }
static bool ggml_backend_buffer_is_rpc(ggml_backend_buffer_t buffer) {
return buffer->iface.free_buffer == ggml_backend_rpc_buffer_free_buffer;
}
static rpc_tensor serialize_tensor(const ggml_tensor * tensor) { static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
rpc_tensor result; rpc_tensor result;
if (!tensor) { if (!tensor) {
@ -580,7 +584,7 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
result.id = reinterpret_cast<uint64_t>(tensor); result.id = reinterpret_cast<uint64_t>(tensor);
result.type = tensor->type; result.type = tensor->type;
if (tensor->buffer) { if (tensor->buffer && ggml_backend_buffer_is_rpc(tensor->buffer)) {
ggml_backend_buffer_t buffer = tensor->buffer; ggml_backend_buffer_t buffer = tensor->buffer;
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
result.buffer = ctx != nullptr ? ctx->remote_ptr : 0; result.buffer = ctx != nullptr ? ctx->remote_ptr : 0;
@ -664,10 +668,6 @@ static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, con
RPC_STATUS_ASSERT(status); RPC_STATUS_ASSERT(status);
} }
static bool ggml_backend_buffer_is_rpc(ggml_backend_buffer_t buffer) {
return buffer->iface.free_buffer == ggml_backend_rpc_buffer_free_buffer;
}
static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
if (ggml_backend_buffer_is_rpc(src->buffer)) { if (ggml_backend_buffer_is_rpc(src->buffer)) {
// check if src and dst are on the same server // check if src and dst are on the same server

View File

@ -731,7 +731,7 @@ struct vk_device_struct {
vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16; vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16;
vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16; vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16;
vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16; vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16, pipeline_rope_multi_f32_f16;
vk_pipeline pipeline_rope_vision_f32, pipeline_rope_vision_f16; vk_pipeline pipeline_rope_vision_f32, pipeline_rope_vision_f16;
vk_pipeline pipeline_argsort_f32[num_argsort_pipelines]; vk_pipeline pipeline_argsort_f32[num_argsort_pipelines];
vk_pipeline pipeline_argsort_large_f32[num_argsort_pipelines]; vk_pipeline pipeline_argsort_large_f32[num_argsort_pipelines];
@ -4077,6 +4077,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_rte_len, rope_norm_f32_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_rte_len, rope_norm_f32_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_rte_len, rope_neox_f32_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_rte_len, rope_neox_f32_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32_f16, "rope_multi_f32_f16", rope_multi_f32_f16_rte_len, rope_multi_f32_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
} else { } else {
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
@ -4085,6 +4086,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_len, rope_norm_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_len, rope_norm_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_len, rope_neox_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_len, rope_neox_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32_f16, "rope_multi_f32_f16", rope_multi_f32_f16_len, rope_multi_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
} }
for (uint32_t i = 0; i < num_argsort_pipelines; ++i) { for (uint32_t i = 0; i < num_argsort_pipelines; ++i) {
@ -8680,6 +8682,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_rope_multi_f32; return ctx->device->pipeline_rope_multi_f32;
} }
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
return ctx->device->pipeline_rope_multi_f32_f16;
}
if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
return ctx->device->pipeline_rope_multi_f16; return ctx->device->pipeline_rope_multi_f16;
} }
@ -13076,9 +13081,9 @@ static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const
return false; return false;
} }
// Only norm/neox shaders have the fusion code // Only norm/neox/mrope shaders have the fusion code
const int mode = ((const int32_t *) rope->op_params)[2]; const int mode = ((const int32_t *) rope->op_params)[2];
if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) { if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_MROPE) {
return false; return false;
} }

View File

@ -49,8 +49,8 @@ void rope_norm(const uint i0, const uint i1, rope_params p) {
uint idst = i1*ne0 + i0; uint idst = i1*ne0 + i0;
const uint ix = rope_a_coord(i0, i01, i02, p); const uint ix = rope_a_coord(i0, i01, i02, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS.. // Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in data_i. // The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) { if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0; idst = i01*ne0 + i0;
idst += rope_data_i[i02].x * p.set_rows_stride; idst += rope_data_i[i02].x * p.set_rows_stride;
@ -91,7 +91,7 @@ void rope_neox(const uint i0, const uint i1, rope_params p) {
uint idst = i1*ne0 + i0/2; uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p); const uint ix = rope_a_coord(i0/2, i01, i02, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS.. // Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i. // The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) { if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0/2; idst = i01*ne0 + i0/2;
@ -132,9 +132,16 @@ void rope_multi(const uint i0, const uint i1, rope_params p) {
const uint i01 = i1 % ne1; const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1; const uint i02 = i1 / ne1;
const uint idst = i1*ne0 + i0/2; uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p); const uint ix = rope_a_coord(i0/2, i01, i02, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0/2;
idst += rope_data_i[i02].x * p.set_rows_stride;
}
if (i0 >= p.n_dims) { if (i0 >= p.n_dims) {
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]); rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
rope_data_d[idst + i0/2 + 1] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 1]); rope_data_d[idst + i0/2 + 1] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 1]);

View File

@ -927,6 +927,8 @@ void process_shaders() {
string_to_spv("rope_multi_f32", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}}); string_to_spv("rope_multi_f32", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
string_to_spv("rope_multi_f16", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}}); string_to_spv("rope_multi_f16", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});
string_to_spv("rope_multi_f16_rte", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}}); string_to_spv("rope_multi_f16_rte", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
string_to_spv("rope_multi_f32_f16", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}});
string_to_spv("rope_multi_f32_f16_rte", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
string_to_spv("rope_vision_f32", "rope_vision.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}}); string_to_spv("rope_vision_f32", "rope_vision.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
string_to_spv("rope_vision_f16", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}}); string_to_spv("rope_vision_f16", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});

View File

@ -181,6 +181,7 @@ class Keys:
DIMENSION_COUNT = "{arch}.rope.dimension_count" DIMENSION_COUNT = "{arch}.rope.dimension_count"
DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
FREQ_BASE = "{arch}.rope.freq_base" FREQ_BASE = "{arch}.rope.freq_base"
FREQ_BASE_SWA = "{arch}.rope.freq_base_swa"
SCALING_TYPE = "{arch}.rope.scaling.type" SCALING_TYPE = "{arch}.rope.scaling.type"
SCALING_FACTOR = "{arch}.rope.scaling.factor" SCALING_FACTOR = "{arch}.rope.scaling.factor"
SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
@ -354,6 +355,7 @@ class MODEL_ARCH(IntEnum):
STARCODER = auto() STARCODER = auto()
REFACT = auto() REFACT = auto()
BERT = auto() BERT = auto()
MODERN_BERT = auto()
NOMIC_BERT = auto() NOMIC_BERT = auto()
NOMIC_BERT_MOE = auto() NOMIC_BERT_MOE = auto()
NEO_BERT = auto() NEO_BERT = auto()
@ -747,6 +749,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.STARCODER: "starcoder", MODEL_ARCH.STARCODER: "starcoder",
MODEL_ARCH.REFACT: "refact", MODEL_ARCH.REFACT: "refact",
MODEL_ARCH.BERT: "bert", MODEL_ARCH.BERT: "bert",
MODEL_ARCH.MODERN_BERT: "modern-bert",
MODEL_ARCH.NOMIC_BERT: "nomic-bert", MODEL_ARCH.NOMIC_BERT: "nomic-bert",
MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe", MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
MODEL_ARCH.NEO_BERT: "neo-bert", MODEL_ARCH.NEO_BERT: "neo-bert",
@ -1367,6 +1370,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.CLS, MODEL_TENSOR.CLS,
MODEL_TENSOR.CLS_OUT, MODEL_TENSOR.CLS_OUT,
], ],
MODEL_ARCH.MODERN_BERT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.CLS,
MODEL_TENSOR.CLS_OUT,
],
MODEL_ARCH.NOMIC_BERT: [ MODEL_ARCH.NOMIC_BERT: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM, MODEL_TENSOR.TOKEN_EMBD_NORM,

View File

@ -774,8 +774,12 @@ class GGUFWriter:
def add_shared_kv_layers(self, value: int) -> None: def add_shared_kv_layers(self, value: int) -> None:
self.add_uint32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value) self.add_uint32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)
def add_sliding_window_pattern(self, value: Sequence[bool]) -> None: def add_sliding_window_pattern(self, value: int | Sequence[bool]) -> None:
self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value) key = Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch)
if isinstance(value, int):
self.add_uint32(key, value)
else:
self.add_array(key, value)
def add_dense_features_dims(self, dense:str, in_f:int, out_f:int) -> None: def add_dense_features_dims(self, dense:str, in_f:int, out_f:int) -> None:
self.add_uint32(Keys.LLM.DENSE_FEAT_IN_SIZE.format(arch=self.arch, dense=dense), in_f) self.add_uint32(Keys.LLM.DENSE_FEAT_IN_SIZE.format(arch=self.arch, dense=dense), in_f)
@ -886,6 +890,9 @@ class GGUFWriter:
def add_value_residual_mix_lora_rank(self, length: int) -> None: def add_value_residual_mix_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.VALUE_RESIDUAL_MIX_LORA_RANK.format(arch=self.arch), length) self.add_uint32(Keys.Attention.VALUE_RESIDUAL_MIX_LORA_RANK.format(arch=self.arch), length)
def add_rope_freq_base_swa(self, value: float) -> None:
self.add_float32(Keys.Rope.FREQ_BASE_SWA.format(arch=self.arch), value)
def add_gate_lora_rank(self, length: int) -> None: def add_gate_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.GATE_LORA_RANK.format(arch=self.arch), length) self.add_uint32(Keys.Attention.GATE_LORA_RANK.format(arch=self.arch), length)

View File

@ -17,6 +17,7 @@ class TensorNameMap:
"embed_tokens", # embeddinggemma "embed_tokens", # embeddinggemma
"tok_embeddings", # llama-pth "tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert "embeddings.word_embeddings", # bert nomic-bert
"embeddings.tok_embeddings", # modern-bert
"language_model.embedding.word_embeddings", # persimmon "language_model.embedding.word_embeddings", # persimmon
"wte", # gpt2 "wte", # gpt2
"transformer.embd.wte", # phi2 "transformer.embd.wte", # phi2
@ -46,6 +47,7 @@ class TensorNameMap:
MODEL_TENSOR.TOKEN_EMBD_NORM: ( MODEL_TENSOR.TOKEN_EMBD_NORM: (
"word_embeddings_layernorm", # bloom "word_embeddings_layernorm", # bloom
"embeddings.LayerNorm", # bert "embeddings.LayerNorm", # bert
"embeddings.norm", # modern-bert
"emb_ln", # nomic-bert "emb_ln", # nomic-bert
"transformer.norm", # openelm "transformer.norm", # openelm
"rwkv.blocks.0.pre_ln", # rwkv "rwkv.blocks.0.pre_ln", # rwkv
@ -75,6 +77,7 @@ class TensorNameMap:
"head.out", # wavtokenizer "head.out", # wavtokenizer
"lm_head", # llama4 "lm_head", # llama4
"model.transformer.ff_out", # llada "model.transformer.ff_out", # llada
"head.decoder", # modern-bert
), ),
MODEL_TENSOR.DENSE_2_OUT: ( MODEL_TENSOR.DENSE_2_OUT: (
"dense_2_out", # embeddinggemma "dense_2_out", # embeddinggemma
@ -104,6 +107,7 @@ class TensorNameMap:
"backbone.final_layer_norm", # wavtokenizer "backbone.final_layer_norm", # wavtokenizer
"model.norm", # llama4 "model.norm", # llama4
"model.transformer.ln_f", # llada "model.transformer.ln_f", # llada
"final_norm", # modern-bert
"model.norm", # cogvlm "model.norm", # cogvlm
), ),
@ -151,6 +155,7 @@ class TensorNameMap:
"model.layers.{bid}.input_layernorm", # llama4 "model.layers.{bid}.input_layernorm", # llama4
"layers.{bid}.input_layernorm", # embeddinggemma "layers.{bid}.input_layernorm", # embeddinggemma
"transformer_encoder.{bid}.attention_norm", # neobert "transformer_encoder.{bid}.attention_norm", # neobert
"layers.{bid}.attn_norm", # modern-bert
"model.layers.{bid}.operator_norm", # lfm2 "model.layers.{bid}.operator_norm", # lfm2
"model.transformer.blocks.{bid}.attn_norm", # llada "model.transformer.blocks.{bid}.attn_norm", # llada
"layers.{bid}.input_layernorm", # qwen3-embedding "layers.{bid}.input_layernorm", # qwen3-embedding
@ -187,6 +192,7 @@ class TensorNameMap:
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm "encoder.layers.{bid}.self_attention.query_key_value", # chatglm
"transformer.layers.{bid}.attn.qkv_proj", # openelm "transformer.layers.{bid}.attn.qkv_proj", # openelm
"transformer_encoder.{bid}.qkv", # neobert "transformer_encoder.{bid}.qkv", # neobert
"layers.{bid}.attn.Wqkv", # modern-bert
"model.layers.{bid}.self_attn.language_expert_query_key_value", # cogvlm "model.layers.{bid}.self_attn.language_expert_query_key_value", # cogvlm
), ),
@ -261,6 +267,7 @@ class TensorNameMap:
"model.layers.{bid}.self_attn.linear_attn", # deci "model.layers.{bid}.self_attn.linear_attn", # deci
"layers.{bid}.attention.wo", # llama-pth "layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert "encoder.layer.{bid}.attention.output.dense", # bert
"layers.{bid}.attn.Wo", # modern-bert
"transformer.layer.{bid}.attention.out_lin", # distillbert "transformer.layer.{bid}.attention.out_lin", # distillbert
"transformer.h.{bid}.attn.out_proj", # gpt-j "transformer.h.{bid}.attn.out_proj", # gpt-j
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
@ -344,6 +351,7 @@ class TensorNameMap:
"layers.{bid}.post_attention_layernorm", # qwen3-embedding "layers.{bid}.post_attention_layernorm", # qwen3-embedding
"model.layers.{bid}.feedforward_layernorm", # apertus "model.layers.{bid}.feedforward_layernorm", # apertus
"model.layers.{bid}.pre_mlp_layernorm", # kormo "model.layers.{bid}.pre_mlp_layernorm", # kormo
"layers.{bid}.mlp_norm" # modern-bert
), ),
# Pre feed-forward norm # Pre feed-forward norm
@ -407,6 +415,7 @@ class TensorNameMap:
"layers.{bid}.mlp.up_proj", # embeddinggemma "layers.{bid}.mlp.up_proj", # embeddinggemma
"layers.{bid}.feed_forward.w3", # llama-pth "layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert "encoder.layer.{bid}.intermediate.dense", # bert
"layers.{bid}.mlp.Wi", # modern-bert
"transformer.layer.{bid}.ffn.lin1", # distillbert "transformer.layer.{bid}.ffn.lin1", # distillbert
"transformer.h.{bid}.mlp.fc_in", # gpt-j "transformer.h.{bid}.mlp.fc_in", # gpt-j
"transformer.h.{bid}.mlp.linear_3", # refact "transformer.h.{bid}.mlp.linear_3", # refact
@ -521,6 +530,7 @@ class TensorNameMap:
"layers.{bid}.mlp.down_proj", # embeddinggemma "layers.{bid}.mlp.down_proj", # embeddinggemma
"layers.{bid}.feed_forward.w2", # llama-pth "layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert "encoder.layer.{bid}.output.dense", # bert
"layers.{bid}.mlp.Wo", # modern-bert
"transformer.layer.{bid}.ffn.lin2", # distillbert "transformer.layer.{bid}.ffn.lin2", # distillbert
"transformer.h.{bid}.mlp.fc_out", # gpt-j "transformer.h.{bid}.mlp.fc_out", # gpt-j
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
@ -1122,6 +1132,7 @@ class TensorNameMap:
"classifier.dense", # roberta "classifier.dense", # roberta
"pre_classifier", # distillbert "pre_classifier", # distillbert
"dense", # neobert "dense", # neobert
"head.dense", # modern-bert
), ),
MODEL_TENSOR.CLS_OUT: ( MODEL_TENSOR.CLS_OUT: (

View File

@ -110,7 +110,6 @@ class SafetensorRemote:
""" """
BASE_DOMAIN = "https://huggingface.co" BASE_DOMAIN = "https://huggingface.co"
ALIGNMENT = 8 # bytes
@classmethod @classmethod
def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]: def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]:
@ -204,9 +203,6 @@ class SafetensorRemote:
# Calculate the data start offset # Calculate the data start offset
data_start_offset = 8 + metadata_length data_start_offset = 8 + metadata_length
alignment = SafetensorRemote.ALIGNMENT
if data_start_offset % alignment != 0:
data_start_offset += alignment - (data_start_offset % alignment)
# Check if we have enough data to read the metadata # Check if we have enough data to read the metadata
if len(raw_data) < 8 + metadata_length: if len(raw_data) < 8 + metadata_length:
@ -298,7 +294,6 @@ class SafetensorsLocal:
Custom parsing gives a bit more control over the memory usage. Custom parsing gives a bit more control over the memory usage.
The official safetensors library doesn't expose file ranges. The official safetensors library doesn't expose file ranges.
""" """
ALIGNMENT = 8 # bytes
tensors: dict[str, LocalTensor] tensors: dict[str, LocalTensor]
@ -316,9 +311,6 @@ class SafetensorsLocal:
raise ValueError(f"Failed to parse safetensors metadata as JSON: {e}") raise ValueError(f"Failed to parse safetensors metadata as JSON: {e}")
data_start_offset = f.tell() data_start_offset = f.tell()
alignment = self.ALIGNMENT
if data_start_offset % alignment != 0:
data_start_offset += alignment - (data_start_offset % alignment)
tensors: dict[str, LocalTensor] = {} tensors: dict[str, LocalTensor] = {}
for name, meta in metadata.items(): for name, meta in metadata.items():

View File

@ -18,17 +18,17 @@ model="Llama-3.2-3B-Instruct-Q4_0.gguf"
device="HTP0" device="HTP0"
[ "$D" != "" ] && device="$D" [ "$D" != "" ] && device="$D"
verbose=
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V"
experimental= experimental=
[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E" [ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"
verbose=
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
sched= sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" [ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile= profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" [ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
opmask= opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" [ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
@ -45,9 +45,9 @@ adb $adbserial shell " \
cd $basedir; ulimit -c unlimited; \ cd $basedir; ulimit -c unlimited; \
LD_LIBRARY_PATH=$basedir/$branch/lib \ LD_LIBRARY_PATH=$basedir/$branch/lib \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$verbose $experimental $sched $opmask $profile $nhvx $ndev \ $verbose $experimental $sched $opmask $profile $nhvx $ndev \
./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \ ./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \ --ctx-size 8192 --batch-size 128 -fa on \
-ngl 99 --device $device $cli_opts $@ \ -ngl 99 --device $device $cli_opts $@ \
" "

View File

@ -0,0 +1,53 @@
#!/bin/sh
#
# Basedir on device
basedir=/data/local/tmp/llama.cpp
cli_opts=
branch=.
[ "$B" != "" ] && branch=$B
adbserial=
[ "$S" != "" ] && adbserial="-s $S"
model="Llama-3.2-3B-Instruct-Q4_0.gguf"
[ "$M" != "" ] && model="$M"
device="HTP0"
[ "$D" != "" ] && device="$D"
experimental=
[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"
verbose=
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
ndev=
[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"
set -x
adb $adbserial shell " \
cd $basedir; ulimit -c unlimited; \
LD_LIBRARY_PATH=$basedir/$branch/lib \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$verbose $experimental $sched $opmask $profile $nhvx $ndev \
./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ctx-size 8192 --batch-size 128 -fa on \
-ngl 99 -no-cnv --device $device $cli_opts $@ \
"

View File

@ -90,6 +90,7 @@ add_library(llama
models/mamba.cpp models/mamba.cpp
models/minicpm3.cpp models/minicpm3.cpp
models/minimax-m2.cpp models/minimax-m2.cpp
models/modern-bert.cpp
models/mpt.cpp models/mpt.cpp
models/nemotron-h.cpp models/nemotron-h.cpp
models/nemotron.cpp models/nemotron.cpp

View File

@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_STARCODER, "starcoder" }, { LLM_ARCH_STARCODER, "starcoder" },
{ LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" }, { LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_MODERN_BERT, "modern-bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" }, { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_NEO_BERT, "neo-bert" }, { LLM_ARCH_NEO_BERT, "neo-bert" },
@ -204,6 +205,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" }, { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" }, { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" }, { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
@ -214,6 +216,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
@ -778,6 +781,20 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
}; };
case LLM_ARCH_MODERN_BERT:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_TOKEN_EMBD_NORM,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_ATTN_QKV,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
};
case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_JINA_BERT_V2:
return { return {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,

View File

@ -24,6 +24,7 @@ enum llm_arch {
LLM_ARCH_STARCODER, LLM_ARCH_STARCODER,
LLM_ARCH_REFACT, LLM_ARCH_REFACT,
LLM_ARCH_BERT, LLM_ARCH_BERT,
LLM_ARCH_MODERN_BERT,
LLM_ARCH_NOMIC_BERT, LLM_ARCH_NOMIC_BERT,
LLM_ARCH_NOMIC_BERT_MOE, LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_NEO_BERT, LLM_ARCH_NEO_BERT,
@ -208,6 +209,7 @@ enum llm_kv {
LLM_KV_ATTENTION_GATE_LORA_RANK, LLM_KV_ATTENTION_GATE_LORA_RANK,
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE, LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH, LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@ -218,6 +220,7 @@ enum llm_kv {
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_FREQ_BASE_SWA,
LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR, LLM_KV_ROPE_SCALING_FACTOR,

View File

@ -459,23 +459,22 @@ llama_context::llama_context(
} }
llama_context::~llama_context() { llama_context::~llama_context() {
// FIXME this currently results in a use-after-free bug if the model is freed before the context if (!model.hparams.no_alloc) {
// if (!model.hparams.no_alloc) { for (size_t i = 0; i < backend_ptrs.size(); ++i) {
// for (size_t i = 0; i < backend_ptrs.size(); ++i) { ggml_backend_t backend = backend_ptrs[i];
// ggml_backend_t backend = backend_ptrs[i]; ggml_backend_buffer_type_t buft = backend_buft[i];
// ggml_backend_buffer_type_t buft = backend_buft[i];
// const size_t size_exp = backend_buf_exp_size[i]; const size_t size_exp = backend_buf_exp_size[i];
// const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend); const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
// if (size_exp == size_act) { if (size_exp == size_act) {
// LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n", LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0)); __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
// } else { } else {
// LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n", LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0)); __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
// } }
// } }
// } }
ggml_opt_free(opt_ctx); ggml_opt_free(opt_ctx);
} }

View File

@ -462,6 +462,29 @@ namespace GGUFMeta {
return get_key_or_arr(llm_kv(kid), result, n, required); return get_key_or_arr(llm_kv(kid), result, n, required);
} }
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
const std::string key = llm_kv(kid);
const int id = gguf_find_key(meta.get(), key.c_str());
if (id < 0) {
if (required) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
}
return false;
}
// throw and error if type is an array
if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
if (required) {
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
}
return false;
}
return get_key(key, result, required);
}
// TODO: this is not very clever - figure out something better // TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);

View File

@ -131,6 +131,8 @@ struct llama_model_loader {
template<typename T> template<typename T>
bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true); bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);
bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
std::string get_arch_name() const; std::string get_arch_name() const;
enum llm_arch get_arch() const; enum llm_arch get_arch() const;

View File

@ -31,12 +31,14 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_17M: return "17M"; case LLM_TYPE_17M: return "17M";
case LLM_TYPE_22M: return "22M"; case LLM_TYPE_22M: return "22M";
case LLM_TYPE_33M: return "33M"; case LLM_TYPE_33M: return "33M";
case LLM_TYPE_47M: return "47M";
case LLM_TYPE_60M: return "60M"; case LLM_TYPE_60M: return "60M";
case LLM_TYPE_70M: return "70M"; case LLM_TYPE_70M: return "70M";
case LLM_TYPE_80M: return "80M"; case LLM_TYPE_80M: return "80M";
case LLM_TYPE_109M: return "109M"; case LLM_TYPE_109M: return "109M";
case LLM_TYPE_137M: return "137M"; case LLM_TYPE_137M: return "137M";
case LLM_TYPE_140M: return "140M"; case LLM_TYPE_140M: return "140M";
case LLM_TYPE_149M: return "149M";
case LLM_TYPE_160M: return "160M"; case LLM_TYPE_160M: return "160M";
case LLM_TYPE_190M: return "190M"; case LLM_TYPE_190M: return "190M";
case LLM_TYPE_220M: return "220M"; case LLM_TYPE_220M: return "220M";
@ -46,6 +48,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_335M: return "335M"; case LLM_TYPE_335M: return "335M";
case LLM_TYPE_350M: return "350M"; case LLM_TYPE_350M: return "350M";
case LLM_TYPE_360M: return "360M"; case LLM_TYPE_360M: return "360M";
case LLM_TYPE_395M: return "395M";
case LLM_TYPE_410M: return "410M"; case LLM_TYPE_410M: return "410M";
case LLM_TYPE_450M: return "450M"; case LLM_TYPE_450M: return "450M";
case LLM_TYPE_475M: return "475M"; case LLM_TYPE_475M: return "475M";
@ -875,6 +878,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_MODERN_BERT:
{
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
uint32_t swa_period = 3;
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case 12:
type = LLM_TYPE_47M; break; // granite-embedding-small
case 22:
type = LLM_TYPE_149M; break; // modern-bert-base
case 28:
type = LLM_TYPE_395M; break; // modern-bert-large
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_JINA_BERT_V2:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@ -3155,6 +3186,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0); layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
} }
} break; } break;
case LLM_ARCH_MODERN_BERT:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
for(int i = 0; i < n_layer; ++i) {
auto& layer = layers[i];
if ( i != 0 ) {
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
} else{
// layer 0 uses identity
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
}
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
}
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
} break;
case LLM_ARCH_NEO_BERT: case LLM_ARCH_NEO_BERT:
{ {
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -5181,9 +5243,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_group = hparams.ssm_n_group; const int64_t n_group = hparams.ssm_n_group;
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head; const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
const int64_t n_ff_shexp = hparams.n_ff_shexp;
// embeddings // embeddings
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -5235,6 +5294,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
} else { } else {
if (n_expert != 0) { if (n_expert != 0) {
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
const int64_t n_ff_shexp = hparams.n_ff_shexp;
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0); layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0); layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
@ -7089,6 +7151,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_NEO_BERT: case LLM_ARCH_NEO_BERT:
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_GEMMA_EMBEDDING: case LLM_ARCH_GEMMA_EMBEDDING:
case LLM_ARCH_DREAM: case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA: case LLM_ARCH_LLADA:
@ -7248,6 +7311,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{ {
llm = std::make_unique<llm_build_bert>(*this, params); llm = std::make_unique<llm_build_bert>(*this, params);
} break; } break;
case LLM_ARCH_MODERN_BERT:
{
llm = std::make_unique<llm_build_modern_bert<true>>(*this, params);
} break;
case LLM_ARCH_NEO_BERT: case LLM_ARCH_NEO_BERT:
{ {
llm = std::make_unique<llm_build_neo_bert>(*this, params); llm = std::make_unique<llm_build_neo_bert>(*this, params);
@ -7816,6 +7883,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_DBRX: case LLM_ARCH_DBRX:
case LLM_ARCH_BERT: case LLM_ARCH_BERT:
case LLM_ARCH_JINA_BERT_V3: case LLM_ARCH_JINA_BERT_V3:
case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_STABLELM: case LLM_ARCH_STABLELM:

View File

@ -24,12 +24,14 @@ enum llm_type {
LLM_TYPE_17M, LLM_TYPE_17M,
LLM_TYPE_22M, LLM_TYPE_22M,
LLM_TYPE_33M, LLM_TYPE_33M,
LLM_TYPE_47M,
LLM_TYPE_60M, LLM_TYPE_60M,
LLM_TYPE_70M, LLM_TYPE_70M,
LLM_TYPE_80M, LLM_TYPE_80M,
LLM_TYPE_109M, LLM_TYPE_109M,
LLM_TYPE_137M, LLM_TYPE_137M,
LLM_TYPE_140M, LLM_TYPE_140M,
LLM_TYPE_149M,
LLM_TYPE_160M, LLM_TYPE_160M,
LLM_TYPE_190M, LLM_TYPE_190M,
LLM_TYPE_220M, LLM_TYPE_220M,
@ -39,6 +41,7 @@ enum llm_type {
LLM_TYPE_335M, LLM_TYPE_335M,
LLM_TYPE_350M, LLM_TYPE_350M,
LLM_TYPE_360M, LLM_TYPE_360M,
LLM_TYPE_395M,
LLM_TYPE_410M, LLM_TYPE_410M,
LLM_TYPE_450M, LLM_TYPE_450M,
LLM_TYPE_475M, LLM_TYPE_475M,

View File

@ -1878,7 +1878,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "jina-v2-es" || tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" || tokenizer_pre == "jina-v2-de" ||
tokenizer_pre == "a.x-4.0" || tokenizer_pre == "a.x-4.0" ||
tokenizer_pre == "mellum") { tokenizer_pre == "mellum" ||
tokenizer_pre == "modern-bert" ) {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if ( } else if (
tokenizer_pre == "jina-v1-en" || tokenizer_pre == "jina-v1-en" ||
@ -2528,6 +2529,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) { for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false); _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
} }
} else if (_contains_any(model_name, {"modern-bert"})) {
if (token_to_id.count("[MASK]") == 0 ) {
LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
}
else {
_set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
}
} }
} }
} }

View File

@ -327,6 +327,11 @@ struct llm_build_mistral3 : public llm_graph_context {
llm_build_mistral3(const llama_model & model, const llm_graph_params & params); llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
}; };
template <bool iswa>
struct llm_build_modern_bert : public llm_graph_context {
llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_mpt : public llm_graph_context { struct llm_build_mpt : public llm_graph_context {
llm_build_mpt(const llama_model & model, const llm_graph_params & params); llm_build_mpt(const llama_model & model, const llm_graph_params & params);
}; };

126
src/models/modern-bert.cpp Normal file
View File

@ -0,0 +1,126 @@
#include "models.h"
template <bool iswa>
llm_build_modern_bert<iswa>::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * inp_pos = build_inp_pos();
// construct input embeddings (token, type, position)
inpL = build_inp_embd(model.tok_embd);
cb(inpL, "inp_embd", -1);
// embed layer norm
inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1);
cb(inpL, "inp_norm", -1);
ggml_tensor * inp_out_ids = build_inp_out_ids();
auto * inp_attn = build_attn_inp_no_cache();
for (int il = 0; il < n_layer; ++il) {
float freq_base_l = 0.0f;
if constexpr (iswa) {
freq_base_l = model.get_rope_freq_base(cparams, il);
} else {
freq_base_l = freq_base;
}
cur = inpL;
// attention layer norm
if (model.layers[il].attn_norm) {
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM, il);
cb(cur, "attn_norm", il);
}
// self attention
cur = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
const size_t type_size = ggml_type_size(cur->type);
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd));
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa));
// RoPE
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
// re-add the layer input
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il);
// attention layer norm
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
// attentions bypass the intermediate layer
cur = ggml_add(ctx0, cur, ffn_inp);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM, -1);
cb(cur, "final_norm_out", -1);
if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
// extracting cls token
cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
cb(cur, "cls_pooled_embd", -1);
}
cb(cur, "res_embd", -1);
res->t_embd = cur;
ggml_build_forward_expand(gf, cur);
}
// Explicit template instantiations
template struct llm_build_modern_bert<false>;
template struct llm_build_modern_bert<true>;

View File

@ -2329,11 +2329,13 @@ struct test_set_rows : public test_case {
struct test_rope_set_rows : public test_case { struct test_rope_set_rows : public test_case {
const ggml_type type; const ggml_type type;
const ggml_type type_idx; const ggml_type type_idx;
const std::array<int64_t, 4> ne; const std::array<int64_t, 4> ne_a;
int mode; int mode;
const int n_ctx{512};
const int n_dims{128};
std::string vars() override { std::string vars() override {
return VARS_TO_STR4(type, type_idx, ne, mode); return VARS_TO_STR4(type, type_idx, ne_a, mode);
} }
std::string op_desc(ggml_tensor * t) override { std::string op_desc(ggml_tensor * t) override {
@ -2345,24 +2347,51 @@ struct test_rope_set_rows : public test_case {
test_rope_set_rows(ggml_type type, test_rope_set_rows(ggml_type type,
ggml_type type_idx, ggml_type type_idx,
std::array<int64_t, 4> ne, std::array<int64_t, 4> ne_a,
int mode) int mode)
: type(type), type_idx(type_idx), ne(ne), mode(mode) {} : type(type), type_idx(type_idx), ne_a(ne_a), mode(mode) {}
ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * src = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne[0], ne[1], ne[2], 1); ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne_a[0], ne_a[1], ne_a[2], 1);
ggml_set_name(src, "src"); ggml_set_name(a, "a");
ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]); const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
ggml_tensor * rope = ggml_rope(ctx, src, pos, ne[0], mode); ggml_tensor * pos;
if (is_mrope || is_vision) {
pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2] * 4);
} else {
pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
}
ggml_set_name(pos, "pos");
ggml_tensor * view = ggml_view_2d(ctx, rope, ne[0] * ne[1], ne[2], rope->nb[2], 0); float fs = 1.4245f;
float ef = 0.7465f;
float af = 1.4245f;
ggml_tensor * freq = nullptr;
ggml_tensor * dst = ggml_new_tensor_4d(ctx, type, ne[0] * ne[1], ne[2] * ne[3], 1, 1); ggml_tensor * rope = nullptr;
if (is_mrope) {
if (is_vision) {
GGML_ASSERT(n_dims/4 > 0);
int rope_sections[4] = {n_dims/4, n_dims/4, 0, 0}; // Vision-RoPE only use first two dimension for image (x, y) coordinate
rope = ggml_rope_multi(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
} else {
GGML_ASSERT(n_dims/3 > 0);
int rope_sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0};
rope = ggml_rope_multi(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
}
} else {
rope = ggml_rope(ctx, a, pos, ne_a[0], mode);
}
ggml_tensor * view = ggml_view_2d(ctx, rope, ne_a[0] * ne_a[1], ne_a[2], rope->nb[2], 0);
ggml_tensor * dst = ggml_new_tensor_4d(ctx, type, ne_a[0] * ne_a[1], ne_a[2] * ne_a[3], 1, 1);
ggml_set_name(dst, "dst"); ggml_set_name(dst, "dst");
ggml_tensor * row_idxs = ggml_new_tensor_3d(ctx, type_idx, ne[2], 1, 1); ggml_tensor * row_idxs = ggml_new_tensor_3d(ctx, type_idx, ne_a[2], 1, 1);
ggml_set_name(row_idxs, "row_idxs"); ggml_set_name(row_idxs, "row_idxs");
ggml_tensor * out = ggml_set_rows(ctx, dst, view, row_idxs); ggml_tensor * out = ggml_set_rows(ctx, dst, view, row_idxs);
@ -2373,14 +2402,26 @@ struct test_rope_set_rows : public test_case {
void initialize_tensors(ggml_context * ctx) override { void initialize_tensors(ggml_context * ctx) override {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->type == GGML_TYPE_I64 || t->type == GGML_TYPE_I32) { if (strcmp(t->name, "row_idxs") == 0) {
if (ggml_is_view_op(t->op)) { if (ggml_is_view_op(t->op)) {
continue; continue;
} }
init_set_rows_row_ids(t, ne_a[2]);
init_set_rows_row_ids(t, ne[2]); } else if (t->type == GGML_TYPE_I32) {
// pos
const int num_pos_ids = (mode & GGML_ROPE_TYPE_MROPE) ? ne_a[2] * 4 : ne_a[2];
std::vector<int> data(num_pos_ids);
for (int i = 0; i < num_pos_ids; i++) {
data[i] = rand() % n_ctx;
}
ggml_backend_tensor_set(t, data.data(), 0, num_pos_ids * sizeof(int));
} else { } else {
init_tensor_uniform(t); if (t->ne[0] == n_dims/2) {
// frequency factors in the range [0.9f, 1.1f]
init_tensor_uniform(t, 0.9f, 1.1f);
} else {
init_tensor_uniform(t);
}
} }
} }
} }
@ -6854,10 +6895,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
} }
} }
for (int mode : { GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX }) { for (int mode : { GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_VISION }) {
for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, 1, 100 }, mode)); for (int ne2 : {1, 8, 512}) {
test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, 512, 1 }, mode)); test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, ne2, 1 }, mode));
test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, ne2, 3 }, mode));
}
} }
} }

View File

@ -1196,6 +1196,9 @@ int main(int argc, const char ** argv) {
test_sampler_chain(); test_sampler_chain();
llama_free(ctx);
llama_model_free(model);
fprintf(stdout, "All tests passed.\n"); fprintf(stdout, "All tests passed.\n");
return 0; return 0;
} }

View File

@ -300,8 +300,8 @@ int main(int argc, char **argv) {
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
} }
llama_model_free(model);
llama_free(ctx); llama_free(ctx);
llama_model_free(model);
llama_backend_free(); llama_backend_free();

View File

@ -146,8 +146,8 @@ int main(int argc, char **argv) {
} }
} }
llama_model_free(model);
llama_free(ctx); llama_free(ctx);
llama_model_free(model);
llama_backend_free(); llama_backend_free();

View File

@ -116,8 +116,8 @@ int main(int argc, char ** argv) {
} }
} }
llama_model_free(model);
llama_free(ctx); llama_free(ctx);
llama_model_free(model);
llama_backend_free(); llama_backend_free();

View File

@ -55,6 +55,7 @@ int main(int argc, char ** argv) {
if (ctx == NULL) { if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
llama_model_free(model);
return 1; return 1;
} }
@ -108,6 +109,8 @@ int main(int argc, char ** argv) {
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) { if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
LOG_ERR("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
llama_free(ctx);
llama_model_free(model);
return 1; return 1;
} }
} }
@ -147,6 +150,8 @@ int main(int argc, char ** argv) {
if (!decode_helper(ctx, batch, ctx_params.n_batch, false)) { if (!decode_helper(ctx, batch, ctx_params.n_batch, false)) {
LOG_ERR("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
llama_free(ctx);
llama_model_free(model);
return 1; return 1;
} }
@ -165,6 +170,8 @@ int main(int argc, char ** argv) {
common_batch_add(batch, get_token_rand(), pp + 0, { 0 }, true); common_batch_add(batch, get_token_rand(), pp + 0, { 0 }, true);
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) { if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
LOG_ERR("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
llama_free(ctx);
llama_model_free(model);
return 1; return 1;
} }
llama_memory_seq_rm(mem, 0, pp, -1); llama_memory_seq_rm(mem, 0, pp, -1);
@ -184,6 +191,8 @@ int main(int argc, char ** argv) {
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) { if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
LOG_ERR("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
llama_free(ctx);
llama_model_free(model);
return 1; return 1;
} }
} }
@ -200,6 +209,8 @@ int main(int argc, char ** argv) {
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) { if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
LOG_ERR("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
llama_free(ctx);
llama_model_free(model);
return 1; return 1;
} }
} }

View File

@ -1 +1,187 @@
TODO # llama.cpp/tools/cli
## Usage
<!-- HELP_START -->
<!-- IMPORTANT: The list below is auto-generated by llama-gen-docs; do NOT modify it manually -->
### Common params
| Argument | Explanation |
| -------- | ----------- |
| `-h, --help, --usage` | print usage and exit |
| `--version` | show version and build info |
| `-cl, --cache-list` | show list of models in cache |
| `--completion-bash` | print source-able bash completion script for llama.cpp |
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
| `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0) |
| `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0) |
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50) |
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) |
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity)<br/>(env: LLAMA_ARG_N_PREDICT) |
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `-p, --prompt PROMPT` | prompt to start generation with; for system message, use -sys |
| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) |
| `-f, --file FNAME` | a file containing the prompt (default: none) |
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
| `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit |
| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
| `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
| `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
| `-fitt, --fit-target MiB` | target margin per device for --fit option, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096<br/>(env: LLAMA_ARG_FIT_CTX) |
| `--check-tensors` | check model tensor data for invalid values (default: false) |
| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) |
| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)<br/>note: use comma-separated values |
| `--control-vector FNAME` | add a control vector<br/>note: use comma-separated values to add multiple control vectors |
| `--control-vector-scaled FNAME:SCALE,...` | add a control vector with user defined scaling SCALE<br/>note: use comma-separated values (format: FNAME:SCALE,...) |
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |
| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
| `--log-disable` | Log disable |
| `--log-file FNAME` | Log to file<br/>(env: LLAMA_LOG_FILE) |
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
| `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
### Sampling params
| Argument | Explanation |
| -------- | ----------- |
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) |
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
| `--temp N` | temperature (default: 0.8) |
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
| `--dry-base N` | set DRY sampling base value (default: 1.75) |
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers |
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
| `--grammar-file FNAME` | file to read grammar from |
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
### CLI-specific params
| Argument | Explanation |
| -------- | ----------- |
| `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) |
| `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal |
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) |
| `--show-timings, --no-show-timings` | whether to show timing information after each response (default: true)<br/>(env: LLAMA_ARG_SHOW_TIMINGS) |
| `-sysf, --system-prompt-file FNAME` | a file containing the system prompt (default: none) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
| `-sp, --special` | special tokens output enabled (default: false) |
| `-cnv, --conversation, -no-cnv, --no-conversation` | whether to run in conversation mode:<br/>- does not print special tokens and suffix/prefix<br/>- interactive mode is also enabled<br/>(default: auto enabled if chat template is available) |
| `-st, --single-turn` | run conversation for a single turn only, then exit when done<br/>will not be interactive if first turn is predefined with --prompt<br/>(default: false) |
| `-mli, --multiline-input` | allows you to write or paste multiple lines without ending each in '\' |
| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
| `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files |
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
| `-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
| `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
| `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
| `--chat-template-kwargs STRING` | sets additional params for the json template parser<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
| `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
| `--gpt-oss-20b-default` | use gpt-oss-20b (note: can download weights from the internet) |
| `--gpt-oss-120b-default` | use gpt-oss-120b (note: can download weights from the internet) |
| `--vision-gemma-4b-default` | use Gemma 3 4B QAT (note: can download weights from the internet) |
| `--vision-gemma-12b-default` | use Gemma 3 12B QAT (note: can download weights from the internet) |
<!-- HELP_END -->

View File

@ -216,7 +216,7 @@ int main(int argc, char ** argv) {
ctx_cli.ctx_server.start_loop(); ctx_cli.ctx_server.start_loop();
}); });
auto inf = ctx_cli.ctx_server.get_info(); auto inf = ctx_cli.ctx_server.get_meta();
std::string modalities = "text"; std::string modalities = "text";
if (inf.has_inp_image) { if (inf.has_inp_image) {
modalities += ", vision"; modalities += ", vision";

View File

@ -5,13 +5,14 @@ This example program allows you to use various LLaMA language models easily and
## Table of Contents ## Table of Contents
1. [Quick Start](#quick-start) 1. [Quick Start](#quick-start)
2. [Common Options](#common-options) 2. [Usage](#usage)
3. [Input Prompts](#input-prompts) 3. [Common Options](#common-options)
4. [Interaction](#interaction) 4. [Input Prompts](#input-prompts)
5. [Context Management](#context-management) 5. [Interaction](#interaction)
6. [Generation Flags](#generation-flags) 6. [Context Management](#context-management)
7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options) 7. [Generation Flags](#generation-flags)
8. [Additional Options](#additional-options) 8. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
9. [Additional Options](#additional-options)
## Quick Start ## Quick Start
@ -82,6 +83,177 @@ Once downloaded, place your model in the models folder in llama.cpp.
llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
``` ```
## Usage
<!-- HELP_START -->
<!-- IMPORTANT: The list below is auto-generated by llama-gen-docs; do NOT modify it manually -->
### Common params
| Argument | Explanation |
| -------- | ----------- |
| `-h, --help, --usage` | print usage and exit |
| `--version` | show version and build info |
| `-cl, --cache-list` | show list of models in cache |
| `--completion-bash` | print source-able bash completion script for llama.cpp |
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
| `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0) |
| `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0) |
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50) |
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) |
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `-p, --prompt PROMPT` | prompt to start generation with; for system message, use -sys |
| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) |
| `-f, --file FNAME` | a file containing the prompt (default: none) |
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
| `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit |
| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
| `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
| `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
| `-fitt, --fit-target MiB` | target margin per device for --fit option, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096<br/>(env: LLAMA_ARG_FIT_CTX) |
| `--check-tensors` | check model tensor data for invalid values (default: false) |
| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) |
| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)<br/>note: use comma-separated values |
| `--control-vector FNAME` | add a control vector<br/>note: use comma-separated values to add multiple control vectors |
| `--control-vector-scaled FNAME:SCALE,...` | add a control vector with user defined scaling SCALE<br/>note: use comma-separated values (format: FNAME:SCALE,...) |
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |
| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
| `--log-disable` | Log disable |
| `--log-file FNAME` | Log to file<br/>(env: LLAMA_LOG_FILE) |
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
| `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
### Sampling params
| Argument | Explanation |
| -------- | ----------- |
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) |
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
| `--temp N` | temperature (default: 0.8) |
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
| `--dry-base N` | set DRY sampling base value (default: 1.75) |
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers |
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
| `--grammar-file FNAME` | file to read grammar from |
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
### Completion-specific params
| Argument | Explanation |
| -------- | ----------- |
| `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) |
| `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) |
| `-sysf, --system-prompt-file FNAME` | a file containing the system prompt (default: none) |
| `-ptc, --print-token-count N` | print token count every N tokens (default: -1) |
| `--prompt-cache FNAME` | file to cache prompt state for faster startup (default: none) |
| `--prompt-cache-all` | if specified, saves user input and generations to cache as well |
| `--prompt-cache-ro` | if specified, uses the prompt cache but does not update it |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
| `-sp, --special` | special tokens output enabled (default: false) |
| `-cnv, --conversation, -no-cnv, --no-conversation` | whether to run in conversation mode:<br/>- does not print special tokens and suffix/prefix<br/>- interactive mode is also enabled<br/>(default: auto enabled if chat template is available) |
| `-st, --single-turn` | run conversation for a single turn only, then exit when done<br/>will not be interactive if first turn is predefined with --prompt<br/>(default: false) |
| `-i, --interactive` | run in interactive mode (default: false) |
| `-if, --interactive-first` | run in interactive mode and wait for input right away (default: false) |
| `-mli, --multiline-input` | allows you to write or paste multiple lines without ending each in '\' |
| `--in-prefix-bos` | prefix BOS to user inputs, preceding the `--in-prefix` string |
| `--in-prefix STRING` | string to prefix user inputs with (default: empty) |
| `--in-suffix STRING` | string to suffix after user inputs with (default: empty) |
| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `-gan, --grp-attn-n N` | group-attention factor (default: 1)<br/>(env: LLAMA_ARG_GRP_ATTN_N) |
| `-gaw, --grp-attn-w N` | group-attention width (default: 512)<br/>(env: LLAMA_ARG_GRP_ATTN_W) |
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
<!-- HELP_END -->
## Common Options ## Common Options
In this section, we cover the most commonly used options for running the `llama-completion` program with the LLaMA models: In this section, we cover the most commonly used options for running the `llama-completion` program with the LLaMA models:

View File

@ -2102,6 +2102,8 @@ int main(int argc, char ** argv) {
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads); struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) { if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str()); fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
llama_free(ctx);
llama_model_free(lmodel);
exit(1); exit(1);
} }
tpp.strict_cpu = t.cpu_strict; tpp.strict_cpu = t.cpu_strict;
@ -2111,6 +2113,8 @@ int main(int argc, char ** argv) {
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
if (!threadpool) { if (!threadpool) {
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
llama_free(ctx);
llama_model_free(lmodel);
exit(1); exit(1);
} }
@ -2126,6 +2130,8 @@ int main(int argc, char ** argv) {
bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
if (!res) { if (!res) {
fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__); fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1); exit(1);
} }
} }
@ -2136,6 +2142,8 @@ int main(int argc, char ** argv) {
bool res = test_gen(ctx, 1, t.n_threads); bool res = test_gen(ctx, 1, t.n_threads);
if (!res) { if (!res) {
fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__); fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1); exit(1);
} }
} }
@ -2164,6 +2172,8 @@ int main(int argc, char ** argv) {
bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads); bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
if (!res) { if (!res) {
fprintf(stderr, "%s: error: failed to run depth\n", __func__); fprintf(stderr, "%s: error: failed to run depth\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1); exit(1);
} }
@ -2189,6 +2199,8 @@ int main(int argc, char ** argv) {
bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
if (!res) { if (!res) {
fprintf(stderr, "%s: error: failed to run prompt\n", __func__); fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1); exit(1);
} }
} }
@ -2200,6 +2212,8 @@ int main(int argc, char ** argv) {
bool res = test_gen(ctx, t.n_gen, t.n_threads); bool res = test_gen(ctx, t.n_gen, t.n_threads);
if (!res) { if (!res) {
fprintf(stderr, "%s: error: failed to run gen\n", __func__); fprintf(stderr, "%s: error: failed to run gen\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1); exit(1);
} }
} }

View File

@ -23,9 +23,11 @@ For the ful list of features, please refer to [server's changelog](https://githu
## Usage ## Usage
<!-- Note for contributors: The list below is generated by llama-gen-docs --> <!-- HELP_START -->
**Common params** <!-- IMPORTANT: The list below is auto-generated by llama-gen-docs; do NOT modify it manually -->
### Common params
| Argument | Explanation | | Argument | Explanation |
| -------- | ----------- | | -------- | ----------- |
@ -38,13 +40,13 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | | `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask | | `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> | | `--cpu-strict <0\|1>` | use strict CPU placement (default: 0) |
| `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0)<br/> | | `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0) |
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> | | `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50) |
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) | | `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch | | `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | | `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> | | `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) |
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) | | `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) | | `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity)<br/>(env: LLAMA_ARG_N_PREDICT) | | `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity)<br/>(env: LLAMA_ARG_N_PREDICT) |
@ -114,7 +116,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) | | `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
**Sampling params** ### Sampling params
| Argument | Explanation | | Argument | Explanation |
| -------- | ----------- | | -------- | ----------- |
@ -138,7 +140,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--dry-base N` | set DRY sampling base value (default: 1.75) | | `--dry-base N` | set DRY sampling base value (default: 1.75) |
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) | | `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) | | `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers<br/> | | `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers |
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) | | `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) | | `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | | `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
@ -151,7 +153,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
**Server-specific params** ### Server-specific params
| Argument | Explanation | | Argument | Explanation |
| -------- | ----------- | | -------- | ----------- |
@ -159,7 +161,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `-kvu, --kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) | | `-kvu, --kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
| `-sp, --special` | special tokens output enabled (default: false) | | `-sp, --special` | special tokens output enabled (default: false) |
| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) | | `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
@ -208,8 +210,9 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)<br/> | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `--sleep-idle-seconds SECONDS` | number of seconds of idleness after which the server will sleep (default: -1; -1 = disabled) |
| `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) | | `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
| `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) | | `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) | | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
@ -234,6 +237,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--vision-gemma-4b-default` | use Gemma 3 4B QAT (note: can download weights from the internet) | | `--vision-gemma-4b-default` | use Gemma 3 4B QAT (note: can download weights from the internet) |
| `--vision-gemma-12b-default` | use Gemma 3 12B QAT (note: can download weights from the internet) | | `--vision-gemma-12b-default` | use Gemma 3 12B QAT (note: can download weights from the internet) |
<!-- HELP_END -->
Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
@ -1567,7 +1571,6 @@ Load a model
Payload: Payload:
- `model`: name of the model to be loaded. - `model`: name of the model to be loaded.
- `extra_args`: (optional) an array of additional arguments to be passed to the model instance. Note: you must start the server with `--models-allow-extra-args` to enable this feature.
```json ```json
{ {

Binary file not shown.

View File

@ -115,26 +115,14 @@ bool lora_should_clear_cache(
!lora_all_alora(next)); !lora_all_alora(next));
} }
std::vector<common_adapter_lora_info> parse_lora_request( std::map<int, float> parse_lora_request(const json & data) {
const std::vector<common_adapter_lora_info> & lora_base, std::map<int, float> lora;
const json & data) {
std::vector<common_adapter_lora_info> lora(lora_base);
int max_idx = lora.size();
// clear existing value
for (auto & entry : lora) {
entry.scale = 0.0f;
}
// set value // set value
for (const auto & entry : data) { for (const auto & entry : data) {
int id = json_value(entry, "id", -1); int id = json_value(entry, "id", -1);
float scale = json_value(entry, "scale", 0.0f); float scale = json_value(entry, "scale", 0.0f);
if (0 <= id && id < max_idx) { lora[id] = scale;
lora[id].scale = scale;
} else {
throw std::runtime_error("invalid adapter id");
}
} }
return lora; return lora;
@ -1435,7 +1423,7 @@ std::string safe_json_to_str(const json & data) {
// TODO: reuse llama_detokenize // TODO: reuse llama_detokenize
template <class Iter> template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { static std::string tokens_to_str(const llama_vocab * ctx, Iter begin, Iter end) {
std::string ret; std::string ret;
for (; begin != end; ++begin) { for (; begin != end; ++begin) {
ret += common_token_to_piece(ctx, *begin); ret += common_token_to_piece(ctx, *begin);
@ -1445,7 +1433,12 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
} }
std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens) { std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens) {
return tokens_to_str(ctx, tokens.begin(), tokens.end()); auto model = llama_get_model(ctx);
return tokens_to_str(llama_model_get_vocab(model), tokens.begin(), tokens.end());
}
std::string tokens_to_str(const llama_vocab * vocab, const llama_tokens & tokens) {
return tokens_to_str(vocab, tokens.begin(), tokens.end());
} }
// format incomplete utf-8 multibyte character for output // format incomplete utf-8 multibyte character for output

View File

@ -107,9 +107,7 @@ bool lora_should_clear_cache(
const std::vector<common_adapter_lora_info> & current, const std::vector<common_adapter_lora_info> & current,
const std::vector<common_adapter_lora_info> & next); const std::vector<common_adapter_lora_info> & next);
std::vector<common_adapter_lora_info> parse_lora_request( std::map<int, float> parse_lora_request(const json & data);
const std::vector<common_adapter_lora_info> & lora_base,
const json & data);
bool are_lora_equal( bool are_lora_equal(
const std::vector<common_adapter_lora_info> & l1, const std::vector<common_adapter_lora_info> & l1,
@ -325,6 +323,7 @@ std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int i
std::string safe_json_to_str(const json & data); std::string safe_json_to_str(const json & data);
std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens); std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens);
std::string tokens_to_str(const llama_vocab * vocab, const llama_tokens & tokens);
// format incomplete utf-8 multibyte character for output // format incomplete utf-8 multibyte character for output
std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token); std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token);

File diff suppressed because it is too large Load Diff

View File

@ -9,11 +9,35 @@
struct server_context_impl; // private implementation struct server_context_impl; // private implementation
struct server_context_info { struct server_context_meta {
std::string build_info; std::string build_info;
std::string model_name; std::string model_name;
std::string model_path;
bool has_mtmd;
bool has_inp_image; bool has_inp_image;
bool has_inp_audio; bool has_inp_audio;
json json_webui_settings;
int slot_n_ctx;
enum llama_pooling_type pooling_type;
// chat template
std::string chat_template;
std::string chat_template_tool_use;
// tokens
std::string bos_token_str;
std::string eos_token_str;
llama_token fim_pre_token;
llama_token fim_sub_token;
llama_token fim_mid_token;
// model meta
enum llama_vocab_type model_vocab_type;
int32_t model_vocab_n_tokens;
int32_t model_n_ctx_train;
int32_t model_n_embd_inp;
uint64_t model_n_params;
uint64_t model_size;
}; };
struct server_context { struct server_context {
@ -33,14 +57,15 @@ struct server_context {
void terminate(); void terminate();
// get the underlaying llama_context, can return nullptr if sleeping // get the underlaying llama_context, can return nullptr if sleeping
// not thread-safe, should only be used from the main thread
llama_context * get_llama_context() const; llama_context * get_llama_context() const;
// get a new response reader, used by CLI application // get a new response reader, used by CLI application
server_response_reader get_response_reader(); server_response_reader get_response_reader();
// get server info // get server metadata (read-only), can only be called after load_model()
// used by CLI application // not thread-safe, should only be used from the main thread
server_context_info get_info() const; server_context_meta get_meta() const;
}; };
@ -48,13 +73,17 @@ struct server_context {
struct server_res_generator; struct server_res_generator;
struct server_routes { struct server_routes {
server_routes(const common_params & params, server_context & ctx_server, std::function<bool()> is_ready = []() { return true; }) server_routes(const common_params & params, server_context & ctx_server);
: params(params), ctx_server(*ctx_server.impl), is_ready(is_ready) {
init_routes();
}
void init_routes(); void init_routes();
// note: this is not thread-safe and can only when ctx_http.is_ready is false
void update_meta(const server_context & ctx_server) {
this->meta = std::make_unique<server_context_meta>(ctx_server.get_meta());
}
// handlers using lambda function, so that they can capture `this` without `std::bind` // handlers using lambda function, so that they can capture `this` without `std::bind`
// they won't be called until ctx_http.is_ready is set to true
server_http_context::handler_t get_health; server_http_context::handler_t get_health;
server_http_context::handler_t get_metrics; server_http_context::handler_t get_metrics;
server_http_context::handler_t get_slots; server_http_context::handler_t get_slots;
@ -78,13 +107,24 @@ struct server_routes {
server_http_context::handler_t get_lora_adapters; server_http_context::handler_t get_lora_adapters;
server_http_context::handler_t post_lora_adapters; server_http_context::handler_t post_lora_adapters;
private: private:
// TODO: move these outside of server_routes? std::unique_ptr<server_res_generator> handle_completions_impl(
const server_http_req & req,
server_task_type type,
const json & data,
const std::vector<raw_buffer> & files,
task_response_type res_type);
std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot); std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot); std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot); std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type); std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
// using unique_ptr to allow late initialization of const
std::unique_ptr<const server_context_meta> meta;
const common_params & params; const common_params & params;
server_context_impl & ctx_server; const server_context_impl & ctx_server;
std::function<bool()> is_ready;
server_queue & queue_tasks;
server_response & queue_results;
std::unique_ptr<server_res_generator> create_response(bool bypass_sleep = false);
}; };

View File

@ -177,12 +177,11 @@ bool server_http_context::init(const common_params & params) {
if (!ready) { if (!ready) {
auto tmp = string_split<std::string>(req.path, '.'); auto tmp = string_split<std::string>(req.path, '.');
if (req.path == "/" || tmp.back() == "html") { if (req.path == "/" || tmp.back() == "html") {
res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
res.status = 503; res.status = 503;
} else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") { res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
// allow the models endpoint to be accessed during loading
return true;
} else { } else {
// no endpoints is allowed to be accessed when the server is not ready
// this is to prevent any data races or inconsistent states
res.status = 503; res.status = 503;
res.set_content( res.set_content(
safe_json_to_str(json { safe_json_to_str(json {
@ -334,12 +333,16 @@ static std::map<std::string, std::string> get_headers(const httplib::Request & r
return headers; return headers;
} }
static void process_handler_response(server_http_res_ptr & response, httplib::Response & res) { // using unique_ptr for request to allow safe capturing in lambdas
using server_http_req_ptr = std::unique_ptr<server_http_req>;
static void process_handler_response(server_http_req_ptr && request, server_http_res_ptr & response, httplib::Response & res) {
if (response->is_stream()) { if (response->is_stream()) {
res.status = response->status; res.status = response->status;
set_headers(res, response->headers); set_headers(res, response->headers);
std::string content_type = response->content_type; std::string content_type = response->content_type;
// convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it
std::shared_ptr<server_http_req> q_ptr = std::move(request);
std::shared_ptr<server_http_res> r_ptr = std::move(response); std::shared_ptr<server_http_res> r_ptr = std::move(response);
const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool {
std::string chunk; std::string chunk;
@ -355,8 +358,9 @@ static void process_handler_response(server_http_res_ptr & response, httplib::Re
} }
return has_next; return has_next;
}; };
const auto on_complete = [response = r_ptr](bool) mutable { const auto on_complete = [request = q_ptr, response = r_ptr](bool) mutable {
response.reset(); // trigger the destruction of the response object response.reset(); // trigger the destruction of the response object
request.reset(); // trigger the destruction of the request object
}; };
res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete); res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete);
} else { } else {
@ -368,27 +372,29 @@ static void process_handler_response(server_http_res_ptr & response, httplib::Re
void server_http_context::get(const std::string & path, const server_http_context::handler_t & handler) const { void server_http_context::get(const std::string & path, const server_http_context::handler_t & handler) const {
pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
server_http_res_ptr response = handler(server_http_req{ server_http_req_ptr request = std::make_unique<server_http_req>(server_http_req{
get_params(req), get_params(req),
get_headers(req), get_headers(req),
req.path, req.path,
req.body, req.body,
req.is_connection_closed req.is_connection_closed
}); });
process_handler_response(response, res); server_http_res_ptr response = handler(*request);
process_handler_response(std::move(request), response, res);
}); });
} }
void server_http_context::post(const std::string & path, const server_http_context::handler_t & handler) const { void server_http_context::post(const std::string & path, const server_http_context::handler_t & handler) const {
pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
server_http_res_ptr response = handler(server_http_req{ server_http_req_ptr request = std::make_unique<server_http_req>(server_http_req{
get_params(req), get_params(req),
get_headers(req), get_headers(req),
req.path, req.path,
req.body, req.body,
req.is_connection_closed req.is_connection_closed
}); });
process_handler_response(response, res); server_http_res_ptr response = handler(*request);
process_handler_response(std::move(request), response, res);
}); });
} }

View File

@ -325,23 +325,25 @@ void server_response::terminate() {
// server_response_reader // server_response_reader
// //
void server_response_reader::post_task(server_task && task) { void server_response_reader::post_task(server_task && task, bool front) {
GGML_ASSERT(id_tasks.empty() && "post_task() can only be called once per reader"); GGML_ASSERT(id_tasks.empty() && "post_task() can only be called once per reader");
task.index = 0;
id_tasks.insert(task.id); id_tasks.insert(task.id);
states.push_back(task.create_state()); states.push_back(task.create_state());
queue_results.add_waiting_task_id(task.id); queue_results.add_waiting_task_id(task.id);
queue_tasks.post(std::move(task)); queue_tasks.post(std::move(task), front);
} }
void server_response_reader::post_tasks(std::vector<server_task> && tasks) { void server_response_reader::post_tasks(std::vector<server_task> && tasks, bool front) {
GGML_ASSERT(id_tasks.empty() && "post_tasks() can only be called once per reader"); GGML_ASSERT(id_tasks.empty() && "post_tasks() can only be called once per reader");
id_tasks = server_task::get_list_id(tasks); id_tasks = server_task::get_list_id(tasks);
states.reserve(tasks.size()); states.reserve(tasks.size());
for (size_t i = 0; i < tasks.size(); i++) { for (size_t i = 0; i < tasks.size(); i++) {
tasks[i].index = i;
states.push_back(tasks[i].create_state()); states.push_back(tasks[i].create_state());
} }
queue_results.add_waiting_tasks(tasks); queue_results.add_waiting_tasks(tasks);
queue_tasks.post(std::move(tasks)); queue_tasks.post(std::move(tasks), front);
} }
bool server_response_reader::has_next() const { bool server_response_reader::has_next() const {
@ -367,7 +369,7 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()>
} }
if (!states.empty()) { if (!states.empty()) {
// update the generation state if needed // update the generation state if needed
size_t idx = result->get_index(); const size_t idx = result->index;
GGML_ASSERT(idx < states.size()); GGML_ASSERT(idx < states.size());
result->update(states[idx]); result->update(states[idx]);
} }
@ -383,6 +385,7 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()>
server_response_reader::batch_response server_response_reader::wait_for_all(const std::function<bool()> & should_stop) { server_response_reader::batch_response server_response_reader::wait_for_all(const std::function<bool()> & should_stop) {
batch_response batch_res; batch_response batch_res;
batch_res.results.clear();
batch_res.results.resize(id_tasks.size()); batch_res.results.resize(id_tasks.size());
while (has_next()) { while (has_next()) {
auto res = next(should_stop); auto res = next(should_stop);
@ -394,7 +397,7 @@ server_response_reader::batch_response server_response_reader::wait_for_all(cons
batch_res.error = std::move(res); batch_res.error = std::move(res);
return batch_res; return batch_res;
} }
const size_t idx = res->get_index(); const size_t idx = res->index;
GGML_ASSERT(idx < batch_res.results.size() && "index out of range"); GGML_ASSERT(idx < batch_res.results.size() && "index out of range");
GGML_ASSERT(batch_res.results[idx] == nullptr && "duplicate result received"); GGML_ASSERT(batch_res.results[idx] == nullptr && "duplicate result received");
batch_res.results[idx] = std::move(res); batch_res.results[idx] = std::move(res);

View File

@ -5,6 +5,7 @@
#include <condition_variable> #include <condition_variable>
#include <deque> #include <deque>
#include <mutex> #include <mutex>
#include <vector>
#include <unordered_set> #include <unordered_set>
// struct for managing server tasks // struct for managing server tasks
@ -173,8 +174,10 @@ struct server_response_reader {
int get_new_id() { int get_new_id() {
return queue_tasks.get_new_id(); return queue_tasks.get_new_id();
} }
void post_task(server_task && task);
void post_tasks(std::vector<server_task> && tasks); // if front = true, the task will be posted to the front of the queue (high priority)
void post_task(server_task && task, bool front = false);
void post_tasks(std::vector<server_task> && tasks, bool front = false);
bool has_next() const; bool has_next() const;
// return nullptr if should_stop() is true before receiving a result // return nullptr if should_stop() is true before receiving a result

View File

@ -32,8 +32,8 @@ json task_params::to_json(bool only_metrics) const {
} }
json lora = json::array(); json lora = json::array();
for (size_t i = 0; i < this->lora.size(); ++i) { for (auto & it : this->lora) {
lora.push_back({{"id", i}, {"scale", this->lora[i].scale}}); lora.push_back({{"id", it.first}, {"scale", it.second}});
} }
if (only_metrics) { if (only_metrics) {
@ -145,12 +145,10 @@ json task_params::to_json(bool only_metrics) const {
// //
task_params server_task::params_from_json_cmpl( task_params server_task::params_from_json_cmpl(
const llama_context * ctx, const llama_vocab * vocab,
const common_params & params_base, const common_params & params_base,
const int n_ctx_slot,
const json & data) { const json & data) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
task_params params; task_params params;
// Sampling parameter defaults are loaded from the global server context (but individual requests can still them) // Sampling parameter defaults are loaded from the global server context (but individual requests can still them)
@ -223,12 +221,12 @@ task_params server_task::params_from_json_cmpl(
if (data.contains("lora")) { if (data.contains("lora")) {
if (data.at("lora").is_array()) { if (data.at("lora").is_array()) {
params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora")); params.lora = parse_lora_request(data.at("lora"));
} else { } else {
throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields"); throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
} }
} else { } else {
params.lora = params_base.lora_adapters; params.lora = {};
} }
// TODO: add more sanity checks for the input parameters // TODO: add more sanity checks for the input parameters
@ -243,11 +241,11 @@ task_params server_task::params_from_json_cmpl(
if (params.sampling.penalty_last_n == -1) { if (params.sampling.penalty_last_n == -1) {
// note: should be the slot's context and not the full context, but it's ok // note: should be the slot's context and not the full context, but it's ok
params.sampling.penalty_last_n = llama_n_ctx(ctx); params.sampling.penalty_last_n = n_ctx_slot;
} }
if (params.sampling.dry_penalty_last_n == -1) { if (params.sampling.dry_penalty_last_n == -1) {
params.sampling.dry_penalty_last_n = llama_n_ctx(ctx); params.sampling.dry_penalty_last_n = n_ctx_slot;
} }
if (params.sampling.dry_base < 1.0f) { if (params.sampling.dry_base < 1.0f) {
@ -1153,7 +1151,7 @@ json server_task_result_rerank::to_json() {
json server_task_result_cmpl_partial::to_json_anthropic() { json server_task_result_cmpl_partial::to_json_anthropic() {
json events = json::array(); json events = json::array();
bool first = (n_decoded == 1); bool first = (n_decoded == 1);
static bool text_block_started = false; bool text_block_started = false;
if (first) { if (first) {
text_block_started = false; text_block_started = false;
@ -1324,6 +1322,30 @@ json server_task_result_slot_erase::to_json() {
}; };
} }
//
// server_task_result_get_lora
//
json server_task_result_get_lora::to_json() {
json result = json::array();
for (size_t i = 0; i < loras.size(); ++i) {
auto & lora = loras[i];
json entry = {
{"id", i},
{"path", lora.info.path},
{"scale", lora.info.scale},
{"task_name", lora.info.task_name},
{"prompt_prefix", lora.info.prompt_prefix},
};
if (!lora.alora_invocation_tokens.empty()) {
entry["alora_invocation_string"] = lora.alora_invocation_string;
entry["alora_invocation_tokens"] = lora.alora_invocation_tokens;
}
result.push_back(std::move(entry));
}
return result;
}
// //
// server_task_result_apply_lora // server_task_result_apply_lora
// //

View File

@ -6,6 +6,7 @@
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include <list> #include <list>
#include <map>
// TODO: prevent including the whole server-common.h as we only use server_tokens // TODO: prevent including the whole server-common.h as we only use server_tokens
#include "server-common.h" #include "server-common.h"
@ -23,6 +24,7 @@ enum server_task_type {
SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_SAVE,
SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_RESTORE,
SERVER_TASK_TYPE_SLOT_ERASE, SERVER_TASK_TYPE_SLOT_ERASE,
SERVER_TASK_TYPE_GET_LORA,
SERVER_TASK_TYPE_SET_LORA, SERVER_TASK_TYPE_SET_LORA,
}; };
@ -60,7 +62,7 @@ struct task_params {
int64_t t_max_prompt_ms = -1; // TODO: implement int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<common_adapter_lora_info> lora; std::map<int, float> lora; // mapping adapter ID -> scale
std::vector<std::string> antiprompt; std::vector<std::string> antiprompt;
std::vector<std::string> response_fields; std::vector<std::string> response_fields;
@ -105,8 +107,10 @@ struct task_result_state {
}; };
struct server_task { struct server_task {
int id = -1; // to be filled by server_queue int id = -1; // to be filled by server_queue
int index = -1; // used when there are multiple prompts (batch request)
// TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
size_t index = 0; // used when there are multiple prompts (batch request)
// used by SERVER_TASK_TYPE_CANCEL // used by SERVER_TASK_TYPE_CANCEL
int id_target = -1; int id_target = -1;
@ -138,7 +142,7 @@ struct server_task {
bool metrics_reset_bucket = false; bool metrics_reset_bucket = false;
// used by SERVER_TASK_TYPE_SET_LORA // used by SERVER_TASK_TYPE_SET_LORA
std::vector<common_adapter_lora_info> set_lora; std::map<int, float> set_lora; // mapping adapter ID -> scale
server_task() = default; server_task() = default;
@ -149,9 +153,10 @@ struct server_task {
} }
static task_params params_from_json_cmpl( static task_params params_from_json_cmpl(
const llama_context * ctx, const llama_vocab * vocab,
const common_params & params_base, const common_params & params_base,
const json & data); const int n_ctx_slot,
const json & data);
// utility function // utility function
static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) { static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
@ -162,10 +167,9 @@ struct server_task {
return ids; return ids;
} }
server_task create_child(int id_parent, int id_child, int idx) const { server_task create_child(int id_parent, int id_child) const {
server_task copy; server_task copy;
copy.id = id_child; copy.id = id_child;
copy.index = idx;
copy.id_parent = id_parent; copy.id_parent = id_parent;
copy.params = params; copy.params = params;
copy.type = type; copy.type = type;
@ -212,6 +216,10 @@ struct result_prompt_progress {
struct server_task_result { struct server_task_result {
int id = -1; int id = -1;
int id_slot = -1; int id_slot = -1;
// TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
size_t index = 0; // to be used for batched tasks
virtual bool is_error() { virtual bool is_error() {
// only used by server_task_result_error // only used by server_task_result_error
return false; return false;
@ -220,9 +228,6 @@ struct server_task_result {
// only used by server_task_result_cmpl_* // only used by server_task_result_cmpl_*
return true; return true;
} }
virtual int get_index() {
return -1;
}
virtual void update(task_result_state &) { virtual void update(task_result_state &) {
// only used by server_task_result_cmpl_* // only used by server_task_result_cmpl_*
} }
@ -255,8 +260,6 @@ struct completion_token_output {
}; };
struct server_task_result_cmpl_final : server_task_result { struct server_task_result_cmpl_final : server_task_result {
int index = 0;
std::string content; std::string content;
llama_tokens tokens; llama_tokens tokens;
@ -289,10 +292,6 @@ struct server_task_result_cmpl_final : server_task_result {
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update() std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
bool is_updated = false; bool is_updated = false;
virtual int get_index() override {
return index;
}
virtual bool is_stop() override { virtual bool is_stop() override {
return true; // in stream mode, final responses are considered stop return true; // in stream mode, final responses are considered stop
} }
@ -318,8 +317,6 @@ struct server_task_result_cmpl_final : server_task_result {
}; };
struct server_task_result_cmpl_partial : server_task_result { struct server_task_result_cmpl_partial : server_task_result {
int index = 0;
std::string content; std::string content;
llama_tokens tokens; llama_tokens tokens;
@ -340,10 +337,6 @@ struct server_task_result_cmpl_partial : server_task_result {
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update() std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
bool is_updated = false; bool is_updated = false;
virtual int get_index() override {
return index;
}
virtual bool is_stop() override { virtual bool is_stop() override {
return false; // in stream mode, partial responses are not considered stop return false; // in stream mode, partial responses are not considered stop
} }
@ -365,7 +358,6 @@ struct server_task_result_cmpl_partial : server_task_result {
}; };
struct server_task_result_embd : server_task_result { struct server_task_result_embd : server_task_result {
int index = 0;
std::vector<std::vector<float>> embedding; std::vector<std::vector<float>> embedding;
int32_t n_tokens; int32_t n_tokens;
@ -373,10 +365,6 @@ struct server_task_result_embd : server_task_result {
// response formatting // response formatting
task_response_type res_type = TASK_RESPONSE_TYPE_NONE; task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
virtual int get_index() override {
return index;
}
virtual json to_json() override; virtual json to_json() override;
json to_json_non_oaicompat(); json to_json_non_oaicompat();
@ -385,20 +373,14 @@ struct server_task_result_embd : server_task_result {
}; };
struct server_task_result_rerank : server_task_result { struct server_task_result_rerank : server_task_result {
int index = 0;
float score = -1e6; float score = -1e6;
int32_t n_tokens; int32_t n_tokens;
virtual int get_index() override {
return index;
}
virtual json to_json() override; virtual json to_json() override;
}; };
struct server_task_result_error : server_task_result { struct server_task_result_error : server_task_result {
int index = 0;
error_type err_type = ERROR_TYPE_SERVER; error_type err_type = ERROR_TYPE_SERVER;
std::string err_msg; std::string err_msg;
@ -460,6 +442,17 @@ struct server_task_result_slot_erase : server_task_result {
virtual json to_json() override; virtual json to_json() override;
}; };
struct server_task_result_get_lora : server_task_result {
struct lora {
common_adapter_lora_info info;
std::string alora_invocation_string;
llama_tokens alora_invocation_tokens;
};
std::vector<lora> loras;
virtual json to_json() override;
};
struct server_task_result_apply_lora : server_task_result { struct server_task_result_apply_lora : server_task_result {
virtual json to_json() override; virtual json to_json() override;
}; };

View File

@ -119,7 +119,7 @@ int main(int argc, char ** argv, char ** envp) {
// //
// register API routes // register API routes
server_routes routes(params, ctx_server, [&ctx_http]() { return ctx_http.is_ready.load(); }); server_routes routes(params, ctx_server);
bool is_router_server = params.model.path.empty(); bool is_router_server = params.model.path.empty();
std::optional<server_models_routes> models_routes{}; std::optional<server_models_routes> models_routes{};
@ -252,6 +252,7 @@ int main(int argc, char ** argv, char ** envp) {
return 1; return 1;
} }
routes.update_meta(ctx_server);
ctx_http.is_ready.store(true); ctx_http.is_ready.store(true);
LOG_INF("%s: model loaded\n", __func__); LOG_INF("%s: model loaded\n", __func__);

View File

@ -294,15 +294,14 @@ class SettingsStore {
* This sets up the default values from /props endpoint * This sets up the default values from /props endpoint
*/ */
syncWithServerDefaults(): void { syncWithServerDefaults(): void {
const serverParams = serverStore.defaultParams; const propsDefaults = this.getServerDefaults();
if (!serverParams) {
console.warn('No server parameters available for initialization'); if (Object.keys(propsDefaults).length === 0) {
console.warn('No server defaults available for initialization');
return; return;
} }
const propsDefaults = this.getServerDefaults();
for (const [key, propsValue] of Object.entries(propsDefaults)) { for (const [key, propsValue] of Object.entries(propsDefaults)) {
const currentValue = getConfigValue(this.config, key); const currentValue = getConfigValue(this.config, key);

View File

@ -119,7 +119,7 @@
$effect(() => { $effect(() => {
const serverProps = serverStore.props; const serverProps = serverStore.props;
if (serverProps?.default_generation_settings?.params) { if (serverProps) {
settingsStore.syncWithServerDefaults(); settingsStore.syncWithServerDefaults();
} }
}); });