Merge branch 'ggml-org:master' into power-law-sampler

This commit is contained in:
ddh0 2025-12-22 14:45:08 -06:00 committed by GitHub
commit 6bad4aef77
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
41 changed files with 2284 additions and 607 deletions

View File

@ -688,13 +688,15 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
tar -czvf llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz -C build-apple llama.xcframework
# Zip file is required for Swift Package Manager, which does not support tar.gz for binary targets.
# For more details, see https://developer.apple.com/documentation/xcode/distributing-binary-frameworks-as-swift-packages
zip -r -y llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
openEuler-cann:
@ -863,7 +865,7 @@ jobs:
**macOS/iOS:**
- [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
- [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz)
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
**Linux:**
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)

View File

@ -1078,6 +1078,8 @@ struct common_init_result::impl {
impl() = default;
~impl() = default;
// note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
llama_model_ptr model;
llama_context_ptr context;

View File

@ -141,16 +141,24 @@ class ModelBase:
self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
if self.ftype == gguf.LlamaFileType.GUESSED:
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
_, first_tensor = next(self.get_tensors())
if first_tensor.dtype == torch.float16:
logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_F16
for _, tensor in self.get_tensors():
if tensor.dim() < 2:
continue
if tensor.dtype == torch.bfloat16:
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
logger.info("heuristics detected bfloat16 tensor dtype, setting --outtype bf16")
break
elif tensor.dtype == torch.float16:
self.ftype = gguf.LlamaFileType.MOSTLY_F16
logger.info("heuristics detected float16 tensor dtype, setting --outtype f16")
break
else:
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
self.ftype = gguf.LlamaFileType.MOSTLY_F16
logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16")
self.dequant_model()
@ -10557,8 +10565,8 @@ def parse_args() -> argparse.Namespace:
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="auto",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type",
)
parser.add_argument(
"--bigendian", action="store_true",

View File

@ -2,57 +2,74 @@
#include "common.h"
#include <fstream>
#include <sstream>
#include <string>
// Export usage message (-h) to markdown format
// Automatically update the markdown docs
static void write_table_header(std::ofstream & file) {
file << "| Argument | Explanation |\n";
file << "| -------- | ----------- |\n";
#define HELP_START_MARKER "<!-- HELP_START -->"
#define HELP_END_MARKER "<!-- HELP_END -->"
#define NOTE_MESSAGE "<!-- IMPORTANT: The list below is auto-generated by llama-gen-docs; do NOT modify it manually -->"
struct md_file {
llama_example ex;
std::string fname;
std::string specific_section_header;
};
std::vector<md_file> md_files = {
{LLAMA_EXAMPLE_CLI, "tools/cli/README.md", "CLI-specific params"},
{LLAMA_EXAMPLE_COMPLETION, "tools/completion/README.md", "Completion-specific params"},
{LLAMA_EXAMPLE_SERVER, "tools/server/README.md", "Server-specific params"},
};
static void write_table_header(std::ostringstream & ss) {
ss << "| Argument | Explanation |\n";
ss << "| -------- | ----------- |\n";
}
static void write_table_entry(std::ofstream & file, const common_arg & opt) {
file << "| `";
static void write_table_entry(std::ostringstream & ss, const common_arg & opt) {
ss << "| `";
// args
auto all_args = opt.get_args();
for (const auto & arg : all_args) {
if (arg == all_args.front()) {
file << arg;
if (all_args.size() > 1) file << ", ";
ss << arg;
if (all_args.size() > 1) ss << ", ";
} else {
file << arg << (arg != all_args.back() ? ", " : "");
ss << arg << (arg != all_args.back() ? ", " : "");
}
}
// value hint
if (opt.value_hint) {
std::string md_value_hint(opt.value_hint);
string_replace_all(md_value_hint, "|", "\\|");
file << " " << md_value_hint;
ss << " " << md_value_hint;
}
if (opt.value_hint_2) {
std::string md_value_hint_2(opt.value_hint_2);
string_replace_all(md_value_hint_2, "|", "\\|");
file << " " << md_value_hint_2;
ss << " " << md_value_hint_2;
}
// help text
std::string md_help(opt.help);
md_help = string_strip(md_help);
string_replace_all(md_help, "\n", "<br/>");
string_replace_all(md_help, "|", "\\|");
file << "` | " << md_help << " |\n";
ss << "` | " << md_help << " |\n";
}
static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) {
write_table_header(file);
static void write_table(std::ostringstream & ss, std::vector<common_arg *> & opts) {
write_table_header(ss);
for (const auto & opt : opts) {
write_table_entry(file, *opt);
write_table_entry(ss, *opt);
}
}
static void export_md(std::string fname, llama_example ex, std::string name) {
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
static void write_help(std::ostringstream & ss, const md_file & md) {
common_params params;
auto ctx_arg = common_params_parser_init(params, ex);
auto ctx_arg = common_params_parser_init(params, md.ex);
std::vector<common_arg *> common_options;
std::vector<common_arg *> sparam_options;
@ -68,18 +85,58 @@ static void export_md(std::string fname, llama_example ex, std::string name) {
}
}
file << "**Common params**\n\n";
write_table(file, common_options);
file << "\n\n**Sampling params**\n\n";
write_table(file, sparam_options);
file << "\n\n**" << name << "-specific params**\n\n";
write_table(file, specific_options);
ss << HELP_START_MARKER << "\n\n";
ss << NOTE_MESSAGE << "\n\n";
ss << "### Common params\n\n";
write_table(ss, common_options);
ss << "\n\n### Sampling params\n\n";
write_table(ss, sparam_options);
ss << "\n\n### " << md.specific_section_header << "\n\n";
write_table(ss, specific_options);
ss << "\n" << HELP_END_MARKER;
}
int main(int, char **) {
// TODO: add CLI
export_md("autogen-completion.md", LLAMA_EXAMPLE_COMPLETION, "Tool");
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER, "Server");
for (const auto & md : md_files) {
std::ifstream infile(md.fname);
if (!infile.is_open()) {
fprintf(stderr, "failed to open file '%s' for reading\n", md.fname.c_str());
return 1;
}
std::ostringstream ss;
ss << infile.rdbuf();
infile.close();
std::string content = ss.str();
size_t help_start = content.find(HELP_START_MARKER);
size_t help_end = content.find(HELP_END_MARKER);
if (help_start == std::string::npos || help_end == std::string::npos || help_end <= help_start) {
fprintf(stderr, "failed to find help markers in file '%s'\n", md.fname.c_str());
return 1;
}
std::ostringstream new_help_ss;
write_help(new_help_ss, md);
std::string new_help = new_help_ss.str();
content = content.substr(0, help_start) + new_help + content.substr(help_end + strlen(HELP_END_MARKER));
std::ofstream outfile(md.fname);
if (!outfile.is_open()) {
fprintf(stderr, "failed to open file '%s' for writing\n", md.fname.c_str());
return 1;
}
outfile << content;
outfile.close();
printf("Updated help in '%s'\n", md.fname.c_str());
}
return 0;
}

View File

@ -69,6 +69,10 @@
#define VECTOR_REGISTERS 16
#endif
#if defined(__riscv_v_intrinsic)
#define LMUL 4
#endif
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
namespace {
@ -175,6 +179,46 @@ inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
}
#endif
#if defined(__riscv_zvfh)
template <>
inline vfloat32m1_t madd(vfloat16mf2_t a, vfloat16mf2_t b, vfloat32m1_t c) {
return __riscv_vfwmacc_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
}
inline vfloat32m2_t madd(vfloat16m1_t a, vfloat16m1_t b, vfloat32m2_t c) {
return __riscv_vfwmacc_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
}
inline vfloat32m4_t madd(vfloat16m2_t a, vfloat16m2_t b, vfloat32m4_t c) {
return __riscv_vfwmacc_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
}
inline vfloat32m8_t madd(vfloat16m4_t a, vfloat16m4_t b, vfloat32m8_t c) {
return __riscv_vfwmacc_vv_f32m8(c, a, b, __riscv_vsetvlmax_e32m8());
}
inline vfloat32m1_t madd(vfloat32m1_t a, vfloat32m1_t b, vfloat32m1_t c) {
return __riscv_vfmacc_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
}
inline vfloat32m2_t madd(vfloat32m2_t a, vfloat32m2_t b, vfloat32m2_t c) {
return __riscv_vfmacc_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
}
inline vfloat32m4_t madd(vfloat32m4_t a, vfloat32m4_t b, vfloat32m4_t c) {
return __riscv_vfmacc_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
}
inline vfloat32m8_t madd(vfloat32m8_t a, vfloat32m8_t b, vfloat32m8_t c) {
return __riscv_vfmacc_vv_f32m8(c, a, b, __riscv_vsetvlmax_e32m8());
}
#endif
#if defined(__riscv_zvfbfwma)
inline vfloat32m1_t madd(vbfloat16mf2_t a, vbfloat16mf2_t b, vfloat32m1_t c) {
return __riscv_vfwmaccbf16_vv_f32m1(c, a, b, __riscv_vsetvlmax_e32m1());
}
inline vfloat32m2_t madd(vbfloat16m1_t a, vbfloat16m1_t b, vfloat32m2_t c) {
return __riscv_vfwmaccbf16_vv_f32m2(c, a, b, __riscv_vsetvlmax_e32m2());
}
inline vfloat32m4_t madd(vbfloat16m2_t a, vbfloat16m2_t b, vfloat32m4_t c) {
return __riscv_vfwmaccbf16_vv_f32m4(c, a, b, __riscv_vsetvlmax_e32m4());
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED HORIZONTAL SUM
@ -227,6 +271,25 @@ inline float hsum(__m512 x) {
}
#endif // __AVX512F__
#if defined(__riscv_zvfh)
inline float hsum(vfloat32m1_t x) {
return __riscv_vfmv_f_s_f32m1_f32(
__riscv_vfredusum_vs_f32m1_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m1()));
}
inline float hsum(vfloat32m2_t x) {
return __riscv_vfmv_f_s_f32m1_f32(
__riscv_vfredusum_vs_f32m2_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m2()));
}
inline float hsum(vfloat32m4_t x) {
return __riscv_vfmv_f_s_f32m1_f32(
__riscv_vfredusum_vs_f32m4_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m4()));
}
inline float hsum(vfloat32m8_t x) {
return __riscv_vfmv_f_s_f32m1_f32(
__riscv_vfredusum_vs_f32m8_f32m1(x, __riscv_vfmv_v_f_f32m1(0, 1), __riscv_vsetvlmax_e32m8()));
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED MEMORY LOADING
@ -315,6 +378,88 @@ template <> inline __m256bh load(const float *p) {
}
#endif
#if defined(__riscv_zvfh)
template <> inline vfloat16mf2_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16mf2(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16mf2());
}
template <> inline vfloat16m1_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16m1(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16m1());
}
template <> inline vfloat16m2_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16m2(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16m2());
}
template <> inline vfloat16m4_t load(const ggml_fp16_t *p) {
return __riscv_vle16_v_f16m4(reinterpret_cast<const _Float16 *>(p), __riscv_vsetvlmax_e16m4());
}
template <> inline vfloat32m1_t load(const float *p) {
return __riscv_vle32_v_f32m1(p, __riscv_vsetvlmax_e32m1());
}
template <> inline vfloat32m2_t load(const float *p) {
return __riscv_vle32_v_f32m2(p, __riscv_vsetvlmax_e32m2());
}
template <> inline vfloat32m4_t load(const float *p) {
return __riscv_vle32_v_f32m4(p, __riscv_vsetvlmax_e32m4());
}
template <> inline vfloat32m8_t load(const float *p) {
return __riscv_vle32_v_f32m8(p, __riscv_vsetvlmax_e32m8());
}
#endif
#if defined(__riscv_zvfbfwma)
template <> inline vbfloat16mf2_t load(const ggml_bf16_t *p) {
return __riscv_vle16_v_bf16mf2(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16mf2());
}
template <> inline vbfloat16m1_t load(const ggml_bf16_t *p) {
return __riscv_vle16_v_bf16m1(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16m1());
}
template <> inline vbfloat16m2_t load(const ggml_bf16_t *p) {
return __riscv_vle16_v_bf16m2(reinterpret_cast<const __bf16*>(p), __riscv_vsetvlmax_e16m2());
}
#endif
#if defined(__riscv_zvfh)
template <typename T> T set_zero();
template <> inline vfloat16mf2_t set_zero() {
return __riscv_vfmv_v_f_f16mf2(0, __riscv_vsetvlmax_e16mf2());
}
template <> inline vfloat16m1_t set_zero() {
return __riscv_vfmv_v_f_f16m1(0, __riscv_vsetvlmax_e16m1());
}
template <> inline vfloat16m2_t set_zero() {
return __riscv_vfmv_v_f_f16m2(0, __riscv_vsetvlmax_e16m2());
}
template <> inline vfloat16m4_t set_zero() {
return __riscv_vfmv_v_f_f16m4(0, __riscv_vsetvlmax_e16m4());
}
template <> inline vfloat32m1_t set_zero() {
return __riscv_vfmv_v_f_f32m1(0.0f, __riscv_vsetvlmax_e32m1());
}
template <> inline vfloat32m2_t set_zero() {
return __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
}
template <> inline vfloat32m4_t set_zero() {
return __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
}
template <> inline vfloat32m8_t set_zero() {
return __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
}
#endif
#if defined(__riscv_v_intrinsic)
template <typename T> size_t vlmax() {
if constexpr (std::is_same_v<T, vfloat16mf2_t>) { return __riscv_vsetvlmax_e16mf2(); }
else if constexpr (std::is_same_v<T, vfloat16m1_t>) { return __riscv_vsetvlmax_e16m1(); }
else if constexpr (std::is_same_v<T, vfloat16m2_t>) { return __riscv_vsetvlmax_e16m2(); }
else if constexpr (std::is_same_v<T, vfloat16m4_t>) { return __riscv_vsetvlmax_e16m4(); }
else if constexpr (std::is_same_v<T, vfloat32m1_t>) { return __riscv_vsetvlmax_e32m1(); }
else if constexpr (std::is_same_v<T, vfloat32m2_t>) { return __riscv_vsetvlmax_e32m2(); }
else if constexpr (std::is_same_v<T, vfloat32m4_t>) { return __riscv_vsetvlmax_e32m4(); }
else if constexpr (std::is_same_v<T, vfloat32m8_t>) { return __riscv_vsetvlmax_e32m8(); }
return 0;
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION
@ -488,6 +633,573 @@ class tinyBLAS {
const int64_t ldc;
};
#if defined(__riscv_v_intrinsic)
template <typename D, typename V, typename TA, typename TB, typename TC>
class tinyBLAS_RVV {
public:
tinyBLAS_RVV(const ggml_compute_params * params, int64_t k,
const TA *A, int64_t lda,
const TB *B, int64_t ldb,
TC *C, int64_t ldc)
: params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
}
bool matmul(int64_t m, int64_t n) {
if (k % vlmax<V>() != 0) {
return false;
}
#if LMUL == 1
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 4>(m, n, SIZE_N, 12);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 2>(m, n, SIZE_N, 12);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 1>(m, n, SIZE_N, 12);
return true;
}
#elif LMUL == 2
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 4>(m, n, SIZE_N, 24);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 2>(m, n, SIZE_N, 24);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 1>(m, n, SIZE_N, 24);
return true;
}
#else // LMUL = 4
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<2>(n);
mnpack<2, 2, 8>(m, n, SIZE_N, 36);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<2>(n);
mnpack<2, 2, 4>(m, n, SIZE_N, 36);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<2>(n);
mnpack<2, 2, 2>(m, n, SIZE_N, 36);
return true;
}
#endif
return false;
}
private:
template<int RM, int RN, int BM>
inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
if (SIZE_N == RN) {
return gemm<RM, RN, BM>(m, n, BN);
}
if constexpr (RN > 1) {
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
} else {
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
GGML_ASSERT(false); // we have miss something.
}
}
inline void gemm_bloc_4x6(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
D Cv20 = set_zero<D>();
D Cv21 = set_zero<D>();
D Cv22 = set_zero<D>();
D Cv23 = set_zero<D>();
D Cv30 = set_zero<D>();
D Cv31 = set_zero<D>();
D Cv32 = set_zero<D>();
D Cv33 = set_zero<D>();
D Cv40 = set_zero<D>();
D Cv41 = set_zero<D>();
D Cv42 = set_zero<D>();
D Cv43 = set_zero<D>();
D Cv50 = set_zero<D>();
D Cv51 = set_zero<D>();
D Cv52 = set_zero<D>();
D Cv53 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
V Bv3 = load<V>(B + ldb * (jj + 3) + l);
V Bv4 = load<V>(B + ldb * (jj + 4) + l);
V Bv5 = load<V>(B + ldb * (jj + 5) + l);
V Av0 = load<V>(A + lda * (ii + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv10 = madd(Av0, Bv1, Cv10);
Cv20 = madd(Av0, Bv2, Cv20);
Cv30 = madd(Av0, Bv3, Cv30);
Cv40 = madd(Av0, Bv4, Cv40);
Cv50 = madd(Av0, Bv5, Cv50);
V Av1 = load<V>(A + lda * (ii + 1) + l);
Cv01 = madd(Av1, Bv0, Cv01);
Cv11 = madd(Av1, Bv1, Cv11);
Cv21 = madd(Av1, Bv2, Cv21);
Cv31 = madd(Av1, Bv3, Cv31);
Cv41 = madd(Av1, Bv4, Cv41);
Cv51 = madd(Av1, Bv5, Cv51);
V Av2 = load<V>(A + lda * (ii + 2) + l);
Cv02 = madd(Av2, Bv0, Cv02);
Cv12 = madd(Av2, Bv1, Cv12);
Cv22 = madd(Av2, Bv2, Cv22);
Cv32 = madd(Av2, Bv3, Cv32);
Cv42 = madd(Av2, Bv4, Cv42);
Cv52 = madd(Av2, Bv5, Cv52);
V Av3 = load<V>(A + lda * (ii + 3) + l);
Cv03 = madd(Av3, Bv0, Cv03);
Cv13 = madd(Av3, Bv1, Cv13);
Cv23 = madd(Av3, Bv2, Cv23);
Cv33 = madd(Av3, Bv3, Cv33);
Cv43 = madd(Av3, Bv4, Cv43);
Cv53 = madd(Av3, Bv5, Cv53);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
C[ldc * (jj + 3) + (ii + 0)] = hsum(Cv30);
C[ldc * (jj + 3) + (ii + 1)] = hsum(Cv31);
C[ldc * (jj + 3) + (ii + 2)] = hsum(Cv32);
C[ldc * (jj + 3) + (ii + 3)] = hsum(Cv33);
C[ldc * (jj + 4) + (ii + 0)] = hsum(Cv40);
C[ldc * (jj + 4) + (ii + 1)] = hsum(Cv41);
C[ldc * (jj + 4) + (ii + 2)] = hsum(Cv42);
C[ldc * (jj + 4) + (ii + 3)] = hsum(Cv43);
C[ldc * (jj + 5) + (ii + 0)] = hsum(Cv50);
C[ldc * (jj + 5) + (ii + 1)] = hsum(Cv51);
C[ldc * (jj + 5) + (ii + 2)] = hsum(Cv52);
C[ldc * (jj + 5) + (ii + 3)] = hsum(Cv53);
}
inline void gemm_bloc_4x5(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
D Cv20 = set_zero<D>();
D Cv21 = set_zero<D>();
D Cv22 = set_zero<D>();
D Cv23 = set_zero<D>();
D Cv30 = set_zero<D>();
D Cv31 = set_zero<D>();
D Cv32 = set_zero<D>();
D Cv33 = set_zero<D>();
D Cv40 = set_zero<D>();
D Cv41 = set_zero<D>();
D Cv42 = set_zero<D>();
D Cv43 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
V Bv3 = load<V>(B + ldb * (jj + 3) + l);
V Bv4 = load<V>(B + ldb * (jj + 4) + l);
V Av0 = load<V>(A + lda * (ii + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv10 = madd(Av0, Bv1, Cv10);
Cv20 = madd(Av0, Bv2, Cv20);
Cv30 = madd(Av0, Bv3, Cv30);
Cv40 = madd(Av0, Bv4, Cv40);
V Av1 = load<V>(A + lda * (ii + 1) + l);
Cv01 = madd(Av1, Bv0, Cv01);
Cv11 = madd(Av1, Bv1, Cv11);
Cv21 = madd(Av1, Bv2, Cv21);
Cv31 = madd(Av1, Bv3, Cv31);
Cv41 = madd(Av1, Bv4, Cv41);
V Av2 = load<V>(A + lda * (ii + 2) + l);
Cv02 = madd(Av2, Bv0, Cv02);
Cv12 = madd(Av2, Bv1, Cv12);
Cv22 = madd(Av2, Bv2, Cv22);
Cv32 = madd(Av2, Bv3, Cv32);
Cv42 = madd(Av2, Bv4, Cv42);
V Av3 = load<V>(A + lda * (ii + 3) + l);
Cv03 = madd(Av3, Bv0, Cv03);
Cv13 = madd(Av3, Bv1, Cv13);
Cv23 = madd(Av3, Bv2, Cv23);
Cv33 = madd(Av3, Bv3, Cv33);
Cv43 = madd(Av3, Bv4, Cv43);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
C[ldc * (jj + 3) + (ii + 0)] = hsum(Cv30);
C[ldc * (jj + 3) + (ii + 1)] = hsum(Cv31);
C[ldc * (jj + 3) + (ii + 2)] = hsum(Cv32);
C[ldc * (jj + 3) + (ii + 3)] = hsum(Cv33);
C[ldc * (jj + 4) + (ii + 0)] = hsum(Cv40);
C[ldc * (jj + 4) + (ii + 1)] = hsum(Cv41);
C[ldc * (jj + 4) + (ii + 2)] = hsum(Cv42);
C[ldc * (jj + 4) + (ii + 3)] = hsum(Cv43);
}
inline void gemm_bloc_4x4(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
D Cv20 = set_zero<D>();
D Cv21 = set_zero<D>();
D Cv22 = set_zero<D>();
D Cv23 = set_zero<D>();
D Cv30 = set_zero<D>();
D Cv31 = set_zero<D>();
D Cv32 = set_zero<D>();
D Cv33 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Av2 = load<V>(A + lda * (ii + 2) + l);
V Av3 = load<V>(A + lda * (ii + 3) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
Cv02 = madd(Av2, Bv0, Cv02);
Cv03 = madd(Av3, Bv0, Cv03);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
Cv10 = madd(Av0, Bv1, Cv10);
Cv11 = madd(Av1, Bv1, Cv11);
Cv12 = madd(Av2, Bv1, Cv12);
Cv13 = madd(Av3, Bv1, Cv13);
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
Cv20 = madd(Av0, Bv2, Cv20);
Cv21 = madd(Av1, Bv2, Cv21);
Cv22 = madd(Av2, Bv2, Cv22);
Cv23 = madd(Av3, Bv2, Cv23);
V Bv3 = load<V>(B + ldb * (jj + 3) + l);
Cv30 = madd(Av0, Bv3, Cv30);
Cv31 = madd(Av1, Bv3, Cv31);
Cv32 = madd(Av2, Bv3, Cv32);
Cv33 = madd(Av3, Bv3, Cv33);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
C[ldc * (jj + 3) + (ii + 0)] = hsum(Cv30);
C[ldc * (jj + 3) + (ii + 1)] = hsum(Cv31);
C[ldc * (jj + 3) + (ii + 2)] = hsum(Cv32);
C[ldc * (jj + 3) + (ii + 3)] = hsum(Cv33);
}
inline void gemm_bloc_4x3(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
D Cv20 = set_zero<D>();
D Cv21 = set_zero<D>();
D Cv22 = set_zero<D>();
D Cv23 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Av2 = load<V>(A + lda * (ii + 2) + l);
V Av3 = load<V>(A + lda * (ii + 3) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
Cv02 = madd(Av2, Bv0, Cv02);
Cv03 = madd(Av3, Bv0, Cv03);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
Cv10 = madd(Av0, Bv1, Cv10);
Cv11 = madd(Av1, Bv1, Cv11);
Cv12 = madd(Av2, Bv1, Cv12);
Cv13 = madd(Av3, Bv1, Cv13);
V Bv2 = load<V>(B + ldb * (jj + 2) + l);
Cv20 = madd(Av0, Bv2, Cv20);
Cv21 = madd(Av1, Bv2, Cv21);
Cv22 = madd(Av2, Bv2, Cv22);
Cv23 = madd(Av3, Bv2, Cv23);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
C[ldc * (jj + 2) + (ii + 0)] = hsum(Cv20);
C[ldc * (jj + 2) + (ii + 1)] = hsum(Cv21);
C[ldc * (jj + 2) + (ii + 2)] = hsum(Cv22);
C[ldc * (jj + 2) + (ii + 3)] = hsum(Cv23);
}
inline void gemm_bloc_4x2(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
D Cv12 = set_zero<D>();
D Cv13 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Av2 = load<V>(A + lda * (ii + 2) + l);
V Av3 = load<V>(A + lda * (ii + 3) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
Cv02 = madd(Av2, Bv0, Cv02);
Cv03 = madd(Av3, Bv0, Cv03);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
Cv10 = madd(Av0, Bv1, Cv10);
Cv11 = madd(Av1, Bv1, Cv11);
Cv12 = madd(Av2, Bv1, Cv12);
Cv13 = madd(Av3, Bv1, Cv13);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
C[ldc * (jj + 1) + (ii + 2)] = hsum(Cv12);
C[ldc * (jj + 1) + (ii + 3)] = hsum(Cv13);
}
inline void gemm_bloc_4x1(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv02 = set_zero<D>();
D Cv03 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Av2 = load<V>(A + lda * (ii + 2) + l);
V Av3 = load<V>(A + lda * (ii + 3) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
Cv02 = madd(Av2, Bv0, Cv02);
Cv03 = madd(Av3, Bv0, Cv03);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 0) + (ii + 2)] = hsum(Cv02);
C[ldc * (jj + 0) + (ii + 3)] = hsum(Cv03);
}
inline void gemm_bloc_2x2(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
D Cv10 = set_zero<D>();
D Cv11 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
V Bv1 = load<V>(B + ldb * (jj + 1) + l);
Cv10 = madd(Av0, Bv1, Cv10);
Cv11 = madd(Av1, Bv1, Cv11);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
C[ldc * (jj + 1) + (ii + 0)] = hsum(Cv10);
C[ldc * (jj + 1) + (ii + 1)] = hsum(Cv11);
}
inline void gemm_bloc_2x1(int64_t ii, int64_t jj) {
size_t vl = vlmax<V>();
D Cv00 = set_zero<D>();
D Cv01 = set_zero<D>();
for (int64_t l = 0; l < k; l += vl) {
V Av0 = load<V>(A + lda * (ii + 0) + l);
V Av1 = load<V>(A + lda * (ii + 1) + l);
V Bv0 = load<V>(B + ldb * (jj + 0) + l);
Cv00 = madd(Av0, Bv0, Cv00);
Cv01 = madd(Av1, Bv0, Cv01);
}
C[ldc * (jj + 0) + (ii + 0)] = hsum(Cv00);
C[ldc * (jj + 0) + (ii + 1)] = hsum(Cv01);
}
template <int RM, int RN>
inline void gemm_bloc(int64_t ii, int64_t jj) {
if constexpr (RM == 4) {
if constexpr (RN == 6) { return gemm_bloc_4x6(ii, jj); }
if constexpr (RN == 5) { return gemm_bloc_4x5(ii, jj); }
if constexpr (RN == 4) { return gemm_bloc_4x4(ii, jj); }
if constexpr (RN == 3) { return gemm_bloc_4x3(ii, jj); }
if constexpr (RN == 2) { return gemm_bloc_4x2(ii, jj); }
if constexpr (RN == 1) { return gemm_bloc_4x1(ii, jj); }
} else if constexpr (RM == 2) {
if constexpr (RN == 2) { return gemm_bloc_2x2(ii, jj); }
if constexpr (RN == 1) { return gemm_bloc_2x1(ii, jj); }
}
}
template <int RM, int RN, int BM>
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
GGML_ASSERT(m % (RM * BM) == 0);
const int64_t ytiles = m / (RM * BM);
const int64_t xtiles = (n + RN -1) / RN;
const int64_t jj_RN = (xtiles - (xtiles * RN - n));
// "round" bloc_size to "nearest" BN
const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
const int64_t nb_job = ytiles * NB_BN;
if (params->ith == 0) {
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
ggml_threadpool_chunk_set(params->threadpool, params->nth);
}
ggml_barrier(params->threadpool);
int64_t job = params->ith;
while (job < nb_job) {
const int64_t ii = (job % ytiles) * RM * BM;
const int64_t jb = job / ytiles;
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
for (int64_t bi = 0; bi < BM * RM; bi += RM) {
int64_t jj = jj0;
for (; jj < jj1; jj += RN) {
gemm_bloc<RM, RN>(ii + bi, jj);
}
if constexpr (RN > 1) {
for (; jj < jj2; jj += RN - 1) {
gemm_bloc<RM, RN-1>(ii + bi, jj);
}
}
GGML_ASSERT(jj == jj2);
}
job = ggml_threadpool_chunk_add(params->threadpool, 1);
}
ggml_barrier(params->threadpool);
return;
}
const ggml_compute_params * params;
const TA *const A;
const TB *const B;
TC *const C;
const int64_t k;
const int64_t lda;
const int64_t ldb;
const int64_t ldc;
};
#endif
//////////////////////////////////////////////////////////////////////////////////////////
// QUANT ZERO MATRIX MULTIPLICATION
@ -2657,6 +3369,24 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
params->ith, params->nth};
tb.matmul(m, n);
return true;
#elif defined(__riscv_zvfh)
#if LMUL == 1
tinyBLAS_RVV<vfloat32m1_t, vfloat32m1_t, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc};
#elif LMUL == 2
tinyBLAS_RVV<vfloat32m2_t, vfloat32m2_t, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc};
#else // LMUL = 4
tinyBLAS_RVV<vfloat32m4_t, vfloat32m4_t, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc};
#endif
return tb.matmul(m, n);
#else
return false;
#endif
@ -2699,6 +3429,24 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
tb.matmul(m, n);
return true;
}
#elif defined(__riscv_zvfbfwma)
#if LMUL == 1
tinyBLAS_RVV<vfloat32m1_t, vbfloat16mf2_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
k, (const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
#elif LMUL == 2
tinyBLAS_RVV<vfloat32m2_t, vbfloat16m1_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
k, (const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
#else // LMUL = 4
tinyBLAS_RVV<vfloat32m4_t, vbfloat16m2_t, ggml_bf16_t, ggml_bf16_t, float> tb{ params,
k, (const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
#endif
return tb.matmul(m, n);
#endif
return false;
}
@ -2748,6 +3496,26 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__riscv_zvfh)
if (Btype == GGML_TYPE_F16) {
#if LMUL == 1
tinyBLAS_RVV<vfloat32m1_t, vfloat16mf2_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
#elif LMUL == 2
tinyBLAS_RVV<vfloat32m2_t, vfloat16m1_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
#else // LMUL = 4
tinyBLAS_RVV<vfloat32m4_t, vfloat16m2_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
#endif
return tb.matmul(m, n);
}
#endif
return false;
}

View File

@ -2668,7 +2668,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
req.op = HTP_OP_UNARY_SILU;
supported = true;
}
else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU){
else if (ggml_get_unary_op(dst) == GGML_UNARY_OP_GELU) {
req.op = HTP_OP_UNARY_GELU;
supported = true;
}

View File

@ -263,7 +263,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
struct htp_spad * dst_spad,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread) {
uint32_t src0_nrows_per_thread,
dma_queue * dma_queue) {
htp_act_preamble2;
uint64_t t1, t2;
@ -271,6 +272,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
const size_t src0_row_size = nb01;
const size_t dst_row_size = nb1;
const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);
const uint32_t src0_nrows = ne01 * ne02 * ne03;
@ -282,60 +285,81 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
return;
}
int is_aligned = 1;
int opt_path = 0;
if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
is_aligned = 0;
FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
}
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
opt_path = 1;
const uint8_t * data_src0 = (const uint8_t *) src0->data;
uint8_t * data_dst = (uint8_t *) dst->data;
uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
// While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0
size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
// In gelu = x*sigmoid(x*1.702)
const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
if (BLOCK == 0) {
FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
src0_spad->size_per_thread, src0_row_size_aligned);
return;
}
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
uint8_t * restrict data_dst = (uint8_t *) dst->data;
// See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size);
// Dummy DMA transation for sequencing (interleaving dst,src,dst,...)
dma_queue_push_vtcm_to_ddr(dma_queue,
dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
dst_row_size, dst_row_size_aligned, 0);
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
src0_row_size_aligned, src0_row_size, block_size);
}
const int BLOCK = 8;
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
const uint32_t block_end = MIN(ir + BLOCK, src0_end_row);
const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
// Prefetch next block
if (block_end < src0_end_row) {
const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size));
htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size);
}
float* dst_spad = (float *) dma_queue_pop(dma_queue).src;
float* src0_spad = (float *) dma_queue_pop(dma_queue).dst;
// Process rows in current block
for (uint32_t ib = ir; ib < block_end; ib++) {
const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size));
float * restrict dst = (float *) (data_dst + (ib * dst_row_size));
for (uint32_t ib = 0; ib < block_size; ib++) {
const float* src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
// gelu = x * sigmoid(1.702 * x) // current implementation
if (1 == opt_path) {
hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0);
hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
} else {
hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
}
hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, (float) 1.702, (uint8_t *) dst_spad_ptr, ne0);
hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
}
dma_queue_push_vtcm_to_ddr(dma_queue,
dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
dst_row_size, dst_row_size_aligned, block_size);
// prefetch N+2 loop iteration if any
const uint32_t pref_block = (ir + BLOCK * 2);
if (pref_block < src0_end_row) {
const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
dma_queue_push_ddr_to_vtcm(dma_queue,
dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
src0_row_size_aligned, src0_row_size, pref_block_size);
}
}
dma_queue_flush(dma_queue);
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02,
FARF(HIGH, "gelu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
octx->src0_nrows_per_thread);
octx->src0_nrows_per_thread, octx->ctx->dma[i]);
}
@ -468,21 +492,45 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
const uint32_t n_threads = octx->n_threads;
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
const size_t src0_row_size = src0->nb[1];
const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : src0->nb[1];
const size_t dst_row_size = dst->nb[1];
size_t src0_row_size = src0->nb[1];
size_t src1_row_size = src1->nb[1]; // zero bytes if src1 is not used
size_t dst_row_size = dst->nb[1];
const bool src1_valid = src1->ne[0];
if (!src1_valid) {
src1_row_size = src0_row_size;
}
const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);
// VTCM scratchpads for all tensors
// N rows per thread, padded to HVX vector size
octx->dst_spad.size = htp_round_up(dst_row_size, 128) * octx->n_threads;
octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads;
octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads;
size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
size_t spad_size_per_row = (src0_row_size_aligned + src1_row_size_aligned) + dst_row_size_aligned;
size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads* spad_size_per_row);
// Make sure the reserved vtcm size is sufficient
if(vtcm_row_per_thread ==0){
FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size,
spad_size_per_row * n_threads);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread;
octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread;
octx->dst_spad.size_per_thread = dst_row_size_aligned * vtcm_row_per_thread;
octx->dst_spad.size = n_threads* octx->dst_spad.size_per_thread;
octx->src0_spad.size = n_threads* octx->src0_spad.size_per_thread;
octx->src1_spad.size = n_threads* octx->src1_spad.size_per_thread;
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
if (src1->ne[0]) {
FARF(HIGH,
"%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
FARF(HIGH, "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
octx->dst_spad.size);
@ -492,20 +540,8 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
}
// Make sure the reserved vtcm size is sufficient
if (octx->ctx->vtcm_size < spad_size) {
FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
spad_size);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows);
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs);
}

View File

@ -34,12 +34,12 @@ dma_queue * dma_queue_create(size_t capacity) {
q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t));
memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t));
q->dst = (void **) memalign(4, capacity * sizeof(void *));
memset(q->dst, 0, capacity * sizeof(void *));
q->dptr = (dma_ptr *) memalign(4, capacity * sizeof(dma_ptr));
memset(q->dptr, 0, capacity * sizeof(dma_ptr));
q->tail = &q->desc[capacity - 1];
if (!q->desc && !q->dst) {
if (!q->desc && !q->dptr) {
FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__);
return NULL;
}
@ -54,16 +54,10 @@ void dma_queue_delete(dma_queue * q) {
return;
}
free(q->desc);
free(q->dst);
free(q->dptr);
free(q);
}
void dma_queue_flush(dma_queue * q) {
while (1) {
uint32_t s = dmwait() & 0x3;
if (s == HEXAGON_UDMA_DM0_STATUS_IDLE) {
break;
}
}
q->tail = NULL;
while (dma_queue_pop(q).dst != NULL) ;
}

View File

@ -11,10 +11,15 @@
extern "C" {
#endif
typedef struct {
void *dst;
const void *src;
} dma_ptr;
typedef struct {
hexagon_udma_descriptor_type1_t * desc; // descriptor pointers
hexagon_udma_descriptor_type1_t * tail; // tail pointer
void ** dst; // dst pointers
dma_ptr * dptr; // dst/src pointers
uint32_t push_idx;
uint32_t pop_idx;
uint32_t capacity;
@ -49,13 +54,20 @@ static inline unsigned int dmwait(void) {
return ret;
}
static inline bool dma_queue_push(dma_queue * q,
void * dst,
const void * src,
size_t dst_row_size,
size_t src_row_size,
size_t nrows) {
static inline dma_ptr dma_make_ptr(void *dst, const void *src)
{
dma_ptr p = { dst, src };
return p;
}
static inline bool dma_queue_push(dma_queue * q,
dma_ptr dptr,
size_t dst_row_size,
size_t src_row_size,
size_t width, // width in bytes. number of bytes to transfer per row
size_t nrows) {
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
FARF(ERROR, "dma-push: queue full\n");
return false;
}
@ -75,18 +87,18 @@ static inline bool dma_queue_push(dma_queue * q,
#endif
desc->order = 0;
desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
desc->src = (void *) src;
desc->dst = (void *) dst;
desc->src = (void *) dptr.src;
desc->dst = (void *) dptr.dst;
desc->allocation = 0;
desc->padding = 0;
desc->roiwidth = src_row_size;
desc->roiwidth = width;
desc->roiheight = nrows;
desc->srcstride = src_row_size;
desc->dststride = dst_row_size;
desc->srcwidthoffset = 0;
desc->dstwidthoffset = 0;
q->dst[q->push_idx] = dst;
q->dptr[q->push_idx] = dptr;
dmlink(q->tail, desc);
q->tail = desc;
@ -96,9 +108,28 @@ static inline bool dma_queue_push(dma_queue * q,
return true;
}
static inline uint8_t * dma_queue_pop(dma_queue * q) {
static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q,
dma_ptr dptr,
size_t dst_row_size,
size_t src_row_size,
size_t nrows) {
return dma_queue_push(q, dptr, dst_row_size, src_row_size, src_row_size, nrows);
}
static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q,
dma_ptr dptr,
size_t dst_row_size,
size_t src_row_size,
size_t nrows) {
return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
}
static inline dma_ptr dma_queue_pop(dma_queue * q) {
dma_ptr dptr = { NULL };
if (q->push_idx == q->pop_idx) {
return NULL;
return dptr;
}
hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx];
@ -112,11 +143,11 @@ static inline uint8_t * dma_queue_pop(dma_queue * q) {
// FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
}
uint8_t * dst = (uint8_t *) q->dst[q->pop_idx];
dptr = q->dptr[q->pop_idx];
// FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst);
q->pop_idx = (q->pop_idx + 1) & q->idx_mask;
return dst;
return dptr;
}
#ifdef __cplusplus

View File

@ -980,8 +980,6 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
int step_of_1 = num_elems >> 5;
int remaining = num_elems - step_of_1 * VLEN_FP32;
assert(remaining == 0);
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
@ -996,8 +994,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
for (int i = 0; i < step_of_1; i++) {
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
}
}
if (remaining > 0) {
const float * srcf = ((const float *) src) + step_of_1* VLEN_FP32;
float * dstf = (float *) dst + step_of_1*VLEN_FP32;
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector out = hvx_vec_fast_sigmoid_fp32_guard(in, one, max_exp, min_exp);
hvx_vec_store_u((void *) dstf, remaining * SIZEOF_FP32, out);
}
}
static inline void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems){
int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector

View File

@ -299,7 +299,8 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
ctx->n_threads = n_hvx;
for (int i = 0; i < ctx->n_threads; i++) {
ctx->dma[i] = dma_queue_create(HTP_SPAD_SRC0_NROWS * 2);
// see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
ctx->dma[i] = dma_queue_create(64);
}
// init worker pool

View File

@ -1127,13 +1127,13 @@ static void matmul(struct htp_matmul_type * mt,
if (is0 >= HTP_SPAD_SRC0_NROWS) {
break;
}
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
// Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
#pragma unroll(2)
for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
@ -1146,7 +1146,7 @@ static void matmul(struct htp_matmul_type * mt,
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) {
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
}
@ -1155,9 +1155,9 @@ static void matmul(struct htp_matmul_type * mt,
if (src0_end_row != src0_end_row_x2) {
uint32_t ir0 = src0_end_row_x2;
const int is0 = (ir0 - src0_start_row);
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1);
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
#pragma unroll(2)
for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
@ -1229,20 +1229,20 @@ static void matvec(struct htp_matmul_type * mt,
if (is0 >= HTP_SPAD_SRC0_NROWS) {
break;
}
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
// Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot_rx2(ne00, &tmp[ir0 - src0_start_row], ss0, src0_row_size_padded, src1_col);
// Prefetch next (n + spad_nrows) row
const uint32_t pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const uint32_t is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) {
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
}
@ -1251,9 +1251,9 @@ static void matvec(struct htp_matmul_type * mt,
if (src0_end_row != src0_end_row_x2) {
const uint32_t ir0 = src0_end_row_x2;
const uint32_t is0 = (ir0 - src0_start_row);
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1);
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
}
@ -1343,13 +1343,13 @@ static void matmul_id(struct htp_matmul_type * mt,
if (is0 >= HTP_SPAD_SRC0_NROWS) {
break;
}
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
// Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
for (uint32_t cid = 0; cid < cne1; ++cid) {
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
@ -1368,7 +1368,7 @@ static void matmul_id(struct htp_matmul_type * mt,
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) {
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
}
@ -1377,9 +1377,9 @@ static void matmul_id(struct htp_matmul_type * mt,
if (src0_end_row != src0_end_row_x2) {
uint32_t ir0 = src0_end_row_x2;
const uint32_t is0 = (ir0 - src0_start_row);
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1);
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
for (uint32_t cid = 0; cid < cne1; ++cid) {
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
@ -1467,20 +1467,20 @@ static void matvec_id(struct htp_matmul_type * mt,
if (is0 >= HTP_SPAD_SRC0_NROWS) {
break;
}
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
// Process src0 rows
for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot_rx2(ne00, &dst_row[ir0], ss0, src0_row_size_padded, src1_col);
// Prefetch next (n + spad_nrows) row
const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
if (pr0 < src0_end_row_x2) {
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
src0_row_size_padded, src0_row_size, 2);
}
}
@ -1489,9 +1489,9 @@ static void matvec_id(struct htp_matmul_type * mt,
if (src0_end_row != src0_end_row_x2) {
uint32_t ir0 = src0_end_row_x2;
const uint32_t is0 = (ir0 - src0_start_row);
dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
src0_row_size_padded, src0_row_size, 1);
const uint8_t * ss0 = dma_queue_pop(dma_queue);
const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col);
}
}

View File

@ -494,6 +494,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
cl_kernel kernel_convert_block_q4_0_noshuffle;
cl_kernel kernel_restore_block_q4_0_noshuffle;
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
cl_kernel kernel_mul_mv_q6_K_f32;
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
@ -634,6 +635,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_transpose_32;
cl_kernel kernel_transpose_32_16;
cl_kernel kernel_transpose_16;
cl_kernel kernel_transpose_16_buf;
cl_kernel kernel_transpose_16_4x1;
cl_mem A_s_d_max; // max scale buffer size for transpose
@ -806,6 +808,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
@ -2004,7 +2007,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_buf", &err), err));
CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
GGML_LOG_CONT(".");
}
@ -3933,6 +3937,91 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
if (tensor->type == GGML_TYPE_Q4_0) {
ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (use_adreno_kernels(backend_ctx, tensor)) {
cl_int err;
cl_kernel kernel;
cl_int M = tensor->ne[1]; // ne01
cl_int K = tensor->ne[0]; // ne00
GGML_ASSERT(K % 32 == 0);
GGML_ASSERT(M % 4 == 0);
size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
cl_mem buf_trans_q;
cl_mem buf_trans_d;
CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE,
size_q, NULL, &err), err));
CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE,
size_d, NULL, &err), err));
kernel = backend_ctx->kernel_transpose_16_buf;
// transpose q back
cl_int stride_k_q = K/4;
size_t local_size_q[3] = {64, 1, 1};
size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1};
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_size_q, local_size_q, 0, NULL, NULL));
// transpose scales back
cl_int stride_k_d = K/32;
size_t local_size_d[3] = {64, 1, 1};
size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1};
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_size_d, local_size_d, 0, NULL, NULL));
// unpack
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
ggml_nbytes(tensor), NULL, &err);
CL_CHECK(err);
cl_uchar mask_0F = 0x0F;
cl_uchar mask_F0 = 0xF0;
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
size_t local_work_size[] = {1, 1, 1};
kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_work_size, local_work_size, 0, NULL, NULL));
// read back to host
CL_CHECK(clEnqueueReadBuffer(
queue, data_device, CL_TRUE, offset,
size, data, 0, NULL, NULL));
CL_CHECK(clReleaseMemObject(data_device));
CL_CHECK(clReleaseMemObject(buf_trans_q));
CL_CHECK(clReleaseMemObject(buf_trans_d));
return;
}
#endif
cl_int err;
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
ggml_nbytes(tensor), NULL, &err);

View File

@ -117,6 +117,27 @@ kernel void kernel_convert_block_q4_0_noshuffle(
}
}
kernel void kernel_restore_block_q4_0_noshuffle(
global uchar * src_q,
global half * src_d,
global struct block_q4_0 * dst,
uchar mask_0F,
uchar mask_F0
) {
global struct block_q4_0 * b = (global struct block_q4_0 *) dst + get_global_id(0);
global uchar * q = (global uchar *) src_q + QK4_0/2*get_global_id(0);
global half * d = (global half *) src_d + get_global_id(0);
b->d = *d;
for (int i = 0; i < QK4_0/4; ++i) {
uchar x0 = q[i + 0 ] ;
uchar x1 = q[i + QK4_0/4];
b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4));
b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0));
}
}
//------------------------------------------------------------------------------
// block_mxfp4
//------------------------------------------------------------------------------

View File

@ -44,6 +44,19 @@ kernel void kernel_transpose_16_4x1(
write_imageh(output, i * rows + j, (half4)(temp0, temp1, temp2, temp3));
}
// Transpose treating each element as 16-bit using buffer
kernel void kernel_transpose_16_buf(
global const ushort * input,
global ushort * output,
const int ldi,
const int ldo
) {
const int x = get_global_id(0);
const int y = get_global_id(1);
output[x*ldo + y] = input[y*ldi + x];
}
// 32-bit transpose, loading/storing a 4x4 tile of elements
kernel void kernel_transpose_32(
__read_only image1d_buffer_t input,

View File

@ -731,7 +731,7 @@ struct vk_device_struct {
vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16;
vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16;
vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16;
vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16, pipeline_rope_multi_f32_f16;
vk_pipeline pipeline_rope_vision_f32, pipeline_rope_vision_f16;
vk_pipeline pipeline_argsort_f32[num_argsort_pipelines];
vk_pipeline pipeline_argsort_large_f32[num_argsort_pipelines];
@ -856,6 +856,15 @@ struct vk_subbuffer {
}
};
// vk_event is used for the event-related backend interfaces. It uses 'event' for
// event_wait and 'fence' for event_synchronize. Polling on an event for
// event_synchronize wouldn't be sufficient to wait for command buffers to complete,
// and would lead to validation errors.
struct vk_event {
vk::Event event;
vk::Fence fence;
};
struct vk_semaphore {
vk::Semaphore s;
uint64_t value;
@ -2544,6 +2553,15 @@ static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subct
);
}
static void ggml_vk_set_event(vk_context& ctx, vk::Event& event) {
VK_LOG_DEBUG("ggml_vk_set_event()");
ctx->s->buffer.setEvent(
event,
ctx->p->q->stage_flags
);
}
static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events) {
VK_LOG_DEBUG("ggml_vk_wait_events()");
if (events.empty()) {
@ -4059,6 +4077,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_rte_len, rope_norm_f32_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_rte_len, rope_neox_f32_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32_f16, "rope_multi_f32_f16", rope_multi_f32_f16_rte_len, rope_multi_f32_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
} else {
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
@ -4067,6 +4086,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_len, rope_norm_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_len, rope_neox_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32_f16, "rope_multi_f32_f16", rope_multi_f32_f16_len, rope_multi_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
}
for (uint32_t i = 0; i < num_argsort_pipelines; ++i) {
@ -6089,13 +6109,8 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
}
}
static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
// Buffer is already mapped
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
GGML_ABORT("fatal error");
}
// Check if src is pinned memory
vk_buffer buf = nullptr;
size_t buf_offset = 0;
@ -6120,12 +6135,13 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
ggml_vk_sync_buffers(nullptr, subctx);
subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
return;
return true;
}
VK_LOG_DEBUG("STAGING");
if (!sync_staging) {
GGML_ABORT("Asynchronous write to non-pinned memory not supported");
// copy was not handled caller needs to fall back
return false;
}
// Staging buffer required
@ -6149,9 +6165,10 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
deferred_memcpy((uint8_t *)staging_buffer->ptr + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys);
}
}
return true;
}
static void ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
static bool ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1, sync_staging);
}
@ -6170,7 +6187,8 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dst->device, subctx);
ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
bool ret = ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
GGML_ASSERT(ret);
ggml_vk_ctx_end(subctx);
for (auto& cpy : subctx->in_memcpys) {
@ -8664,6 +8682,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_rope_multi_f32;
}
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
return ctx->device->pipeline_rope_multi_f32_f16;
}
if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
return ctx->device->pipeline_rope_multi_f16;
}
@ -12671,7 +12692,23 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
vk_buffer buf = buf_ctx->dev_buffer;
ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
auto dst_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset;
bool ret = ggml_vk_buffer_write_async(transfer_ctx, buf, dst_offset, data, size);
if (!ret) {
ggml_vk_ensure_sync_staging_buffer(ctx, size);
ggml_vk_sync_buffers(nullptr, transfer_ctx);
vk::BufferCopy buffer_cpy;
buffer_cpy.srcOffset = 0;
buffer_cpy.dstOffset = dst_offset;
buffer_cpy.size = size;
transfer_ctx->s->buffer.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
deferred_memcpy(ctx->sync_staging->ptr, data, size, &transfer_ctx->in_memcpys);
ggml_vk_synchronize(ctx);
}
}
static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@ -13044,9 +13081,9 @@ static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const
return false;
}
// Only norm/neox shaders have the fusion code
// Only norm/neox/mrope shaders have the fusion code
const int mode = ((const int32_t *) rope->op_params)[2];
if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_MROPE) {
return false;
}
@ -13678,11 +13715,58 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
}
}
static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
vk_event *vkev = (vk_event *)event->context;
vk_context transfer_ctx;
if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->transfer_ctx = transfer_ctx;
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
} else {
transfer_ctx = ctx->transfer_ctx.lock();
}
// the backend interface doesn't have an explicit reset, so reset it here
// before we record the command to set it
ctx->device->device.resetEvent(vkev->event);
ctx->device->device.resetFences({ vkev->fence });
ggml_vk_set_event(transfer_ctx, vkev->event);
ggml_vk_ctx_end(transfer_ctx);
ggml_vk_submit(transfer_ctx, {vkev->fence});
ctx->submit_pending = true;
ctx->transfer_ctx.reset();
}
static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
vk_event *vkev = (vk_event *)event->context;
vk_context transfer_ctx;
if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->transfer_ctx = transfer_ctx;
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
} else {
transfer_ctx = ctx->transfer_ctx.lock();
}
ggml_vk_wait_events(transfer_ctx, {vkev->event});
}
// TODO: enable async and synchronize
static ggml_backend_i ggml_backend_vk_interface = {
/* .get_name = */ ggml_backend_vk_name,
/* .free = */ ggml_backend_vk_free,
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
/* .set_tensor_async = */ ggml_backend_vk_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_vk_get_tensor_async,
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
/* .synchronize = */ ggml_backend_vk_synchronize,
@ -13691,8 +13775,8 @@ static ggml_backend_i ggml_backend_vk_interface = {
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_vk_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .event_record = */ ggml_backend_vk_event_record,
/* .event_wait = */ ggml_backend_vk_event_wait,
/* .graph_optimize = */ ggml_vk_graph_optimize,
};
@ -13867,10 +13951,10 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* .async = */ false,
/* .async = */ true,
/* .host_buffer = */ true,
/* .buffer_from_host_ptr = */ false,
/* .events = */ false,
/* .events = */ true,
};
}
@ -14402,6 +14486,46 @@ static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml
UNUSED(dev);
}
static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
auto device = ggml_vk_get_device(ctx->device);
vk_event *vkev = new vk_event;
if (!vkev) {
return nullptr;
}
// The event/fence is expected to initially be in the signaled state.
vkev->event = device->device.createEvent({});
vkev->fence = device->device.createFence({vk::FenceCreateFlagBits::eSignaled});
device->device.setEvent(vkev->event);
return new ggml_backend_event {
/* .device = */ dev,
/* .context = */ vkev,
};
}
static void ggml_backend_vk_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
auto device = ggml_vk_get_device(ctx->device);
vk_event *vkev = (vk_event *)event->context;
device->device.destroyFence(vkev->fence);
device->device.destroyEvent(vkev->event);
delete vkev;
delete event;
}
static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
auto device = ggml_vk_get_device(ctx->device);
vk_event *vkev = (vk_event *)event->context;
VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize");
}
static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
/* .get_name = */ ggml_backend_vk_device_get_name,
/* .get_description = */ ggml_backend_vk_device_get_description,
@ -14415,9 +14539,9 @@ static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
/* .supports_op = */ ggml_backend_vk_device_supports_op,
/* .supports_buft = */ ggml_backend_vk_device_supports_buft,
/* .offload_op = */ ggml_backend_vk_device_offload_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
/* .event_new = */ ggml_backend_vk_device_event_new,
/* .event_free = */ ggml_backend_vk_device_event_free,
/* .event_synchronize = */ ggml_backend_vk_device_event_synchronize,
};
static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) {

View File

@ -49,8 +49,8 @@ void rope_norm(const uint i0, const uint i1, rope_params p) {
uint idst = i1*ne0 + i0;
const uint ix = rope_a_coord(i0, i01, i02, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
// Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0;
idst += rope_data_i[i02].x * p.set_rows_stride;
@ -91,7 +91,7 @@ void rope_neox(const uint i0, const uint i1, rope_params p) {
uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS..
// Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0/2;
@ -132,9 +132,16 @@ void rope_multi(const uint i0, const uint i1, rope_params p) {
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
const uint idst = i1*ne0 + i0/2;
uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0/2;
idst += rope_data_i[i02].x * p.set_rows_stride;
}
if (i0 >= p.n_dims) {
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
rope_data_d[idst + i0/2 + 1] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 1]);

View File

@ -927,6 +927,8 @@ void process_shaders() {
string_to_spv("rope_multi_f32", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
string_to_spv("rope_multi_f16", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});
string_to_spv("rope_multi_f16_rte", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
string_to_spv("rope_multi_f32_f16", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}});
string_to_spv("rope_multi_f32_f16_rte", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
string_to_spv("rope_vision_f32", "rope_vision.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
string_to_spv("rope_vision_f16", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});

View File

@ -110,7 +110,6 @@ class SafetensorRemote:
"""
BASE_DOMAIN = "https://huggingface.co"
ALIGNMENT = 8 # bytes
@classmethod
def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]:
@ -204,9 +203,6 @@ class SafetensorRemote:
# Calculate the data start offset
data_start_offset = 8 + metadata_length
alignment = SafetensorRemote.ALIGNMENT
if data_start_offset % alignment != 0:
data_start_offset += alignment - (data_start_offset % alignment)
# Check if we have enough data to read the metadata
if len(raw_data) < 8 + metadata_length:
@ -298,7 +294,6 @@ class SafetensorsLocal:
Custom parsing gives a bit more control over the memory usage.
The official safetensors library doesn't expose file ranges.
"""
ALIGNMENT = 8 # bytes
tensors: dict[str, LocalTensor]
@ -316,9 +311,6 @@ class SafetensorsLocal:
raise ValueError(f"Failed to parse safetensors metadata as JSON: {e}")
data_start_offset = f.tell()
alignment = self.ALIGNMENT
if data_start_offset % alignment != 0:
data_start_offset += alignment - (data_start_offset % alignment)
tensors: dict[str, LocalTensor] = {}
for name, meta in metadata.items():

View File

@ -459,23 +459,22 @@ llama_context::llama_context(
}
llama_context::~llama_context() {
// FIXME this currently results in a use-after-free bug if the model is freed before the context
// if (!model.hparams.no_alloc) {
// for (size_t i = 0; i < backend_ptrs.size(); ++i) {
// ggml_backend_t backend = backend_ptrs[i];
// ggml_backend_buffer_type_t buft = backend_buft[i];
if (!model.hparams.no_alloc) {
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
ggml_backend_t backend = backend_ptrs[i];
ggml_backend_buffer_type_t buft = backend_buft[i];
// const size_t size_exp = backend_buf_exp_size[i];
// const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
// if (size_exp == size_act) {
// LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
// } else {
// LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
// }
// }
// }
const size_t size_exp = backend_buf_exp_size[i];
const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
if (size_exp == size_act) {
LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
__func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
} else {
LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
__func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
}
}
}
ggml_opt_free(opt_ctx);
}

View File

@ -2329,11 +2329,13 @@ struct test_set_rows : public test_case {
struct test_rope_set_rows : public test_case {
const ggml_type type;
const ggml_type type_idx;
const std::array<int64_t, 4> ne;
const std::array<int64_t, 4> ne_a;
int mode;
const int n_ctx{512};
const int n_dims{128};
std::string vars() override {
return VARS_TO_STR4(type, type_idx, ne, mode);
return VARS_TO_STR4(type, type_idx, ne_a, mode);
}
std::string op_desc(ggml_tensor * t) override {
@ -2345,24 +2347,51 @@ struct test_rope_set_rows : public test_case {
test_rope_set_rows(ggml_type type,
ggml_type type_idx,
std::array<int64_t, 4> ne,
std::array<int64_t, 4> ne_a,
int mode)
: type(type), type_idx(type_idx), ne(ne), mode(mode) {}
: type(type), type_idx(type_idx), ne_a(ne_a), mode(mode) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * src = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne[0], ne[1], ne[2], 1);
ggml_set_name(src, "src");
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne_a[0], ne_a[1], ne_a[2], 1);
ggml_set_name(a, "a");
ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
ggml_tensor * rope = ggml_rope(ctx, src, pos, ne[0], mode);
ggml_tensor * pos;
if (is_mrope || is_vision) {
pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2] * 4);
} else {
pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
}
ggml_set_name(pos, "pos");
ggml_tensor * view = ggml_view_2d(ctx, rope, ne[0] * ne[1], ne[2], rope->nb[2], 0);
float fs = 1.4245f;
float ef = 0.7465f;
float af = 1.4245f;
ggml_tensor * freq = nullptr;
ggml_tensor * dst = ggml_new_tensor_4d(ctx, type, ne[0] * ne[1], ne[2] * ne[3], 1, 1);
ggml_tensor * rope = nullptr;
if (is_mrope) {
if (is_vision) {
GGML_ASSERT(n_dims/4 > 0);
int rope_sections[4] = {n_dims/4, n_dims/4, 0, 0}; // Vision-RoPE only use first two dimension for image (x, y) coordinate
rope = ggml_rope_multi(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
} else {
GGML_ASSERT(n_dims/3 > 0);
int rope_sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0};
rope = ggml_rope_multi(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
}
} else {
rope = ggml_rope(ctx, a, pos, ne_a[0], mode);
}
ggml_tensor * view = ggml_view_2d(ctx, rope, ne_a[0] * ne_a[1], ne_a[2], rope->nb[2], 0);
ggml_tensor * dst = ggml_new_tensor_4d(ctx, type, ne_a[0] * ne_a[1], ne_a[2] * ne_a[3], 1, 1);
ggml_set_name(dst, "dst");
ggml_tensor * row_idxs = ggml_new_tensor_3d(ctx, type_idx, ne[2], 1, 1);
ggml_tensor * row_idxs = ggml_new_tensor_3d(ctx, type_idx, ne_a[2], 1, 1);
ggml_set_name(row_idxs, "row_idxs");
ggml_tensor * out = ggml_set_rows(ctx, dst, view, row_idxs);
@ -2373,14 +2402,26 @@ struct test_rope_set_rows : public test_case {
void initialize_tensors(ggml_context * ctx) override {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->type == GGML_TYPE_I64 || t->type == GGML_TYPE_I32) {
if (strcmp(t->name, "row_idxs") == 0) {
if (ggml_is_view_op(t->op)) {
continue;
}
init_set_rows_row_ids(t, ne[2]);
init_set_rows_row_ids(t, ne_a[2]);
} else if (t->type == GGML_TYPE_I32) {
// pos
const int num_pos_ids = (mode & GGML_ROPE_TYPE_MROPE) ? ne_a[2] * 4 : ne_a[2];
std::vector<int> data(num_pos_ids);
for (int i = 0; i < num_pos_ids; i++) {
data[i] = rand() % n_ctx;
}
ggml_backend_tensor_set(t, data.data(), 0, num_pos_ids * sizeof(int));
} else {
init_tensor_uniform(t);
if (t->ne[0] == n_dims/2) {
// frequency factors in the range [0.9f, 1.1f]
init_tensor_uniform(t, 0.9f, 1.1f);
} else {
init_tensor_uniform(t);
}
}
}
}
@ -6854,10 +6895,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
}
}
for (int mode : { GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX }) {
for (int mode : { GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_VISION }) {
for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, 1, 100 }, mode));
test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, 512, 1 }, mode));
for (int ne2 : {1, 8, 512}) {
test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, ne2, 1 }, mode));
test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, ne2, 3 }, mode));
}
}
}

View File

@ -1196,6 +1196,9 @@ int main(int argc, const char ** argv) {
test_sampler_chain();
llama_free(ctx);
llama_model_free(model);
fprintf(stdout, "All tests passed.\n");
return 0;
}

View File

@ -300,8 +300,8 @@ int main(int argc, char **argv) {
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
}
llama_model_free(model);
llama_free(ctx);
llama_model_free(model);
llama_backend_free();

View File

@ -146,8 +146,8 @@ int main(int argc, char **argv) {
}
}
llama_model_free(model);
llama_free(ctx);
llama_model_free(model);
llama_backend_free();

View File

@ -116,8 +116,8 @@ int main(int argc, char ** argv) {
}
}
llama_model_free(model);
llama_free(ctx);
llama_model_free(model);
llama_backend_free();

View File

@ -55,6 +55,7 @@ int main(int argc, char ** argv) {
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
llama_model_free(model);
return 1;
}
@ -108,6 +109,8 @@ int main(int argc, char ** argv) {
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
llama_free(ctx);
llama_model_free(model);
return 1;
}
}
@ -147,6 +150,8 @@ int main(int argc, char ** argv) {
if (!decode_helper(ctx, batch, ctx_params.n_batch, false)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
llama_free(ctx);
llama_model_free(model);
return 1;
}
@ -165,6 +170,8 @@ int main(int argc, char ** argv) {
common_batch_add(batch, get_token_rand(), pp + 0, { 0 }, true);
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
llama_free(ctx);
llama_model_free(model);
return 1;
}
llama_memory_seq_rm(mem, 0, pp, -1);
@ -184,6 +191,8 @@ int main(int argc, char ** argv) {
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
llama_free(ctx);
llama_model_free(model);
return 1;
}
}
@ -200,6 +209,8 @@ int main(int argc, char ** argv) {
if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
llama_free(ctx);
llama_model_free(model);
return 1;
}
}

View File

@ -1 +1,187 @@
TODO
# llama.cpp/tools/cli
## Usage
<!-- HELP_START -->
<!-- IMPORTANT: The list below is auto-generated by llama-gen-docs; do NOT modify it manually -->
### Common params
| Argument | Explanation |
| -------- | ----------- |
| `-h, --help, --usage` | print usage and exit |
| `--version` | show version and build info |
| `-cl, --cache-list` | show list of models in cache |
| `--completion-bash` | print source-able bash completion script for llama.cpp |
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
| `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0) |
| `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0) |
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50) |
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) |
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity)<br/>(env: LLAMA_ARG_N_PREDICT) |
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `-p, --prompt PROMPT` | prompt to start generation with; for system message, use -sys |
| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) |
| `-f, --file FNAME` | a file containing the prompt (default: none) |
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
| `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit |
| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
| `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
| `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
| `-fitt, --fit-target MiB` | target margin per device for --fit option, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096<br/>(env: LLAMA_ARG_FIT_CTX) |
| `--check-tensors` | check model tensor data for invalid values (default: false) |
| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) |
| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)<br/>note: use comma-separated values |
| `--control-vector FNAME` | add a control vector<br/>note: use comma-separated values to add multiple control vectors |
| `--control-vector-scaled FNAME:SCALE,...` | add a control vector with user defined scaling SCALE<br/>note: use comma-separated values (format: FNAME:SCALE,...) |
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |
| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
| `--log-disable` | Log disable |
| `--log-file FNAME` | Log to file<br/>(env: LLAMA_LOG_FILE) |
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
| `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
### Sampling params
| Argument | Explanation |
| -------- | ----------- |
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) |
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
| `--temp N` | temperature (default: 0.8) |
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
| `--dry-base N` | set DRY sampling base value (default: 1.75) |
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers |
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
| `--grammar-file FNAME` | file to read grammar from |
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
### CLI-specific params
| Argument | Explanation |
| -------- | ----------- |
| `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) |
| `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal |
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) |
| `--show-timings, --no-show-timings` | whether to show timing information after each response (default: true)<br/>(env: LLAMA_ARG_SHOW_TIMINGS) |
| `-sysf, --system-prompt-file FNAME` | a file containing the system prompt (default: none) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
| `-sp, --special` | special tokens output enabled (default: false) |
| `-cnv, --conversation, -no-cnv, --no-conversation` | whether to run in conversation mode:<br/>- does not print special tokens and suffix/prefix<br/>- interactive mode is also enabled<br/>(default: auto enabled if chat template is available) |
| `-st, --single-turn` | run conversation for a single turn only, then exit when done<br/>will not be interactive if first turn is predefined with --prompt<br/>(default: false) |
| `-mli, --multiline-input` | allows you to write or paste multiple lines without ending each in '\' |
| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
| `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files |
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
| `-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
| `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
| `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
| `--chat-template-kwargs STRING` | sets additional params for the json template parser<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
| `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
| `--gpt-oss-20b-default` | use gpt-oss-20b (note: can download weights from the internet) |
| `--gpt-oss-120b-default` | use gpt-oss-120b (note: can download weights from the internet) |
| `--vision-gemma-4b-default` | use Gemma 3 4B QAT (note: can download weights from the internet) |
| `--vision-gemma-12b-default` | use Gemma 3 12B QAT (note: can download weights from the internet) |
<!-- HELP_END -->

View File

@ -216,7 +216,7 @@ int main(int argc, char ** argv) {
ctx_cli.ctx_server.start_loop();
});
auto inf = ctx_cli.ctx_server.get_info();
auto inf = ctx_cli.ctx_server.get_meta();
std::string modalities = "text";
if (inf.has_inp_image) {
modalities += ", vision";

View File

@ -5,13 +5,14 @@ This example program allows you to use various LLaMA language models easily and
## Table of Contents
1. [Quick Start](#quick-start)
2. [Common Options](#common-options)
3. [Input Prompts](#input-prompts)
4. [Interaction](#interaction)
5. [Context Management](#context-management)
6. [Generation Flags](#generation-flags)
7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
8. [Additional Options](#additional-options)
2. [Usage](#usage)
3. [Common Options](#common-options)
4. [Input Prompts](#input-prompts)
5. [Interaction](#interaction)
6. [Context Management](#context-management)
7. [Generation Flags](#generation-flags)
8. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
9. [Additional Options](#additional-options)
## Quick Start
@ -82,6 +83,177 @@ Once downloaded, place your model in the models folder in llama.cpp.
llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
```
## Usage
<!-- HELP_START -->
<!-- IMPORTANT: The list below is auto-generated by llama-gen-docs; do NOT modify it manually -->
### Common params
| Argument | Explanation |
| -------- | ----------- |
| `-h, --help, --usage` | print usage and exit |
| `--version` | show version and build info |
| `-cl, --cache-list` | show list of models in cache |
| `--completion-bash` | print source-able bash completion script for llama.cpp |
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
| `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0) |
| `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0) |
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50) |
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) |
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `-p, --prompt PROMPT` | prompt to start generation with; for system message, use -sys |
| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) |
| `-f, --file FNAME` | a file containing the prompt (default: none) |
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
| `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit |
| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
| `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
| `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
| `-fitt, --fit-target MiB` | target margin per device for --fit option, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096<br/>(env: LLAMA_ARG_FIT_CTX) |
| `--check-tensors` | check model tensor data for invalid values (default: false) |
| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) |
| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)<br/>note: use comma-separated values |
| `--control-vector FNAME` | add a control vector<br/>note: use comma-separated values to add multiple control vectors |
| `--control-vector-scaled FNAME:SCALE,...` | add a control vector with user defined scaling SCALE<br/>note: use comma-separated values (format: FNAME:SCALE,...) |
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |
| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
| `--log-disable` | Log disable |
| `--log-file FNAME` | Log to file<br/>(env: LLAMA_LOG_FILE) |
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
| `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
### Sampling params
| Argument | Explanation |
| -------- | ----------- |
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) |
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
| `--temp N` | temperature (default: 0.8) |
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
| `--dry-base N` | set DRY sampling base value (default: 1.75) |
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers |
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
| `--grammar-file FNAME` | file to read grammar from |
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
### Completion-specific params
| Argument | Explanation |
| -------- | ----------- |
| `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) |
| `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) |
| `-sysf, --system-prompt-file FNAME` | a file containing the system prompt (default: none) |
| `-ptc, --print-token-count N` | print token count every N tokens (default: -1) |
| `--prompt-cache FNAME` | file to cache prompt state for faster startup (default: none) |
| `--prompt-cache-all` | if specified, saves user input and generations to cache as well |
| `--prompt-cache-ro` | if specified, uses the prompt cache but does not update it |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
| `-sp, --special` | special tokens output enabled (default: false) |
| `-cnv, --conversation, -no-cnv, --no-conversation` | whether to run in conversation mode:<br/>- does not print special tokens and suffix/prefix<br/>- interactive mode is also enabled<br/>(default: auto enabled if chat template is available) |
| `-st, --single-turn` | run conversation for a single turn only, then exit when done<br/>will not be interactive if first turn is predefined with --prompt<br/>(default: false) |
| `-i, --interactive` | run in interactive mode (default: false) |
| `-if, --interactive-first` | run in interactive mode and wait for input right away (default: false) |
| `-mli, --multiline-input` | allows you to write or paste multiple lines without ending each in '\' |
| `--in-prefix-bos` | prefix BOS to user inputs, preceding the `--in-prefix` string |
| `--in-prefix STRING` | string to prefix user inputs with (default: empty) |
| `--in-suffix STRING` | string to suffix after user inputs with (default: empty) |
| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `-gan, --grp-attn-n N` | group-attention factor (default: 1)<br/>(env: LLAMA_ARG_GRP_ATTN_N) |
| `-gaw, --grp-attn-w N` | group-attention width (default: 512)<br/>(env: LLAMA_ARG_GRP_ATTN_W) |
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
<!-- HELP_END -->
## Common Options
In this section, we cover the most commonly used options for running the `llama-completion` program with the LLaMA models:

View File

@ -2102,6 +2102,8 @@ int main(int argc, char ** argv) {
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
tpp.strict_cpu = t.cpu_strict;
@ -2111,6 +2113,8 @@ int main(int argc, char ** argv) {
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
if (!threadpool) {
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
@ -2126,6 +2130,8 @@ int main(int argc, char ** argv) {
bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
if (!res) {
fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
}
@ -2136,6 +2142,8 @@ int main(int argc, char ** argv) {
bool res = test_gen(ctx, 1, t.n_threads);
if (!res) {
fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
}
@ -2164,6 +2172,8 @@ int main(int argc, char ** argv) {
bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
if (!res) {
fprintf(stderr, "%s: error: failed to run depth\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
@ -2189,6 +2199,8 @@ int main(int argc, char ** argv) {
bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
if (!res) {
fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
}
@ -2200,6 +2212,8 @@ int main(int argc, char ** argv) {
bool res = test_gen(ctx, t.n_gen, t.n_threads);
if (!res) {
fprintf(stderr, "%s: error: failed to run gen\n", __func__);
llama_free(ctx);
llama_model_free(lmodel);
exit(1);
}
}

View File

@ -23,9 +23,11 @@ For the ful list of features, please refer to [server's changelog](https://githu
## Usage
<!-- Note for contributors: The list below is generated by llama-gen-docs -->
<!-- HELP_START -->
**Common params**
<!-- IMPORTANT: The list below is auto-generated by llama-gen-docs; do NOT modify it manually -->
### Common params
| Argument | Explanation |
| -------- | ----------- |
@ -38,13 +40,13 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
| `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0)<br/> |
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0) |
| `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0) |
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50) |
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) |
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity)<br/>(env: LLAMA_ARG_N_PREDICT) |
@ -114,7 +116,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
**Sampling params**
### Sampling params
| Argument | Explanation |
| -------- | ----------- |
@ -138,7 +140,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--dry-base N` | set DRY sampling base value (default: 1.75) |
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers<br/> |
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers |
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
@ -151,7 +153,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
**Server-specific params**
### Server-specific params
| Argument | Explanation |
| -------- | ----------- |
@ -159,7 +161,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `-kvu, --kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
| `-sp, --special` | special tokens output enabled (default: false) |
| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
@ -208,8 +210,9 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)<br/> |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `--sleep-idle-seconds SECONDS` | number of seconds of idleness after which the server will sleep (default: -1; -1 = disabled) |
| `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
| `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
@ -234,6 +237,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--vision-gemma-4b-default` | use Gemma 3 4B QAT (note: can download weights from the internet) |
| `--vision-gemma-12b-default` | use Gemma 3 12B QAT (note: can download weights from the internet) |
<!-- HELP_END -->
Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
@ -1567,7 +1571,6 @@ Load a model
Payload:
- `model`: name of the model to be loaded.
- `extra_args`: (optional) an array of additional arguments to be passed to the model instance. Note: you must start the server with `--models-allow-extra-args` to enable this feature.
```json
{

View File

@ -115,26 +115,14 @@ bool lora_should_clear_cache(
!lora_all_alora(next));
}
std::vector<common_adapter_lora_info> parse_lora_request(
const std::vector<common_adapter_lora_info> & lora_base,
const json & data) {
std::vector<common_adapter_lora_info> lora(lora_base);
int max_idx = lora.size();
// clear existing value
for (auto & entry : lora) {
entry.scale = 0.0f;
}
std::map<int, float> parse_lora_request(const json & data) {
std::map<int, float> lora;
// set value
for (const auto & entry : data) {
int id = json_value(entry, "id", -1);
float scale = json_value(entry, "scale", 0.0f);
if (0 <= id && id < max_idx) {
lora[id].scale = scale;
} else {
throw std::runtime_error("invalid adapter id");
}
lora[id] = scale;
}
return lora;
@ -1435,7 +1423,7 @@ std::string safe_json_to_str(const json & data) {
// TODO: reuse llama_detokenize
template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
static std::string tokens_to_str(const llama_vocab * ctx, Iter begin, Iter end) {
std::string ret;
for (; begin != end; ++begin) {
ret += common_token_to_piece(ctx, *begin);
@ -1445,7 +1433,12 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
}
std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens) {
return tokens_to_str(ctx, tokens.begin(), tokens.end());
auto model = llama_get_model(ctx);
return tokens_to_str(llama_model_get_vocab(model), tokens.begin(), tokens.end());
}
std::string tokens_to_str(const llama_vocab * vocab, const llama_tokens & tokens) {
return tokens_to_str(vocab, tokens.begin(), tokens.end());
}
// format incomplete utf-8 multibyte character for output

View File

@ -107,9 +107,7 @@ bool lora_should_clear_cache(
const std::vector<common_adapter_lora_info> & current,
const std::vector<common_adapter_lora_info> & next);
std::vector<common_adapter_lora_info> parse_lora_request(
const std::vector<common_adapter_lora_info> & lora_base,
const json & data);
std::map<int, float> parse_lora_request(const json & data);
bool are_lora_equal(
const std::vector<common_adapter_lora_info> & l1,
@ -325,6 +323,7 @@ std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int i
std::string safe_json_to_str(const json & data);
std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens);
std::string tokens_to_str(const llama_vocab * vocab, const llama_tokens & tokens);
// format incomplete utf-8 multibyte character for output
std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token);

File diff suppressed because it is too large Load Diff

View File

@ -9,11 +9,35 @@
struct server_context_impl; // private implementation
struct server_context_info {
struct server_context_meta {
std::string build_info;
std::string model_name;
std::string model_path;
bool has_mtmd;
bool has_inp_image;
bool has_inp_audio;
json json_webui_settings;
int slot_n_ctx;
enum llama_pooling_type pooling_type;
// chat template
std::string chat_template;
std::string chat_template_tool_use;
// tokens
std::string bos_token_str;
std::string eos_token_str;
llama_token fim_pre_token;
llama_token fim_sub_token;
llama_token fim_mid_token;
// model meta
enum llama_vocab_type model_vocab_type;
int32_t model_vocab_n_tokens;
int32_t model_n_ctx_train;
int32_t model_n_embd_inp;
uint64_t model_n_params;
uint64_t model_size;
};
struct server_context {
@ -33,14 +57,15 @@ struct server_context {
void terminate();
// get the underlaying llama_context, can return nullptr if sleeping
// not thread-safe, should only be used from the main thread
llama_context * get_llama_context() const;
// get a new response reader, used by CLI application
server_response_reader get_response_reader();
// get server info
// used by CLI application
server_context_info get_info() const;
// get server metadata (read-only), can only be called after load_model()
// not thread-safe, should only be used from the main thread
server_context_meta get_meta() const;
};
@ -48,13 +73,17 @@ struct server_context {
struct server_res_generator;
struct server_routes {
server_routes(const common_params & params, server_context & ctx_server, std::function<bool()> is_ready = []() { return true; })
: params(params), ctx_server(*ctx_server.impl), is_ready(is_ready) {
init_routes();
}
server_routes(const common_params & params, server_context & ctx_server);
void init_routes();
// note: this is not thread-safe and can only when ctx_http.is_ready is false
void update_meta(const server_context & ctx_server) {
this->meta = std::make_unique<server_context_meta>(ctx_server.get_meta());
}
// handlers using lambda function, so that they can capture `this` without `std::bind`
// they won't be called until ctx_http.is_ready is set to true
server_http_context::handler_t get_health;
server_http_context::handler_t get_metrics;
server_http_context::handler_t get_slots;
@ -78,13 +107,24 @@ struct server_routes {
server_http_context::handler_t get_lora_adapters;
server_http_context::handler_t post_lora_adapters;
private:
// TODO: move these outside of server_routes?
std::unique_ptr<server_res_generator> handle_completions_impl(
const server_http_req & req,
server_task_type type,
const json & data,
const std::vector<raw_buffer> & files,
task_response_type res_type);
std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
// using unique_ptr to allow late initialization of const
std::unique_ptr<const server_context_meta> meta;
const common_params & params;
server_context_impl & ctx_server;
std::function<bool()> is_ready;
const server_context_impl & ctx_server;
server_queue & queue_tasks;
server_response & queue_results;
std::unique_ptr<server_res_generator> create_response(bool bypass_sleep = false);
};

View File

@ -177,12 +177,11 @@ bool server_http_context::init(const common_params & params) {
if (!ready) {
auto tmp = string_split<std::string>(req.path, '.');
if (req.path == "/" || tmp.back() == "html") {
res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
res.status = 503;
} else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") {
// allow the models endpoint to be accessed during loading
return true;
res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
} else {
// no endpoints is allowed to be accessed when the server is not ready
// this is to prevent any data races or inconsistent states
res.status = 503;
res.set_content(
safe_json_to_str(json {
@ -334,12 +333,16 @@ static std::map<std::string, std::string> get_headers(const httplib::Request & r
return headers;
}
static void process_handler_response(server_http_res_ptr & response, httplib::Response & res) {
// using unique_ptr for request to allow safe capturing in lambdas
using server_http_req_ptr = std::unique_ptr<server_http_req>;
static void process_handler_response(server_http_req_ptr && request, server_http_res_ptr & response, httplib::Response & res) {
if (response->is_stream()) {
res.status = response->status;
set_headers(res, response->headers);
std::string content_type = response->content_type;
// convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it
std::shared_ptr<server_http_req> q_ptr = std::move(request);
std::shared_ptr<server_http_res> r_ptr = std::move(response);
const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool {
std::string chunk;
@ -355,8 +358,9 @@ static void process_handler_response(server_http_res_ptr & response, httplib::Re
}
return has_next;
};
const auto on_complete = [response = r_ptr](bool) mutable {
const auto on_complete = [request = q_ptr, response = r_ptr](bool) mutable {
response.reset(); // trigger the destruction of the response object
request.reset(); // trigger the destruction of the request object
};
res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete);
} else {
@ -368,27 +372,29 @@ static void process_handler_response(server_http_res_ptr & response, httplib::Re
void server_http_context::get(const std::string & path, const server_http_context::handler_t & handler) const {
pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
server_http_res_ptr response = handler(server_http_req{
server_http_req_ptr request = std::make_unique<server_http_req>(server_http_req{
get_params(req),
get_headers(req),
req.path,
req.body,
req.is_connection_closed
});
process_handler_response(response, res);
server_http_res_ptr response = handler(*request);
process_handler_response(std::move(request), response, res);
});
}
void server_http_context::post(const std::string & path, const server_http_context::handler_t & handler) const {
pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
server_http_res_ptr response = handler(server_http_req{
server_http_req_ptr request = std::make_unique<server_http_req>(server_http_req{
get_params(req),
get_headers(req),
req.path,
req.body,
req.is_connection_closed
});
process_handler_response(response, res);
server_http_res_ptr response = handler(*request);
process_handler_response(std::move(request), response, res);
});
}

View File

@ -325,23 +325,25 @@ void server_response::terminate() {
// server_response_reader
//
void server_response_reader::post_task(server_task && task) {
void server_response_reader::post_task(server_task && task, bool front) {
GGML_ASSERT(id_tasks.empty() && "post_task() can only be called once per reader");
task.index = 0;
id_tasks.insert(task.id);
states.push_back(task.create_state());
queue_results.add_waiting_task_id(task.id);
queue_tasks.post(std::move(task));
queue_tasks.post(std::move(task), front);
}
void server_response_reader::post_tasks(std::vector<server_task> && tasks) {
void server_response_reader::post_tasks(std::vector<server_task> && tasks, bool front) {
GGML_ASSERT(id_tasks.empty() && "post_tasks() can only be called once per reader");
id_tasks = server_task::get_list_id(tasks);
states.reserve(tasks.size());
for (size_t i = 0; i < tasks.size(); i++) {
tasks[i].index = i;
states.push_back(tasks[i].create_state());
}
queue_results.add_waiting_tasks(tasks);
queue_tasks.post(std::move(tasks));
queue_tasks.post(std::move(tasks), front);
}
bool server_response_reader::has_next() const {
@ -367,7 +369,7 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()>
}
if (!states.empty()) {
// update the generation state if needed
size_t idx = result->get_index();
const size_t idx = result->index;
GGML_ASSERT(idx < states.size());
result->update(states[idx]);
}
@ -383,6 +385,7 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()>
server_response_reader::batch_response server_response_reader::wait_for_all(const std::function<bool()> & should_stop) {
batch_response batch_res;
batch_res.results.clear();
batch_res.results.resize(id_tasks.size());
while (has_next()) {
auto res = next(should_stop);
@ -394,7 +397,7 @@ server_response_reader::batch_response server_response_reader::wait_for_all(cons
batch_res.error = std::move(res);
return batch_res;
}
const size_t idx = res->get_index();
const size_t idx = res->index;
GGML_ASSERT(idx < batch_res.results.size() && "index out of range");
GGML_ASSERT(batch_res.results[idx] == nullptr && "duplicate result received");
batch_res.results[idx] = std::move(res);

View File

@ -5,6 +5,7 @@
#include <condition_variable>
#include <deque>
#include <mutex>
#include <vector>
#include <unordered_set>
// struct for managing server tasks
@ -173,8 +174,10 @@ struct server_response_reader {
int get_new_id() {
return queue_tasks.get_new_id();
}
void post_task(server_task && task);
void post_tasks(std::vector<server_task> && tasks);
// if front = true, the task will be posted to the front of the queue (high priority)
void post_task(server_task && task, bool front = false);
void post_tasks(std::vector<server_task> && tasks, bool front = false);
bool has_next() const;
// return nullptr if should_stop() is true before receiving a result

View File

@ -32,8 +32,8 @@ json task_params::to_json(bool only_metrics) const {
}
json lora = json::array();
for (size_t i = 0; i < this->lora.size(); ++i) {
lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
for (auto & it : this->lora) {
lora.push_back({{"id", it.first}, {"scale", it.second}});
}
if (only_metrics) {
@ -145,12 +145,10 @@ json task_params::to_json(bool only_metrics) const {
//
task_params server_task::params_from_json_cmpl(
const llama_context * ctx,
const llama_vocab * vocab,
const common_params & params_base,
const int n_ctx_slot,
const json & data) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
task_params params;
// Sampling parameter defaults are loaded from the global server context (but individual requests can still them)
@ -225,12 +223,12 @@ task_params server_task::params_from_json_cmpl(
if (data.contains("lora")) {
if (data.at("lora").is_array()) {
params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
params.lora = parse_lora_request(data.at("lora"));
} else {
throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
}
} else {
params.lora = params_base.lora_adapters;
params.lora = {};
}
// TODO: add more sanity checks for the input parameters
@ -245,11 +243,11 @@ task_params server_task::params_from_json_cmpl(
if (params.sampling.penalty_last_n == -1) {
// note: should be the slot's context and not the full context, but it's ok
params.sampling.penalty_last_n = llama_n_ctx(ctx);
params.sampling.penalty_last_n = n_ctx_slot;
}
if (params.sampling.dry_penalty_last_n == -1) {
params.sampling.dry_penalty_last_n = llama_n_ctx(ctx);
params.sampling.dry_penalty_last_n = n_ctx_slot;
}
if (params.sampling.dry_base < 1.0f) {
@ -1155,7 +1153,7 @@ json server_task_result_rerank::to_json() {
json server_task_result_cmpl_partial::to_json_anthropic() {
json events = json::array();
bool first = (n_decoded == 1);
static bool text_block_started = false;
bool text_block_started = false;
if (first) {
text_block_started = false;
@ -1326,6 +1324,30 @@ json server_task_result_slot_erase::to_json() {
};
}
//
// server_task_result_get_lora
//
json server_task_result_get_lora::to_json() {
json result = json::array();
for (size_t i = 0; i < loras.size(); ++i) {
auto & lora = loras[i];
json entry = {
{"id", i},
{"path", lora.info.path},
{"scale", lora.info.scale},
{"task_name", lora.info.task_name},
{"prompt_prefix", lora.info.prompt_prefix},
};
if (!lora.alora_invocation_tokens.empty()) {
entry["alora_invocation_string"] = lora.alora_invocation_string;
entry["alora_invocation_tokens"] = lora.alora_invocation_tokens;
}
result.push_back(std::move(entry));
}
return result;
}
//
// server_task_result_apply_lora
//

View File

@ -6,6 +6,7 @@
#include <string>
#include <unordered_set>
#include <list>
#include <map>
// TODO: prevent including the whole server-common.h as we only use server_tokens
#include "server-common.h"
@ -23,6 +24,7 @@ enum server_task_type {
SERVER_TASK_TYPE_SLOT_SAVE,
SERVER_TASK_TYPE_SLOT_RESTORE,
SERVER_TASK_TYPE_SLOT_ERASE,
SERVER_TASK_TYPE_GET_LORA,
SERVER_TASK_TYPE_SET_LORA,
};
@ -60,7 +62,7 @@ struct task_params {
int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<common_adapter_lora_info> lora;
std::map<int, float> lora; // mapping adapter ID -> scale
std::vector<std::string> antiprompt;
std::vector<std::string> response_fields;
@ -105,8 +107,10 @@ struct task_result_state {
};
struct server_task {
int id = -1; // to be filled by server_queue
int index = -1; // used when there are multiple prompts (batch request)
int id = -1; // to be filled by server_queue
// TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
size_t index = 0; // used when there are multiple prompts (batch request)
// used by SERVER_TASK_TYPE_CANCEL
int id_target = -1;
@ -138,7 +142,7 @@ struct server_task {
bool metrics_reset_bucket = false;
// used by SERVER_TASK_TYPE_SET_LORA
std::vector<common_adapter_lora_info> set_lora;
std::map<int, float> set_lora; // mapping adapter ID -> scale
server_task() = default;
@ -149,9 +153,10 @@ struct server_task {
}
static task_params params_from_json_cmpl(
const llama_context * ctx,
const common_params & params_base,
const json & data);
const llama_vocab * vocab,
const common_params & params_base,
const int n_ctx_slot,
const json & data);
// utility function
static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
@ -162,10 +167,9 @@ struct server_task {
return ids;
}
server_task create_child(int id_parent, int id_child, int idx) const {
server_task create_child(int id_parent, int id_child) const {
server_task copy;
copy.id = id_child;
copy.index = idx;
copy.id_parent = id_parent;
copy.params = params;
copy.type = type;
@ -212,6 +216,10 @@ struct result_prompt_progress {
struct server_task_result {
int id = -1;
int id_slot = -1;
// TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
size_t index = 0; // to be used for batched tasks
virtual bool is_error() {
// only used by server_task_result_error
return false;
@ -220,9 +228,6 @@ struct server_task_result {
// only used by server_task_result_cmpl_*
return true;
}
virtual int get_index() {
return -1;
}
virtual void update(task_result_state &) {
// only used by server_task_result_cmpl_*
}
@ -255,8 +260,6 @@ struct completion_token_output {
};
struct server_task_result_cmpl_final : server_task_result {
int index = 0;
std::string content;
llama_tokens tokens;
@ -289,10 +292,6 @@ struct server_task_result_cmpl_final : server_task_result {
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
bool is_updated = false;
virtual int get_index() override {
return index;
}
virtual bool is_stop() override {
return true; // in stream mode, final responses are considered stop
}
@ -318,8 +317,6 @@ struct server_task_result_cmpl_final : server_task_result {
};
struct server_task_result_cmpl_partial : server_task_result {
int index = 0;
std::string content;
llama_tokens tokens;
@ -340,10 +337,6 @@ struct server_task_result_cmpl_partial : server_task_result {
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
bool is_updated = false;
virtual int get_index() override {
return index;
}
virtual bool is_stop() override {
return false; // in stream mode, partial responses are not considered stop
}
@ -365,7 +358,6 @@ struct server_task_result_cmpl_partial : server_task_result {
};
struct server_task_result_embd : server_task_result {
int index = 0;
std::vector<std::vector<float>> embedding;
int32_t n_tokens;
@ -373,10 +365,6 @@ struct server_task_result_embd : server_task_result {
// response formatting
task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
virtual int get_index() override {
return index;
}
virtual json to_json() override;
json to_json_non_oaicompat();
@ -385,20 +373,14 @@ struct server_task_result_embd : server_task_result {
};
struct server_task_result_rerank : server_task_result {
int index = 0;
float score = -1e6;
int32_t n_tokens;
virtual int get_index() override {
return index;
}
virtual json to_json() override;
};
struct server_task_result_error : server_task_result {
int index = 0;
error_type err_type = ERROR_TYPE_SERVER;
std::string err_msg;
@ -460,6 +442,17 @@ struct server_task_result_slot_erase : server_task_result {
virtual json to_json() override;
};
struct server_task_result_get_lora : server_task_result {
struct lora {
common_adapter_lora_info info;
std::string alora_invocation_string;
llama_tokens alora_invocation_tokens;
};
std::vector<lora> loras;
virtual json to_json() override;
};
struct server_task_result_apply_lora : server_task_result {
virtual json to_json() override;
};

View File

@ -119,7 +119,7 @@ int main(int argc, char ** argv, char ** envp) {
//
// register API routes
server_routes routes(params, ctx_server, [&ctx_http]() { return ctx_http.is_ready.load(); });
server_routes routes(params, ctx_server);
bool is_router_server = params.model.path.empty();
std::optional<server_models_routes> models_routes{};
@ -252,6 +252,7 @@ int main(int argc, char ** argv, char ** envp) {
return 1;
}
routes.update_meta(ctx_server);
ctx_http.is_ready.store(true);
LOG_INF("%s: model loaded\n", __func__);