llama: end-to-end tests (#19802)
* tests: add end-to-end tests per model architecture * fixup for rebase * fix use-after-free in llama-model-loader.cpp * fix CI * fix WebGPU * fix CI * disable CI for macOS-latest-cmake-arm64 * use expert_weights_scale only if != 0.0f * comments
This commit is contained in:
parent
a95047979a
commit
a976ff081b
|
|
@ -93,7 +93,7 @@ jobs:
|
|||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
ctest -L main -E "test-llama-archs" --verbose --timeout 900
|
||||
|
||||
macOS-latest-cmake-x64:
|
||||
runs-on: macos-15-intel
|
||||
|
|
|
|||
|
|
@ -2666,7 +2666,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
[](common_params & params, const std::string & value) {
|
||||
params.out_file = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RESULTS}));
|
||||
add_opt(common_arg(
|
||||
{"-ofreq", "--output-frequency"}, "N",
|
||||
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
||||
|
|
@ -3607,6 +3607,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
}
|
||||
}
|
||||
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(common_arg(
|
||||
{"--check"},
|
||||
string_format("check rather than generate results (default: %s)", params.check ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.check = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_RESULTS}));
|
||||
add_opt(common_arg(
|
||||
{"--save-logits"},
|
||||
string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
|
||||
|
|
|
|||
|
|
@ -104,6 +104,7 @@ enum llama_example {
|
|||
LLAMA_EXAMPLE_DIFFUSION,
|
||||
LLAMA_EXAMPLE_FINETUNE,
|
||||
LLAMA_EXAMPLE_FIT_PARAMS,
|
||||
LLAMA_EXAMPLE_RESULTS,
|
||||
|
||||
LLAMA_EXAMPLE_COUNT,
|
||||
};
|
||||
|
|
@ -456,6 +457,8 @@ struct common_params {
|
|||
|
||||
bool kl_divergence = false; // compute KL divergence
|
||||
|
||||
bool check = false; // check rather than generate results for llama-results
|
||||
|
||||
bool usage = false; // print usage
|
||||
bool completion = false; // print source-able completion script
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
#include "ggml-cpu.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-opt.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
|
@ -440,19 +441,30 @@ extern "C" {
|
|||
|
||||
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
|
||||
|
||||
typedef void (*llama_model_set_tensor_data_t)(struct ggml_tensor * tensor, void * userdata);
|
||||
|
||||
// Create a new model from GGUF metadata as well as a function to set the tensor data
|
||||
// - tensors are created as GGML_TYPE_F32 by default,
|
||||
// override by adding a tensor with the same name but a different type to the context
|
||||
LLAMA_API struct llama_model * llama_model_init_from_user(
|
||||
struct gguf_context * metadata,
|
||||
llama_model_set_tensor_data_t set_tensor_data, // function to initialize tensor data with
|
||||
void * set_tensor_data_ud, // userdata for function
|
||||
struct llama_model_params params);
|
||||
|
||||
DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
|
||||
const char * path_model,
|
||||
struct llama_model_params params),
|
||||
"use llama_model_load_from_file instead");
|
||||
|
||||
// Load the model from a file
|
||||
// Load a model from a file
|
||||
// If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
|
||||
// If the split file name does not follow this pattern, use llama_model_load_from_splits
|
||||
LLAMA_API struct llama_model * llama_model_load_from_file(
|
||||
const char * path_model,
|
||||
struct llama_model_params params);
|
||||
|
||||
// Load the model from multiple splits (support custom naming scheme)
|
||||
// Load a model from multiple splits (support custom naming scheme)
|
||||
// The paths must be in the correct order
|
||||
LLAMA_API struct llama_model * llama_model_load_from_splits(
|
||||
const char ** paths,
|
||||
|
|
|
|||
|
|
@ -0,0 +1,18 @@
|
|||
#!/usr/bin/env bash
#
# Configure and build the llama-results target in a scratch build directory, then run it.
# Can be called directly or used as the command for `git bisect run`.
#
# Arguments:
#   -D*            forwarded to the CMake configure step
#   anything else  forwarded to llama-results
#
# Exit status:
#   125                   configure or build failed (tells `git bisect run` to skip this commit)
#   otherwise             the exit status of llama-results

cmake_args=()
llama_results_args=()

# split the command line into CMake options and llama-results options
for arg in "${@}"; do
    if [[ "$arg" == -D* ]]; then
        cmake_args+=("$arg")
    else
        llama_results_args+=("$arg")
    fi
done

dir="build-bisect"

# always start from a clean build directory so stale artifacts cannot leak between commits
rm -rf "${dir}" > /dev/null

# NOTE: "${cmake_args[@]}" expands to every collected option; a bare ${cmake_args} would
#       only yield the first array element and silently drop the rest
cmake -B "${dir}" -S . "${cmake_args[@]}" > /dev/null || exit 125
cmake --build "${dir}" -t llama-results -j "$(nproc)" > /dev/null || exit 125

"${dir}/bin/llama-results" "${llama_results_args[@]}"
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
#!/usr/bin/env bash
#
# Bisect a change in llama-results output between two commits.
#
# Usage: ./scripts/git-bisect.sh <commit_bad> <commit_good> [additional arguments]
#
# Steps:
#   1. check out <commit_good> and run git-bisect-run.sh to write results.gguf
#   2. start a bisect between <commit_bad> and <commit_good>
#   3. let `git bisect run` call git-bisect-run.sh with --check on each candidate commit
#   4. reset the bisect state
#
# Additional arguments are handed to git-bisect-run.sh unchanged.

if [ $# -lt 2 ]; then
    echo "usage: ./scripts/git-bisect.sh <commit_bad> <commit_good> [additional arguments]"
    echo " additional arguments: passed to CMake if they start with \"-D\", to llama-results otherwise"
    exit 1
fi

# abort on the first failing command and echo every command as it runs
set -e
set -x

commit_bad=$1
commit_good=$2

# directory that contains this script, used to locate git-bisect-run.sh
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

# all expansions are quoted so that a checkout path containing whitespace keeps working
git checkout "${commit_good}"
"${script_dir}/git-bisect-run.sh" --output results.gguf "${@:3}"
git bisect start "${commit_bad}" "${commit_good}"
git bisect run "${script_dir}/git-bisect-run.sh" --output results.gguf --check "${@:3}"
git bisect reset
|
||||
|
|
@ -4,6 +4,7 @@
|
|||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize
|
||||
|
|
@ -2786,6 +2787,15 @@ std::string LLM_TN_IMPL::str() const {
|
|||
return name;
|
||||
}
|
||||
|
||||
std::vector<llm_arch> llm_arch_all() {
|
||||
std::vector<llm_arch> ret;
|
||||
ret.reserve(LLM_ARCH_NAMES.size());
|
||||
for (const auto & [arch, _] : LLM_ARCH_NAMES) {
|
||||
ret.push_back(arch);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
const char * llm_arch_name(llm_arch arch) {
|
||||
auto it = LLM_ARCH_NAMES.find(arch);
|
||||
if (it == LLM_ARCH_NAMES.end()) {
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
//
|
||||
// gguf constants (sync with gguf.py)
|
||||
|
|
@ -608,6 +609,8 @@ struct llm_tensor_info {
|
|||
ggml_op op;
|
||||
};
|
||||
|
||||
std::vector<llm_arch> llm_arch_all();
|
||||
|
||||
const char * llm_arch_name(llm_arch arch);
|
||||
|
||||
llm_arch llm_arch_from_string(const std::string & name);
|
||||
|
|
|
|||
|
|
@ -1158,6 +1158,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
|
|||
{
|
||||
//const auto t_start_us = ggml_time_us();
|
||||
|
||||
// FIXME this call causes a crash if any model inputs were not used in the graph and were therefore not allocated
|
||||
res->set_inputs(&ubatch);
|
||||
|
||||
//LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
|
||||
|
|
|
|||
|
|
@ -509,6 +509,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
|||
float * data = (float *) cross_kq_mask->data;
|
||||
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
GGML_ASSERT(!cross->seq_ids_enc.empty() && "llama_encode must be called first");
|
||||
for (int j = 0; j < n_enc; ++j) {
|
||||
float f = -INFINITY;
|
||||
|
||||
|
|
@ -1150,6 +1151,7 @@ ggml_tensor * llm_graph_context::build_ffn(
|
|||
return cur;
|
||||
}
|
||||
|
||||
// TODO remove redundant scale_w argument
|
||||
ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * gate_inp,
|
||||
|
|
@ -1607,6 +1609,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
|
|||
// this needs to be 1x1xN for broadcasting
|
||||
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
|
||||
ggml_set_input(cur);
|
||||
ggml_set_name(cur, "attn_scale");
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
|
|
|
|||
|
|
@ -1,12 +1,17 @@
|
|||
#include "llama-model-loader.h"
|
||||
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
#include "llama-hparams.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cinttypes>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <future>
|
||||
#include <regex>
|
||||
|
||||
static const size_t kiB = 1024;
|
||||
static const size_t MiB = 1024*kiB;
|
||||
|
|
@ -263,7 +268,7 @@ namespace GGUFMeta {
|
|||
template<typename T>
|
||||
typename std::enable_if<std::is_integral<T>::value, bool>::type
|
||||
llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
|
||||
const int kid = gguf_find_key(meta.get(), key.c_str());
|
||||
const int kid = gguf_find_key(metadata, key.c_str());
|
||||
|
||||
if (kid < 0) {
|
||||
if (required) {
|
||||
|
|
@ -273,7 +278,7 @@ namespace GGUFMeta {
|
|||
}
|
||||
|
||||
struct GGUFMeta::ArrayInfo arr_info =
|
||||
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
|
||||
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
|
||||
|
||||
|
||||
result = arr_info.length;
|
||||
|
|
@ -290,7 +295,7 @@ namespace GGUFMeta {
|
|||
|
||||
template<typename T>
|
||||
bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
|
||||
const gguf_context * ctx = meta.get();
|
||||
const gguf_context * ctx = metadata;
|
||||
const int kid = gguf_find_key(ctx, key.c_str());
|
||||
|
||||
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
|
||||
|
|
@ -331,7 +336,7 @@ namespace GGUFMeta {
|
|||
|
||||
template<typename T, size_t N_MAX>
|
||||
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
|
||||
const gguf_context * ctx = meta.get();
|
||||
const gguf_context * ctx = metadata;
|
||||
const int kid = gguf_find_key(ctx, key.c_str());
|
||||
|
||||
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
|
||||
|
|
@ -393,7 +398,7 @@ namespace GGUFMeta {
|
|||
const struct llama_model_kv_override * override =
|
||||
it != kv_overrides.end() ? &it->second : nullptr;
|
||||
|
||||
const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);
|
||||
const bool found = GGUFMeta::GKV<T>::set(metadata, key, result, override);
|
||||
|
||||
if (required && !found) {
|
||||
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
|
||||
|
|
@ -427,7 +432,7 @@ namespace GGUFMeta {
|
|||
// get array of n <= N_MAX elements, or a single element repeated n times
|
||||
template<typename T, size_t N_MAX>
|
||||
bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
|
||||
const int kid = gguf_find_key(meta.get(), key.c_str());
|
||||
const int kid = gguf_find_key(metadata, key.c_str());
|
||||
|
||||
if (kid < 0) {
|
||||
if (required) {
|
||||
|
|
@ -440,9 +445,9 @@ namespace GGUFMeta {
|
|||
throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
|
||||
}
|
||||
|
||||
if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) {
|
||||
if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) {
|
||||
struct GGUFMeta::ArrayInfo arr_info =
|
||||
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
|
||||
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
|
||||
|
||||
if (n != arr_info.length) {
|
||||
throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
|
||||
|
|
@ -473,7 +478,7 @@ namespace GGUFMeta {
|
|||
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
|
||||
const std::string key = llm_kv(kid);
|
||||
|
||||
const int id = gguf_find_key(meta.get(), key.c_str());
|
||||
const int id = gguf_find_key(metadata, key.c_str());
|
||||
|
||||
if (id < 0) {
|
||||
if (required) {
|
||||
|
|
@ -483,7 +488,7 @@ namespace GGUFMeta {
|
|||
}
|
||||
|
||||
// throw an error if type is an array
|
||||
if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
|
||||
if (gguf_get_kv_type(metadata, id) == GGUF_TYPE_ARRAY) {
|
||||
if (required) {
|
||||
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
|
||||
}
|
||||
|
|
@ -500,6 +505,9 @@ namespace GGUFMeta {
|
|||
|
||||
|
||||
llama_model_loader::llama_model_loader(
|
||||
struct gguf_context * meta,
|
||||
llama_model_set_tensor_data_t set_tensor_data,
|
||||
void * set_tensor_data_ud,
|
||||
const std::string & fname,
|
||||
std::vector<std::string> & splits,
|
||||
bool use_mmap,
|
||||
|
|
@ -507,7 +515,8 @@ llama_model_loader::llama_model_loader(
|
|||
bool check_tensors,
|
||||
bool no_alloc,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
|
||||
: metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) {
|
||||
int trace = 0;
|
||||
if (getenv("LLAMA_TRACE")) {
|
||||
trace = atoi(getenv("LLAMA_TRACE"));
|
||||
|
|
@ -521,136 +530,142 @@ llama_model_loader::llama_model_loader(
|
|||
|
||||
tensor_buft_overrides = param_tensor_buft_overrides_p;
|
||||
|
||||
// Load the main GGUF
|
||||
struct ggml_context * ctx = NULL;
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx,
|
||||
};
|
||||
if (!fname.empty()) {
|
||||
// Load the main GGUF
|
||||
struct ggml_context * ctx = NULL;
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx,
|
||||
};
|
||||
|
||||
meta.reset(gguf_init_from_file(fname.c_str(), params));
|
||||
if (!meta) {
|
||||
throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
|
||||
}
|
||||
|
||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
if (use_mmap && use_direct_io) {
|
||||
if (files.back()->has_direct_io()) {
|
||||
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
||||
use_mmap = false;
|
||||
} else {
|
||||
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
|
||||
use_direct_io = false;
|
||||
|
||||
// reopen file using std::fopen for mmap
|
||||
files.pop_back();
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
|
||||
}
|
||||
}
|
||||
|
||||
// Save tensors data offset of the main file.
|
||||
// For subsidiary files, `meta` tensor data offset must not be used,
|
||||
// so we build a unified tensors index for weights.
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
std::string tensor_name = std::string(cur->name);
|
||||
// make sure there is no duplicated tensor names
|
||||
if (weights_map.find(tensor_name) != weights_map.end()) {
|
||||
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
||||
}
|
||||
n_elements += ggml_nelements(cur);
|
||||
n_bytes += ggml_nbytes(cur);
|
||||
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
|
||||
}
|
||||
uint16_t n_split = 0;
|
||||
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
|
||||
|
||||
// Load additional GGML contexts
|
||||
if (n_split > 1) {
|
||||
// make sure the main file is loaded first
|
||||
uint16_t idx = 0;
|
||||
const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
|
||||
get_key(kv_split_no, idx);
|
||||
if (idx != 0) {
|
||||
throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
|
||||
metadata_ptr.reset(gguf_init_from_file(fname.c_str(), params));
|
||||
metadata = metadata_ptr.get();
|
||||
if (metadata == nullptr) {
|
||||
throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
|
||||
}
|
||||
|
||||
// generate list of splits if needed
|
||||
if (splits.empty()) {
|
||||
splits = llama_get_list_splits(fname, idx, n_split);
|
||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
if (use_mmap && use_direct_io) {
|
||||
if (files.back()->has_direct_io()) {
|
||||
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
||||
use_mmap = false;
|
||||
} else {
|
||||
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
|
||||
use_direct_io = false;
|
||||
|
||||
// reopen file using std::fopen for mmap
|
||||
files.pop_back();
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
|
||||
}
|
||||
}
|
||||
|
||||
// in case the user gives a custom list of splits, check if it matches the expected number
|
||||
if (n_split != (uint16_t)splits.size()) {
|
||||
throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
|
||||
// Save tensors data offset of the main file.
|
||||
// For subsidiary files, `meta` tensor data offset must not be used,
|
||||
// so we build a unified tensors index for weights.
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
std::string tensor_name = std::string(cur->name);
|
||||
// make sure there is no duplicated tensor names
|
||||
if (weights_map.find(tensor_name) != weights_map.end()) {
|
||||
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
||||
}
|
||||
n_elements += ggml_nelements(cur);
|
||||
n_bytes += ggml_nbytes(cur);
|
||||
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
|
||||
}
|
||||
uint16_t n_split = 0;
|
||||
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
|
||||
|
||||
if (trace > 0) {
|
||||
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
|
||||
}
|
||||
|
||||
// load other splits
|
||||
for (idx = 1; idx < n_split; idx++) {
|
||||
const char * fname_split = splits[idx].c_str();
|
||||
|
||||
struct gguf_init_params split_params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx,
|
||||
};
|
||||
gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
|
||||
if (!ctx_gguf) {
|
||||
throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
|
||||
// Load additional GGML contexts
|
||||
if (n_split > 1) {
|
||||
// make sure the main file is loaded first
|
||||
uint16_t idx = 0;
|
||||
const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
|
||||
get_key(kv_split_no, idx);
|
||||
if (idx != 0) {
|
||||
throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
|
||||
}
|
||||
|
||||
// check idx
|
||||
// generate list of splits if needed
|
||||
if (splits.empty()) {
|
||||
splits = llama_get_list_splits(fname, idx, n_split);
|
||||
}
|
||||
|
||||
// in case the user gives a custom list of splits, check if it matches the expected number
|
||||
if (n_split != (uint16_t)splits.size()) {
|
||||
throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
|
||||
}
|
||||
|
||||
if (trace > 0) {
|
||||
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
|
||||
}
|
||||
|
||||
// load other splits
|
||||
for (idx = 1; idx < n_split; idx++) {
|
||||
const char * fname_split = splits[idx].c_str();
|
||||
|
||||
struct gguf_init_params split_params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx,
|
||||
};
|
||||
gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
|
||||
if (!ctx_gguf) {
|
||||
throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
|
||||
}
|
||||
|
||||
// check idx
|
||||
{
|
||||
const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
|
||||
if (kid < 0) {
|
||||
throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
|
||||
}
|
||||
int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
|
||||
if (idx_gguf != idx) {
|
||||
throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
|
||||
}
|
||||
}
|
||||
|
||||
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
// Save tensors data offset info of the shard.
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
std::string tensor_name = std::string(cur->name);
|
||||
// make sure there is no duplicated tensor names
|
||||
if (weights_map.find(tensor_name) != weights_map.end()) {
|
||||
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
||||
}
|
||||
n_elements += ggml_nelements(cur);
|
||||
n_bytes += ggml_nbytes(cur);
|
||||
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
|
||||
}
|
||||
}
|
||||
|
||||
get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
|
||||
|
||||
// sanity check
|
||||
{
|
||||
const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
|
||||
if (kid < 0) {
|
||||
throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
|
||||
}
|
||||
int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
|
||||
if (idx_gguf != idx) {
|
||||
throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
|
||||
const int n_tensors_loaded = (int) weights_map.size();
|
||||
if (n_tensors != n_tensors_loaded) {
|
||||
throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
|
||||
}
|
||||
}
|
||||
|
||||
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
// Save tensors data offset info of the shard.
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
std::string tensor_name = std::string(cur->name);
|
||||
// make sure there is no duplicated tensor names
|
||||
if (weights_map.find(tensor_name) != weights_map.end()) {
|
||||
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
||||
}
|
||||
n_elements += ggml_nelements(cur);
|
||||
n_bytes += ggml_nbytes(cur);
|
||||
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
|
||||
}
|
||||
|
||||
get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
|
||||
|
||||
// sanity check
|
||||
{
|
||||
const int n_tensors_loaded = (int) weights_map.size();
|
||||
if (n_tensors != n_tensors_loaded) {
|
||||
throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
|
||||
}
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
|
||||
} else {
|
||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
}
|
||||
|
||||
n_kv = gguf_get_n_kv(meta.get());
|
||||
n_kv = gguf_get_n_kv(metadata);
|
||||
n_tensors = weights_map.size();
|
||||
|
||||
fver = (enum llama_fver) gguf_get_version(meta.get());
|
||||
fver = (enum llama_fver) gguf_get_version(metadata);
|
||||
|
||||
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
||||
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
|
||||
|
|
@ -729,14 +744,14 @@ llama_model_loader::llama_model_loader(
|
|||
LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
||||
|
||||
for (int i = 0; i < n_kv; i++) {
|
||||
const char * name = gguf_get_key(meta.get(), i);
|
||||
const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
|
||||
const char * name = gguf_get_key(metadata, i);
|
||||
const enum gguf_type type = gguf_get_kv_type(metadata, i);
|
||||
const std::string type_name =
|
||||
type == GGUF_TYPE_ARRAY
|
||||
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
|
||||
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(metadata, i)), gguf_get_arr_n(metadata, i))
|
||||
: gguf_type_name(type);
|
||||
|
||||
std::string value = gguf_kv_to_str(meta.get(), i);
|
||||
std::string value = gguf_kv_to_str(metadata, i);
|
||||
const size_t MAX_VALUE_LEN = 40;
|
||||
if (value.size() > MAX_VALUE_LEN) {
|
||||
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
|
||||
|
|
@ -838,15 +853,382 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
|
|||
return cur;
|
||||
}
|
||||
|
||||
struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
|
||||
LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
|
||||
const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
|
||||
// checks if the weight tensor can be used with the specified buffer type and device
|
||||
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
|
||||
GGML_ASSERT(w != nullptr);
|
||||
|
||||
if (op == GGML_OP_NONE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ ggml_tensor_overhead()*8,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_context_ptr ctx_ptr { ggml_init(params) };
|
||||
if (!ctx_ptr) {
|
||||
throw std::runtime_error(format("failed to create ggml context"));
|
||||
}
|
||||
ggml_context * ctx = ctx_ptr.get();
|
||||
|
||||
ggml_tensor * op_tensor = nullptr;
|
||||
|
||||
switch (op) {
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
|
||||
op_tensor = ggml_get_rows(ctx, w, b);
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
|
||||
op_tensor = ggml_mul_mat(ctx, w, b);
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
const int n_expert_used = hparams.n_expert_used;
|
||||
GGML_ASSERT(n_expert_used > 0);
|
||||
ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
|
||||
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
|
||||
op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
|
||||
} break;
|
||||
case GGML_OP_ADD:
|
||||
{
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
||||
op_tensor = ggml_add(ctx, a, w);
|
||||
} break;
|
||||
case GGML_OP_ADD_ID:
|
||||
{
|
||||
const int n_expert_used = hparams.n_expert_used;
|
||||
GGML_ASSERT(n_expert_used > 0);
|
||||
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
|
||||
ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
|
||||
op_tensor = ggml_add_id(ctx, a, w, c);
|
||||
} break;
|
||||
case GGML_OP_MUL:
|
||||
{
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
||||
op_tensor = ggml_mul(ctx, a, w);
|
||||
} break;
|
||||
case GGML_OP_DIV:
|
||||
{
|
||||
ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
|
||||
op_tensor = ggml_div(ctx, a, w);
|
||||
} break;
|
||||
case GGML_OP_ROPE:
|
||||
{
|
||||
const int n_embd_head = hparams.n_embd_head_v;
|
||||
const int n_head = hparams.n_head();
|
||||
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
|
||||
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
|
||||
op_tensor = ggml_rope_ext(
|
||||
ctx, a, b, w,
|
||||
0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0
|
||||
);
|
||||
|
||||
} break;
|
||||
case GGML_OP_SSM_CONV:
|
||||
{
|
||||
const int64_t n_seq_tokens = 512;
|
||||
const int64_t n_seqs = 3;
|
||||
ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
|
||||
op_tensor = ggml_ssm_conv(ctx, conv_x, w);
|
||||
} break;
|
||||
case GGML_OP_SSM_SCAN:
|
||||
{
|
||||
// w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
|
||||
const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
|
||||
const int64_t n_head = w->ne[1];
|
||||
const int64_t head_dim = hparams.ssm_d_inner / n_head;
|
||||
const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
|
||||
const int64_t n_seq_tokens = 512;
|
||||
const int64_t n_seqs = 3;
|
||||
ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
|
||||
ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
|
||||
op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
|
||||
} break;
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
{
|
||||
// FIXME
|
||||
const int64_t S = 123;
|
||||
const int64_t H = 123;
|
||||
const int64_t n_tokens = 123;
|
||||
const int64_t n_seqs = 123;
|
||||
ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
||||
ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
||||
ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
||||
ggml_tensor * tf = w;
|
||||
ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
||||
ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
|
||||
op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
|
||||
} break;
|
||||
case GGML_OP_IM2COL:
|
||||
{
|
||||
const int n_embd_inp = hparams.n_embd_inp();
|
||||
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
|
||||
op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
|
||||
} break;
|
||||
case GGML_OP_SCALE:
|
||||
{
|
||||
op_tensor = ggml_scale(ctx, w, 1.0f);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
|
||||
}
|
||||
|
||||
// create a temporary dummy buffer for the weight so that supports_op can check the buffer type
|
||||
GGML_ASSERT(w->buffer == nullptr);
|
||||
w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
|
||||
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
|
||||
ggml_backend_buffer_free(w->buffer);
|
||||
w->buffer = nullptr;
|
||||
|
||||
return op_supported;
|
||||
}
|
||||
|
||||
// find the first buffer type in the list that can use the tensor
|
||||
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t * buft_list) {
|
||||
GGML_ASSERT(!buft_list->empty());
|
||||
for (const auto & cur : *buft_list) {
|
||||
ggml_backend_dev_t cur_dev = cur.first;
|
||||
ggml_backend_buffer_type_t cur_buft = cur.second;
|
||||
if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
|
||||
return cur_buft;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
struct ggml_tensor * llama_model_loader::create_tensor(
|
||||
const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
|
||||
const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) {
|
||||
// Returns the ggml context associated with `buft` in ctx_map, creating and registering it on
// first use (one ggml context per buffer type). Throws std::runtime_error if ggml_init() fails.
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
    const auto found = ctx_map.find(buft);
    if (found != ctx_map.end()) {
        return found->second.get();
    }

    // upper bound on how many tensors a single context may have to describe
    int tensor_budget = n_tensors;
    tensor_budget += 1; // duplicated output tensor
    tensor_budget += hparams.n_layer*2; // duplicated rope freq tensors
    if (files.empty()) {
        tensor_budget += hparams.n_layer*256; // this should be well above what any model actually uses
    }
    const size_t mem_needed = ggml_tensor_overhead()*tensor_budget;

    ggml_init_params init_params = {
        /*.mem_size   =*/ mem_needed,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    ggml_context * fresh_ctx = ggml_init(init_params);
    if (fresh_ctx == nullptr) {
        throw std::runtime_error(format("failed to create ggml context"));
    }

    ctx_map.emplace(buft, fresh_ctx);
    return fresh_ctx;
};
|
||||
|
||||
// Chooses the backend buffer type that will hold the weight described by `t_meta`.
//
// Returns nullptr when `t_meta` is null and TENSOR_NOT_REQUIRED is set, or when the tensor is
// skipped (its info has op GGML_OP_NONE, or TENSOR_SKIP is set). The skip path also subtracts the
// tensor's bytes from size_data and counts the tensor in n_created.
// Throws std::runtime_error when a required tensor is missing, when there is no tensor info
// mapping, when no compatible buffer type is found, or when the mmap fallback finds no CPU device.
// Updates n_tensors_moved / first_*_moved whenever the chosen buffer type is not the head of the
// list selected for the tensor's layer.
auto buft_for_tensor = [&](ggml_tensor * t_meta) -> ggml_backend_buffer_type_t {
    if (!t_meta) {
        if (flags & TENSOR_NOT_REQUIRED) {
            return nullptr;
        }
        throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
    }

    // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
    // the tensor is duplicated
    // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
    llm_tensor tn_tensor = tn.tensor;
    if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && (flags & TENSOR_DUPLICATED)) {
        tn_tensor = LLM_TENSOR_OUTPUT;
    }

    llm_tensor_info info;
    try {
        info = llm_tensor_info_for(tn_tensor);
    } catch (const std::out_of_range &) {
        throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
    }

    // skip unused tensors
    if (info.op == GGML_OP_NONE || (flags & TENSOR_SKIP)) {
        const size_t nbytes = ggml_nbytes(t_meta);
        LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);

        size_data -= nbytes;
        n_created++;

        return nullptr;
    }

    // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
    ggml_op op;
    bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
    if (bias) {
        if (info.op == GGML_OP_MUL_MAT_ID) {
            op = GGML_OP_ADD_ID;
        } else {
            op = GGML_OP_ADD;
        }
    } else {
        op = info.op;
    }

    // sanity checks
    if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
        if (tn.bid != -1) {
            GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
        }
    } else {
        if (tn.bid == -1) {
            GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
        }
    }

    // select the buffer type for this tensor
    const buft_list_t * buft_list;
    switch (info.layer) {
        case LLM_TENSOR_LAYER_INPUT:
            buft_list = buft_list_input;
            break;
        case LLM_TENSOR_LAYER_OUTPUT:
            buft_list = buft_list_output;
            break;
        case LLM_TENSOR_LAYER_REPEATING:
            GGML_ASSERT(buft_list_layer != nullptr);
            buft_list = buft_list_layer;
            break;
        default:
            GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
    }

    ggml_backend_buffer_type_t buft = nullptr;

    // check overrides
    if (tensor_buft_overrides) {
        std::string tensor_name = tn.str();
        for (const auto * overrides = tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
            std::regex pattern(overrides->pattern);
            if (std::regex_search(tensor_name, pattern)) {
                if (overrides->buft == ggml_backend_cpu_buffer_type()) {
                    // when overriding to a CPU buffer, consider the extra buffer types
                    buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
                } else {
                    buft = overrides->buft;
                }

                // select_weight_buft() may come back empty-handed; in that case leave buft null so
                // the regular selection below takes over, and do not hand a null buffer type to
                // ggml_backend_buft_name()
                if (buft) {
                    LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                            tensor_name.c_str(),
                            ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
                            ggml_backend_buft_name(buft));
                }
                break;
            }
        }
    }

    if (!buft) {
        buft = select_weight_buft(hparams, t_meta, op, buft_list);
        if (!buft) {
            throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
        }
    }

    // avoid using a host buffer when using mmap
    auto * buft_dev = ggml_backend_buft_get_device(buft);
    if (use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (!cpu_dev) {
            throw std::runtime_error("no CPU backend found");
        }
        buft = ggml_backend_dev_buffer_type(cpu_dev);
    }

    // remember the first tensor that did not end up in the preferred (front) buffer type of its list
    if (buft != buft_list->front().second) {
        if (n_tensors_moved == 0) {
            first_tensor_moved_name = t_meta->name;
            first_tensor_moved_type_name = ggml_type_name(t_meta->type);
            first_moved_from_buft = buft_list->front().second;
            first_moved_to_buft = buft;
        }
        n_tensors_moved++;
    }

    return buft;
};
|
||||
|
||||
// No model files are attached to the loader: build the tensor from the requested shape alone
// instead of from file metadata.
if (files.empty()) {
    if (flags & TENSOR_SKIP_IF_VIRTUAL) {
        return nullptr;
    }

    const std::string tensor_name = tn.str();

    // default to F32 unless the metadata names a type for this tensor
    ggml_type type = GGML_TYPE_F32;
    const int64_t tid = gguf_find_tensor(metadata, tensor_name.c_str());
    if (tid != -1) {
        type = gguf_get_tensor_type(metadata, tid);
    }

    // for tensors that are not required some of the dimensions can be invalid:
    if (flags & TENSOR_NOT_REQUIRED) {
        for (size_t dim = 0; dim < ne.size(); dim++) {
            if (ne.begin()[dim] <= 0) {
                return nullptr;
            }
        }
    }

    // stack-allocated stand-in that only carries type, shape, strides and name for buffer type selection
    ggml_tensor t_meta;
    memset(&t_meta, 0, sizeof(ggml_tensor));
    t_meta.type = type;
    for (size_t dim = 0; dim < GGML_MAX_DIMS; dim++) {
        t_meta.ne[dim] = dim < ne.size() ? ne.begin()[dim] : 1;
        GGML_ASSERT(t_meta.ne[dim] >= 1);
    }

    // contiguous strides, laid out the way ggml does for freshly created tensors:
    // nb[0] is the size of one block, so a row of ne[0] elements spans ne[0]/blck_size blocks
    // (for non-quantized types the block size is 1 and this reduces to ne[0]*nb[0])
    t_meta.nb[0] = ggml_type_size(type);
    t_meta.nb[1] = t_meta.nb[0]*(t_meta.ne[0]/ggml_blck_size(type));
    for (size_t dim = 2; dim < GGML_MAX_DIMS; dim++) {
        t_meta.nb[dim] = t_meta.nb[dim-1]*t_meta.ne[dim-1];
    }
    for (size_t dim = 0; dim < GGML_MAX_DIMS; dim++) {
        GGML_ASSERT(t_meta.nb[dim] >= 1);
    }
    ggml_set_name(&t_meta, tensor_name.c_str());

    ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta);
    GGML_ASSERT(buft != nullptr);
    ggml_context * ctx = ctx_for_buft(buft);
    ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta);
    ggml_set_name(ret, tensor_name.c_str());
    return ret;
}
|
||||
|
||||
ggml_tensor * t_meta = get_tensor_meta(tn.str().c_str());
|
||||
ggml_backend_buffer_type_t buft = buft_for_tensor(t_meta);
|
||||
if (buft == nullptr) {
|
||||
return nullptr; // return type is ggml_tensor *
|
||||
}
|
||||
ggml_context * ctx = ctx_for_buft(buft);
|
||||
|
||||
// if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
|
||||
if (flags & TENSOR_DUPLICATED) {
|
||||
ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
|
||||
if (t) {
|
||||
return t;
|
||||
}
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, tn.str().c_str());
|
||||
const struct ggml_tensor * cur = check_tensor_dims(tn.str(), ne, !(flags & TENSOR_NOT_REQUIRED));
|
||||
|
||||
if (cur == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bool duplicated = flags & TENSOR_DUPLICATED;
|
||||
const bool duplicated = flags & TENSOR_DUPLICATED;
|
||||
|
||||
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
|
||||
ggml_set_name(tensor, ggml_get_name(cur));
|
||||
|
|
@ -858,7 +1240,6 @@ struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx
|
|||
}
|
||||
|
||||
return tensor;
|
||||
|
||||
}
|
||||
|
||||
struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
|
||||
|
|
@ -893,6 +1274,11 @@ void llama_model_loader::done_getting_tensors() const {
|
|||
if (n_created != n_tensors) {
|
||||
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
|
||||
}
|
||||
if (n_tensors_moved > 0) {
|
||||
LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n",
|
||||
__func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1,
|
||||
ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
|
||||
}
|
||||
}
|
||||
|
||||
void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
|
||||
|
|
@ -974,6 +1360,12 @@ bool llama_model_loader::load_all_data(
|
|||
llama_mlocks * lmlocks,
|
||||
llama_progress_callback progress_callback,
|
||||
void * progress_callback_user_data) {
|
||||
if (files.empty()) {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
set_tensor_data(t, set_tensor_data_ud);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
|
||||
|
||||
std::vector<no_init<uint8_t>> read_buf;
|
||||
|
|
|
|||
|
|
@ -4,17 +4,22 @@
|
|||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-arch.h"
|
||||
#include "llama-hparams.h"
|
||||
#include "llama-mmap.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <stdexcept>
|
||||
#include <unordered_map>
|
||||
|
||||
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
|
||||
|
||||
// lists of buffer types used for each layer
|
||||
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
|
||||
|
||||
enum llama_fver {
|
||||
GGUF_FILE_VERSION_V1 = 1,
|
||||
GGUF_FILE_VERSION_V2 = 2,
|
||||
|
|
@ -58,9 +63,10 @@ struct llama_model_loader {
|
|||
}
|
||||
};
|
||||
|
||||
static const int TENSOR_NOT_REQUIRED = 1 << 0;
|
||||
static const int TENSOR_DUPLICATED = 1 << 1;
|
||||
static const int TENSOR_SKIP = 1 << 2;
|
||||
static const int TENSOR_NOT_REQUIRED = 1 << 0;
|
||||
static const int TENSOR_DUPLICATED = 1 << 1;
|
||||
static const int TENSOR_SKIP = 1 << 2;
|
||||
static const int TENSOR_SKIP_IF_VIRTUAL = 1 << 3;
|
||||
|
||||
int n_kv = 0;
|
||||
int n_tensors = 0;
|
||||
|
|
@ -84,7 +90,10 @@ struct llama_model_loader {
|
|||
std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
|
||||
const llama_model_tensor_buft_override * tensor_buft_overrides;
|
||||
|
||||
gguf_context_ptr meta;
|
||||
gguf_context_ptr metadata_ptr;
|
||||
struct gguf_context * metadata; // either metadata_ptr.get() or externally set
|
||||
llama_model_set_tensor_data_t set_tensor_data;
|
||||
void * set_tensor_data_ud;
|
||||
std::vector<ggml_context_ptr> contexts;
|
||||
|
||||
std::string arch_name;
|
||||
|
|
@ -94,7 +103,26 @@ struct llama_model_loader {
|
|||
size_t size_data = 0;
|
||||
std::vector<std::pair<size_t, size_t>> mmaps_used;
|
||||
|
||||
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
|
||||
struct ggml_backend_buft_comparator {
|
||||
bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
|
||||
return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
|
||||
}
|
||||
};
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
|
||||
|
||||
// track tensors that had to be moved for debugging:
|
||||
size_t n_tensors_moved = 0;
|
||||
std::string first_tensor_moved_name;
|
||||
std::string first_tensor_moved_type_name;
|
||||
ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
|
||||
ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
|
||||
|
||||
llama_model_loader(
|
||||
struct gguf_context * metadata,
|
||||
llama_model_set_tensor_data_t set_tensor_data,
|
||||
void * set_tensor_data_ud,
|
||||
const std::string & fname,
|
||||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||
bool use_mmap,
|
||||
|
|
@ -149,7 +177,9 @@ struct llama_model_loader {
|
|||
|
||||
const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;
|
||||
|
||||
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);
|
||||
struct ggml_tensor * create_tensor(
|
||||
const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
|
||||
const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);
|
||||
|
||||
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
|
||||
|
||||
|
|
|
|||
|
|
@ -7,14 +7,19 @@
|
|||
#include "llama-model.h"
|
||||
#include "llama-vocab.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
|
||||
gguf_ctx = gguf_init_empty();
|
||||
}
|
||||
// Saver bound to an existing model: creates a fresh, empty GGUF context and marks it as owned.
// `model` is dereferenced here to read its architecture.
llama_model_saver::llama_model_saver(const struct llama_model * model)
    : gguf_ctx(gguf_init_empty()),
      gguf_ctx_owned(true),
      model(model),
      llm_kv(model->arch) {
}
|
||||
|
||||
// Saver without a model, keyed only by architecture. If the caller passes a GGUF context it is
// used as-is and not owned; if the caller passes nullptr an empty context is created and owned.
llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx)
    : gguf_ctx(gguf_ctx != nullptr ? gguf_ctx : gguf_init_empty()),
      gguf_ctx_owned(gguf_ctx == nullptr), // refers to the parameter, not the member
      model(nullptr),
      llm_kv(arch) {
}
|
||||
|
||||
// Frees the GGUF context only when this saver owns it (gguf_ctx_owned is true).
// A context that is not owned is left untouched, and an owned one is freed exactly once.
llama_model_saver::~llama_model_saver() {
    if (gguf_ctx_owned) {
        gguf_free(gguf_ctx);
    }
}
|
||||
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
|
||||
|
|
@ -46,7 +51,8 @@ void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
|
|||
|
||||
template <typename Container>
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
|
||||
const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
|
||||
GGML_ASSERT(model != nullptr || !per_layer);
|
||||
const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size();
|
||||
GGML_ASSERT(n_values <= value.size());
|
||||
|
||||
if (n_values == 0) {
|
||||
|
|
@ -83,6 +89,8 @@ void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, c
|
|||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
// instantiate for external usage:
|
||||
template void llama_model_saver::add_kv<std::vector<uint32_t>>(const enum llm_kv, const std::vector<uint32_t> &, const bool);
|
||||
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
|
||||
std::vector<const char *> tmp(value.size());
|
||||
|
|
@ -104,37 +112,39 @@ void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
|
|||
}
|
||||
|
||||
void llama_model_saver::add_kv_from_model() {
|
||||
const llama_hparams & hparams = model.hparams;
|
||||
const llama_vocab & vocab = model.vocab;
|
||||
const llama_hparams & hparams = model->hparams;
|
||||
const llama_vocab & vocab = model->vocab;
|
||||
|
||||
const int32_t n_vocab = vocab.n_tokens();
|
||||
std::vector<std::string> tokens(n_vocab);
|
||||
std::vector<float> scores(n_vocab);
|
||||
std::vector<int32_t> token_types(n_vocab);
|
||||
|
||||
for (int32_t id = 0; id < n_vocab; ++id) {
|
||||
const llama_vocab::token_data & token_data = vocab.get_token_data(id);
|
||||
if (vocab.get_type() != LLAMA_VOCAB_TYPE_NONE) {
|
||||
for (int32_t id = 0; id < n_vocab; ++id) {
|
||||
const llama_vocab::token_data & token_data = vocab.get_token_data(id);
|
||||
|
||||
tokens[id] = token_data.text;
|
||||
scores[id] = token_data.score;
|
||||
tokens[id] = token_data.text;
|
||||
scores[id] = token_data.score;
|
||||
|
||||
switch(token_data.attr) {
|
||||
case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
|
||||
case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
|
||||
case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
|
||||
case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
|
||||
case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
|
||||
case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
|
||||
case LLAMA_TOKEN_ATTR_UNDEFINED:
|
||||
default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
|
||||
switch(token_data.attr) {
|
||||
case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
|
||||
case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
|
||||
case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
|
||||
case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
|
||||
case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
|
||||
case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
|
||||
case LLAMA_TOKEN_ATTR_UNDEFINED:
|
||||
default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// add_kv(LLM_KV_GENERAL_TYPE, ???);
|
||||
add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
|
||||
add_kv(LLM_KV_GENERAL_ARCHITECTURE, model->arch_name());
|
||||
// add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
|
||||
// add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
|
||||
add_kv(LLM_KV_GENERAL_NAME, model.name);
|
||||
add_kv(LLM_KV_GENERAL_NAME, model->name);
|
||||
// add_kv(LLM_KV_GENERAL_AUTHOR, ???);
|
||||
// add_kv(LLM_KV_GENERAL_VERSION, ???);
|
||||
// add_kv(LLM_KV_GENERAL_URL, ???);
|
||||
|
|
@ -255,25 +265,25 @@ void llama_model_saver::add_kv_from_model() {
|
|||
}
|
||||
|
||||
void llama_model_saver::add_tensors_from_model() {
|
||||
if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
|
||||
add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
|
||||
if (std::string(model->output->name) != std::string(model->tok_embd->name)) {
|
||||
add_tensor(model->tok_embd); // some models use the same tensor for tok_embd and output
|
||||
}
|
||||
add_tensor(model.type_embd);
|
||||
add_tensor(model.pos_embd);
|
||||
add_tensor(model.tok_norm);
|
||||
add_tensor(model.tok_norm_b);
|
||||
add_tensor(model.output_norm);
|
||||
add_tensor(model.output_norm_b);
|
||||
add_tensor(model.output);
|
||||
add_tensor(model.output_b);
|
||||
add_tensor(model.output_norm_enc);
|
||||
add_tensor(model.cls);
|
||||
add_tensor(model.cls_b);
|
||||
add_tensor(model.cls_out);
|
||||
add_tensor(model.cls_out_b);
|
||||
add_tensor(model.cls_norm);
|
||||
add_tensor(model->type_embd);
|
||||
add_tensor(model->pos_embd);
|
||||
add_tensor(model->tok_norm);
|
||||
add_tensor(model->tok_norm_b);
|
||||
add_tensor(model->output_norm);
|
||||
add_tensor(model->output_norm_b);
|
||||
add_tensor(model->output);
|
||||
add_tensor(model->output_b);
|
||||
add_tensor(model->output_norm_enc);
|
||||
add_tensor(model->cls);
|
||||
add_tensor(model->cls_b);
|
||||
add_tensor(model->cls_out);
|
||||
add_tensor(model->cls_out_b);
|
||||
add_tensor(model->cls_norm);
|
||||
|
||||
for (const struct llama_layer & layer : model.layers) {
|
||||
for (const struct llama_layer & layer : model->layers) {
|
||||
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
|
||||
add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include "gguf.h"
|
||||
#include "llama.h"
|
||||
#include "llama-arch.h"
|
||||
|
||||
|
|
@ -7,10 +8,12 @@
|
|||
|
||||
struct llama_model_saver {
|
||||
struct gguf_context * gguf_ctx = nullptr;
|
||||
const struct llama_model & model;
|
||||
const bool gguf_ctx_owned;
|
||||
const struct llama_model * model;
|
||||
const struct LLM_KV llm_kv;
|
||||
|
||||
llama_model_saver(const struct llama_model & model);
|
||||
llama_model_saver(const struct llama_model * model);
|
||||
llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx);
|
||||
~llama_model_saver();
|
||||
|
||||
void add_kv(enum llm_kv key, uint32_t value);
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#include "llama-model.h"
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-mmap.h"
|
||||
#include "llama-cparams.h"
|
||||
|
|
@ -18,6 +19,7 @@
|
|||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cfloat>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
|
|
@ -177,160 +179,6 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st
|
|||
return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
}
|
||||
|
||||
// checks if the weight tensor can be used with the specified buffer type and device
|
||||
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
|
||||
GGML_ASSERT(w != nullptr);
|
||||
|
||||
if (op == GGML_OP_NONE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ ggml_tensor_overhead()*8,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_context_ptr ctx_ptr { ggml_init(params) };
|
||||
if (!ctx_ptr) {
|
||||
throw std::runtime_error(format("failed to create ggml context"));
|
||||
}
|
||||
ggml_context * ctx = ctx_ptr.get();
|
||||
|
||||
ggml_tensor * op_tensor = nullptr;
|
||||
|
||||
switch (op) {
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
|
||||
op_tensor = ggml_get_rows(ctx, w, b);
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
|
||||
op_tensor = ggml_mul_mat(ctx, w, b);
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
int n_expert_used = hparams.n_expert_used;
|
||||
ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
|
||||
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
|
||||
op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
|
||||
} break;
|
||||
case GGML_OP_ADD:
|
||||
{
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
||||
op_tensor = ggml_add(ctx, a, w);
|
||||
} break;
|
||||
case GGML_OP_ADD_ID:
|
||||
{
|
||||
int n_expert_used = hparams.n_expert_used;
|
||||
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
|
||||
ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
|
||||
op_tensor = ggml_add_id(ctx, a, w, c);
|
||||
} break;
|
||||
case GGML_OP_MUL:
|
||||
{
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
||||
op_tensor = ggml_mul(ctx, a, w);
|
||||
} break;
|
||||
case GGML_OP_DIV:
|
||||
{
|
||||
ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
|
||||
op_tensor = ggml_div(ctx, a, w);
|
||||
} break;
|
||||
case GGML_OP_ROPE:
|
||||
{
|
||||
int n_embd_head = hparams.n_embd_head_v;
|
||||
int n_head = hparams.n_head();
|
||||
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
|
||||
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
|
||||
op_tensor = ggml_rope_ext(
|
||||
ctx, a, b, w,
|
||||
0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0
|
||||
);
|
||||
|
||||
} break;
|
||||
case GGML_OP_SSM_CONV:
|
||||
{
|
||||
const int64_t n_seq_tokens = 512;
|
||||
const int64_t n_seqs = 3;
|
||||
ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
|
||||
op_tensor = ggml_ssm_conv(ctx, conv_x, w);
|
||||
} break;
|
||||
case GGML_OP_SSM_SCAN:
|
||||
{
|
||||
// w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
|
||||
const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
|
||||
const int64_t n_head = w->ne[1];
|
||||
const int64_t head_dim = hparams.ssm_d_inner / n_head;
|
||||
const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
|
||||
const int64_t n_seq_tokens = 512;
|
||||
const int64_t n_seqs = 3;
|
||||
ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
|
||||
ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
|
||||
op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
|
||||
} break;
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
{
|
||||
// FIXME
|
||||
const int64_t S = 123;
|
||||
const int64_t H = 123;
|
||||
const int64_t n_tokens = 123;
|
||||
const int64_t n_seqs = 123;
|
||||
ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
||||
ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
||||
ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
||||
ggml_tensor * tf = w;
|
||||
ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
||||
ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
|
||||
op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
|
||||
} break;
|
||||
case GGML_OP_IM2COL:
|
||||
{
|
||||
const int n_embd_inp = hparams.n_embd_inp();
|
||||
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
|
||||
op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
|
||||
} break;
|
||||
case GGML_OP_SCALE:
|
||||
{
|
||||
op_tensor = ggml_scale(ctx, w, 1.0f);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
|
||||
}
|
||||
|
||||
// create a temporary dummy buffer for the weight so that supports_op can check the buffer type
|
||||
GGML_ASSERT(w->buffer == nullptr);
|
||||
w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
|
||||
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
|
||||
ggml_backend_buffer_free(w->buffer);
|
||||
w->buffer = nullptr;
|
||||
|
||||
return op_supported;
|
||||
}
|
||||
|
||||
// lists of buffer types used for each layer
|
||||
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
|
||||
|
||||
// find the first buffer type in the list that can use the tensor
|
||||
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
|
||||
GGML_ASSERT(!buft_list.empty());
|
||||
for (const auto & cur : buft_list) {
|
||||
ggml_backend_dev_t cur_dev = cur.first;
|
||||
ggml_backend_buffer_type_t cur_buft = cur.second;
|
||||
if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
|
||||
return cur_buft;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
|
||||
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
|
||||
buft_list_t buft_list;
|
||||
|
|
@ -496,7 +344,7 @@ void llama_model::load_arch(llama_model_loader & ml) {
|
|||
}
|
||||
|
||||
void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
const gguf_context * ctx = ml.meta.get();
|
||||
const gguf_context * ctx = ml.metadata;
|
||||
|
||||
// get metadata as string
|
||||
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
|
||||
|
|
@ -690,7 +538,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
hparams.n_attn_temp_floor_scale = 8192;
|
||||
hparams.f_attn_temp_scale = 0.1f;
|
||||
hparams.f_attn_temp_offset = 1.0f;
|
||||
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
|
||||
uint32_t swa_period = 4; // pattern: 3 chunked - 1 full
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
|
||||
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||
|
|
@ -727,7 +577,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_AFMOE:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
||||
|
|
@ -739,7 +589,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
// Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
|
||||
if (hparams.n_swa > 0) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.set_swa_pattern(4);
|
||||
uint32_t swa_period = 4;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
|
||||
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||
|
|
@ -884,7 +736,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_BERT:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
|
|
@ -907,10 +759,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
{
|
||||
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
if (found_swa && hparams.n_swa > 0) {
|
||||
uint32_t swa_period = 3;
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
uint32_t swa_period = 3;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period, true);
|
||||
} else {
|
||||
|
|
@ -918,7 +769,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
}
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
|
|
@ -934,7 +785,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_JINA_BERT_V2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
hparams.f_max_alibi_bias = 8.0f;
|
||||
|
||||
|
|
@ -947,7 +798,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_JINA_BERT_V3:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
|
|
@ -960,8 +811,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
|
||||
|
||||
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
||||
|
|
@ -975,8 +826,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_NEO_BERT:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
|
||||
if (hparams.n_layer == 28) {
|
||||
type = LLM_TYPE_250M;
|
||||
|
|
@ -985,8 +836,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_EUROBERT:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
|
||||
if (hparams.n_layer == 12) {
|
||||
type = LLM_TYPE_SMALL; // 0.2B
|
||||
|
|
@ -1014,7 +865,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
||||
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 32: type = LLM_TYPE_7B; break;
|
||||
|
|
@ -1273,9 +1124,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
if (found_swa && hparams.n_swa > 0) {
|
||||
uint32_t swa_period = 8;
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
uint32_t swa_period = 8;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
} else {
|
||||
|
|
@ -1338,7 +1189,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
{
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.n_swa = 4096; // default value of gemma 2
|
||||
hparams.set_swa_pattern(2);
|
||||
uint32_t swa_period = 2;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
hparams.attn_soft_cap = true;
|
||||
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||
|
|
@ -1366,7 +1219,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
if (found_swa && hparams.n_swa > 0) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.set_swa_pattern(6);
|
||||
uint32_t swa_period = 6;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
} else {
|
||||
|
|
@ -1394,8 +1249,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
} break;
|
||||
case LLM_ARCH_GEMMA3N:
|
||||
{
|
||||
uint32_t swa_period = 5;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.set_swa_pattern(5);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
|
||||
hparams.n_layer_kv_from_start = 20;
|
||||
hparams.f_attention_scale = 1.0f;
|
||||
|
|
@ -1413,14 +1270,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||
{
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
|
||||
hparams.set_swa_pattern(6);
|
||||
uint32_t swa_period = 6;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
|
||||
hparams.causal_attn = false; // embeddings do not use causal attention
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
|
||||
//applied only if model converted with --sentence-transformers-dense-modules
|
||||
ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
|
||||
|
|
@ -1545,7 +1404,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
} break;
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
{
|
||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 40: type = LLM_TYPE_35B; break;
|
||||
|
|
@ -1555,7 +1414,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_COHERE2:
|
||||
{
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.set_swa_pattern(4);
|
||||
uint32_t swa_period = 4;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||
|
||||
|
|
@ -1597,7 +1458,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
if (found_swa && hparams.n_swa > 0) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.set_swa_pattern(4);
|
||||
uint32_t swa_period = 4;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
|
||||
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||
hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
|
||||
|
|
@ -1704,10 +1567,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_DEEPSEEK:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
|
||||
switch (hparams.n_ff_exp) {
|
||||
case 1408: type = LLM_TYPE_16B; break;
|
||||
|
|
@ -1721,7 +1583,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
if (!is_lite) {
|
||||
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
||||
}
|
||||
|
|
@ -1823,7 +1685,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
|
||||
// Expert gating function (GLM-4.5 uses sigmoid)
|
||||
|
|
@ -1856,7 +1718,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
|
||||
// deepseek MLA parameters
|
||||
|
|
@ -1942,7 +1804,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_JAIS:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
||||
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 24: type = LLM_TYPE_1_3B; break;
|
||||
|
|
@ -2012,7 +1874,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
if (hparams.n_layer == 64) { // 32B
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.n_swa = 4096;
|
||||
hparams.set_swa_pattern(4);
|
||||
uint32_t swa_period = 4;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
|
||||
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||
|
|
@ -2032,7 +1896,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
{
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.n_swa = 128;
|
||||
hparams.set_swa_pattern(4);
|
||||
uint32_t swa_period = 4;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||
|
||||
|
|
@ -2045,7 +1911,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
|
||||
|
|
@ -2129,9 +1995,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
|
||||
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
||||
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
||||
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, false);
|
||||
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, false);
|
||||
|
||||
// Granite uses rope_finetuned as a switch for rope, so default to true
|
||||
bool rope_finetuned = true;
|
||||
|
|
@ -2189,7 +2055,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
|
||||
ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
|
||||
ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 32: type = LLM_TYPE_7B; break;
|
||||
|
|
@ -2202,15 +2068,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
} break;
|
||||
case LLM_ARCH_BAILINGMOE:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
|
|
@ -2222,11 +2087,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_BAILINGMOE2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
|
|
@ -2245,10 +2110,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_DOTS1:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
||||
switch (hparams.n_layer) {
|
||||
|
|
@ -2268,7 +2133,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
||||
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
|
|
@ -2313,7 +2178,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 32: type = LLM_TYPE_A13B; break;
|
||||
|
|
@ -2349,7 +2214,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.set_swa_pattern(2);
|
||||
uint32_t swa_period = 2;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
|
||||
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||
|
|
@ -2387,7 +2254,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
{
|
||||
ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
||||
|
||||
|
|
@ -2406,9 +2273,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
|
||||
if (found_swa && hparams.n_swa > 0) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.n_swa = 4096;
|
||||
hparams.set_swa_pattern(4, true);
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.n_swa = 4096;
|
||||
uint32_t swa_period = 4;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period, true);
|
||||
|
||||
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||
|
|
@ -2431,7 +2300,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case LLM_ARCH_GROVEMOE:
|
||||
{
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
|
||||
ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
|
||||
ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
|
@ -2602,7 +2471,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
|
|
@ -2632,8 +2501,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
// MoE parameters - Kimi uses moe_intermediate_size = 1024
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
|
|
@ -2660,7 +2529,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
}
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
|
||||
ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false);
|
||||
ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
|
||||
|
|
@ -2670,7 +2539,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: throw std::runtime_error("unsupported model architecture");
|
||||
default: throw std::runtime_error("unsupported model architecture: " + arch_name());
|
||||
}
|
||||
|
||||
pimpl->n_bytes = ml.n_bytes;
|
||||
|
|
@ -2777,44 +2646,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
// assign the output layer
|
||||
pimpl->dev_output = get_layer_buft_list(n_layer);
|
||||
|
||||
// one ggml context per buffer type
|
||||
int max_n_tensors = ml.n_tensors;
|
||||
max_n_tensors += 1; // duplicated output tensor
|
||||
max_n_tensors += n_layer*2; // duplicated rope freq tensors
|
||||
const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
|
||||
|
||||
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
|
||||
struct ggml_backend_buft_comparator {
|
||||
bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
|
||||
return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
|
||||
}
|
||||
};
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
|
||||
|
||||
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
|
||||
auto it = ctx_map.find(buft);
|
||||
if (it == ctx_map.end()) {
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx_size,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
ggml_context * ctx = ggml_init(params);
|
||||
if (!ctx) {
|
||||
throw std::runtime_error(format("failed to create ggml context"));
|
||||
}
|
||||
|
||||
ctx_map.emplace(buft, ctx);
|
||||
|
||||
return ctx;
|
||||
}
|
||||
return it->second.get();
|
||||
};
|
||||
|
||||
const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
|
||||
const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
|
||||
const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
|
||||
const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
|
||||
const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
|
||||
const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
|
||||
const auto TENSOR_SKIP_IF_VIRTUAL = llama_model_loader::TENSOR_SKIP_IF_VIRTUAL;
|
||||
|
||||
// create tensors for the weights
|
||||
{
|
||||
|
|
@ -2839,147 +2674,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
throw std::runtime_error("model has expert layers but no expert layers are used");
|
||||
}
|
||||
|
||||
int n_moved_tensors = 0;
|
||||
ggml_tensor * first_moved_tensor = nullptr;
|
||||
ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
|
||||
ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
|
||||
|
||||
auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
|
||||
ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
|
||||
|
||||
if (!t_meta) {
|
||||
if (flags & TENSOR_NOT_REQUIRED) {
|
||||
return nullptr;
|
||||
}
|
||||
throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
|
||||
}
|
||||
|
||||
// some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
|
||||
// the tensor is duplicated
|
||||
// to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
|
||||
llm_tensor tn_tensor = tn.tensor;
|
||||
if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
|
||||
tn_tensor = LLM_TENSOR_OUTPUT;
|
||||
}
|
||||
|
||||
llm_tensor_info info;
|
||||
try {
|
||||
info = llm_tensor_info_for(tn_tensor);
|
||||
} catch (const std::out_of_range & e) {
|
||||
throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
|
||||
}
|
||||
|
||||
// skip unused tensors
|
||||
if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
|
||||
const size_t nbytes = ggml_nbytes(t_meta);
|
||||
LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
|
||||
|
||||
ml.size_data -= nbytes;
|
||||
ml.n_created++;
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
|
||||
ggml_op op;
|
||||
bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
|
||||
if (bias) {
|
||||
if (info.op == GGML_OP_MUL_MAT_ID) {
|
||||
op = GGML_OP_ADD_ID;
|
||||
} else {
|
||||
op = GGML_OP_ADD;
|
||||
}
|
||||
} else {
|
||||
op = info.op;
|
||||
}
|
||||
|
||||
// sanity checks
|
||||
if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
|
||||
if (tn.bid != -1) {
|
||||
GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
|
||||
}
|
||||
} else {
|
||||
if (tn.bid == -1) {
|
||||
GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// select the buffer type for this tensor
|
||||
buft_list_t * buft_list;
|
||||
switch (info.layer) {
|
||||
case LLM_TENSOR_LAYER_INPUT:
|
||||
buft_list = pimpl->dev_input.buft_list;
|
||||
break;
|
||||
case LLM_TENSOR_LAYER_OUTPUT:
|
||||
buft_list = pimpl->dev_output.buft_list;
|
||||
break;
|
||||
case LLM_TENSOR_LAYER_REPEATING:
|
||||
buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
// check overrides
|
||||
if (ml.tensor_buft_overrides) {
|
||||
std::string tensor_name = tn.str();
|
||||
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
||||
std::regex pattern(overrides->pattern);
|
||||
if (std::regex_search(tensor_name, pattern)) {
|
||||
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
|
||||
// when overriding to a CPU buffer, consider the extra buffer types
|
||||
buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
|
||||
} else {
|
||||
buft = overrides->buft;
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
|
||||
tensor_name.c_str(),
|
||||
ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
|
||||
ggml_backend_buft_name(buft));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!buft) {
|
||||
buft = select_weight_buft(hparams, t_meta, op, *buft_list);
|
||||
if (!buft) {
|
||||
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
// avoid using a host buffer when using mmap
|
||||
auto * buft_dev = ggml_backend_buft_get_device(buft);
|
||||
if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
throw std::runtime_error("no CPU backend found");
|
||||
}
|
||||
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
||||
}
|
||||
|
||||
if (buft != buft_list->front().second) {
|
||||
n_moved_tensors++;
|
||||
if (!first_moved_tensor) {
|
||||
first_moved_tensor = t_meta;
|
||||
first_moved_from_buft = buft_list->front().second;
|
||||
first_moved_to_buft = buft;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_context * ctx = ctx_for_buft(buft);
|
||||
|
||||
// if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
|
||||
if (flags & TENSOR_DUPLICATED) {
|
||||
ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
|
||||
if (t) {
|
||||
return t;
|
||||
}
|
||||
}
|
||||
return ml.create_tensor(ctx, tn, ne, flags);
|
||||
const buft_list_t * buft_list_layer = tn.bid == -1 ? nullptr : pimpl->dev_layer.at(tn.bid).buft_list;
|
||||
return ml.create_tensor(
|
||||
hparams, &pimpl->cpu_buft_list, pimpl->dev_input.buft_list, pimpl->dev_output.buft_list, buft_list_layer,
|
||||
tn, ne, flags);
|
||||
};
|
||||
|
||||
layers.resize(n_layer);
|
||||
|
|
@ -3148,6 +2847,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
} break;
|
||||
case LLM_ARCH_LLAMA4:
|
||||
{
|
||||
if (n_expert == 0) {
|
||||
throw std::runtime_error(arch_name() + " model cannot have zero experts");
|
||||
}
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
|
|
@ -3160,7 +2862,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
|
||||
const bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
|
||||
|
||||
auto & layer = layers[i];
|
||||
|
||||
|
|
@ -3176,7 +2878,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
||||
|
||||
if (is_moe_layer) {
|
||||
int n_ff_exp = hparams.n_ff_exp;
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp;
|
||||
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
||||
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
||||
|
|
@ -3307,7 +3009,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
case LLM_ARCH_GROK:
|
||||
{
|
||||
if (n_expert == 0) {
|
||||
throw std::runtime_error("Grok model cannot have zero experts");
|
||||
throw std::runtime_error(arch_name() + " model cannot have zero experts");
|
||||
}
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
|
@ -3479,6 +3181,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
case LLM_ARCH_JINA_BERT_V3:
|
||||
{
|
||||
if (n_token_types == 0) {
|
||||
throw std::runtime_error(arch_name() + " model needs to define token type count");
|
||||
}
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
|
|
@ -3745,8 +3450,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
// FIXME test-llama-archs crashes if q_norm is created
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
|
||||
layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
|
||||
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
|
@ -5172,6 +4878,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
|
||||
GGML_ASSERT(n_embd_head_qk_nope >= 1);
|
||||
|
||||
const int64_t q_lora_rank = hparams.n_lora_q;
|
||||
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
||||
|
|
@ -5363,7 +5070,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
|
||||
layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
// this tensor seems to be unused in HF transformers implementation
|
||||
layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
|
||||
layer.attn_rel_b_cross = create_tensor(
|
||||
tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
|
||||
|
||||
layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
|
|
@ -5969,7 +5677,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
const int64_t n_ff_exp = hparams.n_ff_exp;
|
||||
const int64_t n_expert = hparams.n_expert;
|
||||
const int64_t n_expert_used = hparams.n_expert_used;
|
||||
const int64_t n_ff_shexp = hparams.n_ff_shexp;
|
||||
const int64_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : n_ff_exp;
|
||||
const int64_t head_dim = hparams.n_embd_head_k;
|
||||
const int64_t n_qo_dim = n_head * head_dim;
|
||||
const int64_t n_kv_dim = n_head_kv * head_dim;
|
||||
|
|
@ -6830,6 +6538,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
|
|
@ -6848,9 +6557,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
||||
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_HUNYUAN_DENSE:
|
||||
|
|
@ -7186,15 +6895,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda;
|
||||
const int64_t ssm_d_conv = hparams.ssm_d_conv;
|
||||
|
||||
// Try loading KDA specific tensors (using SSM_ prefix)
|
||||
// Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
|
||||
// 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
|
||||
layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
|
||||
if (!layer.ssm_q_conv) {
|
||||
layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (hparams.is_recurrent(i)) {
|
||||
// Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
|
||||
// 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
|
||||
layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
|
||||
if (!layer.ssm_q_conv) {
|
||||
layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
|
||||
}
|
||||
|
||||
if (layer.ssm_q_conv) {
|
||||
// KDA Layer - Conv1d weights may be 3D or 4D
|
||||
layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
|
||||
if (!layer.ssm_k_conv) {
|
||||
|
|
@ -7261,7 +6969,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim
|
||||
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
|
||||
// Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
|
||||
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED);
|
||||
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
|
||||
{kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
|
||||
if (!layer.wkv_b) { // MLA KV cache enabled
|
||||
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0);
|
||||
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
|
||||
|
|
@ -7381,6 +7090,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
} break;
|
||||
case LLM_ARCH_QWEN3NEXT:
|
||||
{
|
||||
if (n_expert == 0) {
|
||||
throw std::runtime_error(arch_name() + " model cannot have zero experts");
|
||||
}
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
|
||||
|
||||
// output
|
||||
|
|
@ -7409,6 +7122,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
|
||||
|
|
@ -7444,9 +7158,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
|
||||
// Shared experts
|
||||
layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_QWEN35MOE:
|
||||
|
|
@ -7711,12 +7425,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
|
||||
if (n_moved_tensors > 0) {
|
||||
LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
|
||||
__func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
|
||||
ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
|
||||
}
|
||||
}
|
||||
|
||||
ml.done_getting_tensors();
|
||||
|
|
@ -7726,13 +7434,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
|
||||
// create the backend buffers
|
||||
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
|
||||
ctx_buf_maps.reserve(ctx_map.size());
|
||||
ctx_buf_maps.reserve(ml.ctx_map.size());
|
||||
|
||||
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
|
||||
const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
|
||||
const size_t n_max_backend_buffer = ml.ctx_map.size() * ml.files.size();
|
||||
pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
|
||||
|
||||
for (auto & [buft, ctx_ptr] : ctx_map) {
|
||||
for (auto & [buft, ctx_ptr] : ml.ctx_map) {
|
||||
ggml_context * ctx = ctx_ptr.get();
|
||||
|
||||
// skip contexts without tensors
|
||||
|
|
|
|||
|
|
@ -556,7 +556,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
}
|
||||
|
||||
std::vector<std::string> splits = {};
|
||||
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||
llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
|
||||
fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
llama_model model(llama_model_default_params());
|
||||
|
|
@ -596,7 +597,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
}
|
||||
|
||||
// copy the KV pairs from the input file
|
||||
gguf_set_kv (ctx_out.get(), ml.meta.get());
|
||||
gguf_set_kv (ctx_out.get(), ml.metadata);
|
||||
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
|
||||
gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
|
||||
|
||||
|
|
|
|||
|
|
@ -1719,7 +1719,7 @@ private:
|
|||
};
|
||||
|
||||
void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
struct gguf_context * ctx = ml.meta.get();
|
||||
struct gguf_context * ctx = ml.metadata;
|
||||
|
||||
// determine vocab type
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#include "llama.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include "llama-chat.h"
|
||||
|
|
@ -12,6 +13,7 @@
|
|||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
|
|
@ -825,7 +827,8 @@ int64_t llama_time_us(void) {
|
|||
}
|
||||
|
||||
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
||||
static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
|
||||
static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
|
||||
const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
|
||||
// loading time will be recalculated after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
model.t_load_us = 0;
|
||||
|
|
@ -834,7 +837,8 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
|||
model.t_start_us = tm.t_start_us;
|
||||
|
||||
try {
|
||||
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
|
||||
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||
|
||||
ml.print_info();
|
||||
|
||||
|
|
@ -880,9 +884,13 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
|||
}
|
||||
|
||||
static struct llama_model * llama_model_load_from_file_impl(
|
||||
struct gguf_context * metadata,
|
||||
llama_model_set_tensor_data_t set_tensor_data,
|
||||
void * set_tensor_data_ud,
|
||||
const std::string & path_model,
|
||||
std::vector<std::string> & splits,
|
||||
struct llama_model_params params) {
|
||||
GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
|
||||
ggml_time_init();
|
||||
|
||||
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
|
||||
|
|
@ -1003,7 +1011,7 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||
props.memory_free/1024/1024);
|
||||
}
|
||||
|
||||
const int status = llama_model_load(path_model, splits, *model, params);
|
||||
const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
|
||||
GGML_ASSERT(status <= 0);
|
||||
if (status < 0) {
|
||||
if (status == -1) {
|
||||
|
|
@ -1019,6 +1027,18 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||
return model;
|
||||
}
|
||||
|
||||
struct llama_model * llama_model_init_from_user(
|
||||
struct gguf_context * metadata,
|
||||
llama_model_set_tensor_data_t set_tensor_data,
|
||||
void * set_tensor_data_ud,
|
||||
struct llama_model_params params) {
|
||||
GGML_ASSERT(metadata != nullptr);
|
||||
std::string path_model;
|
||||
std::vector<std::string> splits = {};
|
||||
params.use_mmap = false;
|
||||
params.use_extra_bufts = false;
|
||||
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
|
||||
}
|
||||
// deprecated
|
||||
struct llama_model * llama_load_model_from_file(
|
||||
const char * path_model,
|
||||
|
|
@ -1030,7 +1050,7 @@ struct llama_model * llama_model_load_from_file(
|
|||
const char * path_model,
|
||||
struct llama_model_params params) {
|
||||
std::vector<std::string> splits = {};
|
||||
return llama_model_load_from_file_impl(path_model, splits, params);
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
|
||||
}
|
||||
|
||||
struct llama_model * llama_model_load_from_splits(
|
||||
|
|
@ -1046,11 +1066,11 @@ struct llama_model * llama_model_load_from_splits(
|
|||
for (size_t i = 0; i < n_paths; ++i) {
|
||||
splits.push_back(paths[i]);
|
||||
}
|
||||
return llama_model_load_from_file_impl(splits.front(), splits, params);
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
|
||||
}
|
||||
|
||||
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
|
||||
llama_model_saver ms(*model);
|
||||
llama_model_saver ms(model);
|
||||
ms.add_kv_from_model();
|
||||
ms.add_tensors_from_model();
|
||||
ms.save(path_model);
|
||||
|
|
|
|||
|
|
@ -56,6 +56,7 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
|
|||
);
|
||||
break;
|
||||
case LLM_TYPE_13B:
|
||||
case LLM_TYPE_UNKNOWN:
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
|
|
|
|||
|
|
@ -90,7 +90,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale, hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -91,7 +91,7 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_para
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale, hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -100,7 +100,7 @@ llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale, hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -128,7 +128,7 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale, hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
cb(routed_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -118,12 +118,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
|||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
// Check layer type by checking which tensors exist
|
||||
// KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor
|
||||
bool is_kda = (layer.ssm_a != nullptr);
|
||||
bool is_mla = (layer.wkv_a_mqa != nullptr);
|
||||
|
||||
if (is_kda) {
|
||||
if (hparams.is_recurrent(il)) {
|
||||
// === KDA Layer (Kimi Delta Attention) with Recurrent State ===
|
||||
// Reference: vLLM kda.py
|
||||
const auto * mctx_cur = inp_rs->mctx;
|
||||
|
|
@ -211,7 +206,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
|||
cur = ggml_mul_mat(ctx0, layer.wo, gated);
|
||||
cb(cur, "kda_out", il);
|
||||
|
||||
} else if (is_mla) {
|
||||
} else {
|
||||
// === MLA Layer (Multi-head Latent Attention) without KV Cache ===
|
||||
// Reference: vLLM mla.py
|
||||
// Step 1: Q projection and reshape
|
||||
|
|
@ -310,9 +305,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
|||
cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
|
||||
cb(cur, "mla_out", il);
|
||||
}
|
||||
} else {
|
||||
// Unknown layer type - this should not happen
|
||||
GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors");
|
||||
}
|
||||
|
||||
// On last layer, select only the output tokens
|
||||
|
|
@ -349,7 +341,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
|||
hparams.n_expert,
|
||||
hparams.n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
true, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale, hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp,
|
|||
GGML_ASSERT(n_seqs != 0);
|
||||
GGML_ASSERT(ubatch.equal_seqs());
|
||||
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
||||
GGML_ASSERT(d_inner % n_head == 0);
|
||||
|
||||
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
||||
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
||||
|
|
@ -154,6 +155,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
|
|||
|
||||
const auto kv_head = mctx_cur->get_head();
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t d_conv = hparams.ssm_d_conv;
|
||||
const int64_t d_inner = hparams.ssm_d_inner;
|
||||
const int64_t d_state = hparams.ssm_d_state;
|
||||
|
|
@ -167,6 +169,8 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
|
|||
GGML_ASSERT(n_seqs != 0);
|
||||
GGML_ASSERT(ubatch.equal_seqs());
|
||||
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
||||
GGML_ASSERT(d_inner % n_head == 0);
|
||||
GGML_ASSERT(d_inner % (n_group*n_embd) == 0);
|
||||
|
||||
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
||||
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
||||
|
|
|
|||
|
|
@ -124,7 +124,7 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale, hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ llm_build_plamo2::llm_build_plamo2(const llama_model & model, const llm_graph_pa
|
|||
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
||||
|
||||
// check if this layer is Mamba or Attention
|
||||
bool is_mamba_layer = hparams.is_recurrent(il);
|
||||
const bool is_mamba_layer = hparams.is_recurrent(il);
|
||||
|
||||
if (is_mamba_layer) {
|
||||
// PLaMo-2 Mamba layer
|
||||
|
|
@ -171,6 +171,8 @@ ggml_tensor * llm_build_plamo2::build_plamo2_mamba_layer(llm_graph_input_rs * in
|
|||
GGML_ASSERT(n_seqs != 0);
|
||||
GGML_ASSERT(ubatch.equal_seqs());
|
||||
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
||||
GGML_ASSERT(d_inner % n_head == 0);
|
||||
GGML_ASSERT(n_group == 0);
|
||||
|
||||
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
||||
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
||||
|
|
|
|||
|
|
@ -185,6 +185,8 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
|
|||
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-baichuan.gguf)
|
||||
|
||||
# llama_build_and_test(test-double-float.cpp) # SLOW
|
||||
|
||||
llama_build_and_test(test-llama-archs.cpp)
|
||||
endif()
|
||||
|
||||
llama_build_and_test(test-chat-peg-parser.cpp peg-parser/simple-tokenize.cpp)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,532 @@
|
|||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
#include "ggml-cpp.h"
|
||||
#include "llama.h"
|
||||
#include "llama-cpp.h"
|
||||
#include "../src/llama-arch.h"
|
||||
#include "../src/llama-model-saver.h"
|
||||
|
||||
#include <cinttypes>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cstdint>
|
||||
#include <random>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
// normalized mean squared error = mse(a, b) / mse(a, 0)
|
||||
static double nmse(const std::vector<float> & a, const std::vector<float> & b) {
|
||||
GGML_ASSERT(a.size() == b.size());
|
||||
double mse_a_b = 0.0;
|
||||
double mse_a_0 = 0.0;
|
||||
|
||||
for (size_t i = 0; i < a.size(); i++) {
|
||||
float a_i = a[i];
|
||||
float b_i = b[i];
|
||||
|
||||
mse_a_b += (a_i - b_i) * (a_i - b_i);
|
||||
mse_a_0 += a_i * a_i;
|
||||
}
|
||||
|
||||
return mse_a_b / mse_a_0;
|
||||
}
|
||||
|
||||
static void set_tensor_data(struct ggml_tensor * tensor, void * userdata) {
|
||||
std::hash<std::string> hasher;
|
||||
std::mt19937 gen(hasher(tensor->name) + *(const size_t *) userdata);
|
||||
std::normal_distribution<float> dis(0.0f, 1.0e-2f);
|
||||
|
||||
const int64_t ne = ggml_nelements(tensor);
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
std::vector<float> tmp(ne);
|
||||
for (int64_t i = 0; i < ne; i++) {
|
||||
tmp[i] = dis(gen);
|
||||
}
|
||||
ggml_backend_tensor_set(tensor, tmp.data(), 0, ggml_nbytes(tensor));
|
||||
} else if (tensor->type == GGML_TYPE_F16) {
|
||||
std::vector<ggml_fp16_t> tmp(ne);
|
||||
for (int64_t i = 0; i < ne; i++) {
|
||||
tmp[i] = ggml_fp32_to_fp16(dis(gen));
|
||||
}
|
||||
ggml_backend_tensor_set(tensor, tmp.data(), 0, ggml_nbytes(tensor));
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
// Print the command-line synopsis to stdout, with argv[0] as the program name.
static void usage(char ** argv) {
    const char * prog_name = argv[0];
    fprintf(stdout, "Usage: %s [-a/--arch arch] [-s/--seed seed] [-v/--verbose]\n", prog_name);
}
|
||||
|
||||
static std::vector<llama_token> get_tokens(const uint32_t n_tokens, const uint32_t n_vocab, const size_t seed){
|
||||
std::mt19937 gen(seed);
|
||||
std::uniform_int_distribution<> dis(0, n_vocab - 1);
|
||||
std::vector<llama_token> ret;
|
||||
ret.reserve(n_tokens);
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
ret.push_back(dis(gen));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
|
||||
gguf_context_ptr ret(gguf_init_empty());
|
||||
llama_model_saver ms(arch, ret.get());
|
||||
const uint32_t n_ctx = 128;
|
||||
|
||||
uint32_t n_vocab = 128;
|
||||
uint32_t n_embd = 256;
|
||||
uint32_t n_head = 2;
|
||||
uint32_t n_ff = 384;
|
||||
uint32_t n_layer = 2;
|
||||
if (arch == LLM_ARCH_LLAMA4) {
|
||||
n_layer = 4; // hparams.n_no_rope_layer_step is hard-coded to 4
|
||||
} else if (arch == LLM_ARCH_GEMMA3N) {
|
||||
n_embd = 64;
|
||||
n_head = 1;
|
||||
n_ff = 96;
|
||||
} else if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR) {
|
||||
n_embd = 128;
|
||||
n_head = 1;
|
||||
n_ff = 192;
|
||||
} else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
n_layer = 3;
|
||||
} else if (arch == LLM_ARCH_CHAMELEON) {
|
||||
n_vocab = 10240;
|
||||
} else if (arch == LLM_ARCH_GEMMA3N) {
|
||||
n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded
|
||||
}
|
||||
|
||||
const uint32_t n_embd_head = n_embd / n_head;
|
||||
|
||||
ms.add_kv(LLM_KV_GENERAL_ARCHITECTURE, llm_arch_name(arch));
|
||||
ms.add_kv(LLM_KV_VOCAB_SIZE, n_vocab);
|
||||
ms.add_kv(LLM_KV_CONTEXT_LENGTH, n_ctx);
|
||||
ms.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
|
||||
ms.add_kv(LLM_KV_FEATURES_LENGTH, n_embd);
|
||||
ms.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
|
||||
ms.add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, uint32_t(1));
|
||||
|
||||
if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
std::vector<uint32_t> n_ff_per_layer;
|
||||
n_ff_per_layer.reserve(n_layer);
|
||||
for (uint32_t il = 0; il < n_layer; il++) {
|
||||
n_ff_per_layer.push_back(il <= 1 ? 0 : n_ff);
|
||||
}
|
||||
ms.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff_per_layer);
|
||||
} else {
|
||||
ms.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
|
||||
}
|
||||
|
||||
ms.add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, false);
|
||||
ms.add_kv(LLM_KV_LOGIT_SCALE, 1.0f);
|
||||
ms.add_kv(LLM_KV_TIME_MIX_EXTRA_DIM, uint32_t(64));
|
||||
ms.add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, uint32_t(128));
|
||||
ms.add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, uint32_t(2));
|
||||
|
||||
if (arch == LLM_ARCH_PLAMO2 || arch == LLM_ARCH_JAMBA || arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE ||
|
||||
arch == LLM_ARCH_GRANITE_HYBRID || arch == LLM_ARCH_LFM2 || arch == LLM_ARCH_LFM2MOE || arch == LLM_ARCH_KIMI_LINEAR) {
|
||||
GGML_ASSERT(n_layer >= 2);
|
||||
std::vector<uint32_t> n_head_per_layer;
|
||||
n_head_per_layer.reserve(n_layer);
|
||||
for (uint32_t il = 0; il < n_layer; il++) {
|
||||
n_head_per_layer.push_back(il == 1 ? 0 : n_head);
|
||||
}
|
||||
ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head_per_layer);
|
||||
ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_per_layer);
|
||||
} else {
|
||||
ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
|
||||
ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head);
|
||||
}
|
||||
|
||||
ms.add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, 8.0f);
|
||||
if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR) {
|
||||
ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH, uint32_t(576));
|
||||
ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, uint32_t(512));
|
||||
ms.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, uint32_t(64));
|
||||
ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH_MLA, uint32_t(192));
|
||||
ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, uint32_t(128));
|
||||
}
|
||||
ms.add_kv(LLM_KV_ATTENTION_CLAMP_KQV, 1.0f);
|
||||
ms.add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, 1e-5f);
|
||||
ms.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
|
||||
ms.add_kv(LLM_KV_ATTENTION_GROUPNORM_EPS, 1e-5f);
|
||||
ms.add_kv(LLM_KV_ATTENTION_GROUPNORM_GROUPS, uint32_t(8));
|
||||
ms.add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, uint32_t(512));
|
||||
ms.add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, uint32_t(512));
|
||||
ms.add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, uint32_t(8));
|
||||
ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, n_ctx/8);
|
||||
|
||||
if (arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) {
|
||||
std::vector<uint32_t> pattern;
|
||||
pattern.reserve(n_layer);
|
||||
for (uint32_t il = 0; il < n_layer; il++) {
|
||||
pattern.push_back(il % 2);
|
||||
}
|
||||
ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, pattern);
|
||||
} else {
|
||||
ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, uint32_t(2));
|
||||
}
|
||||
|
||||
ms.add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, uint32_t(1));
|
||||
ms.add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, uint32_t(64));
|
||||
ms.add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K, uint32_t(8));
|
||||
ms.add_kv(LLM_KV_ROPE_DIMENSION_SECTIONS, std::vector<uint32_t>({n_embd_head/4, n_embd_head/4, n_embd_head/4, n_embd_head/4}));
|
||||
ms.add_kv(LLM_KV_TOKENIZER_MODEL, "no_vocab");
|
||||
// ms.add_kv(LLM_KV_DENSE_2_FEAT_OUT, n_embd);
|
||||
// ms.add_kv(LLM_KV_DENSE_3_FEAT_IN, n_embd);
|
||||
|
||||
if (moe) {
|
||||
ms.add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, n_ff);
|
||||
ms.add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, uint32_t(2));
|
||||
ms.add_kv(LLM_KV_EXPERT_COUNT, uint32_t(2));
|
||||
ms.add_kv(LLM_KV_EXPERT_USED_COUNT, uint32_t(1));
|
||||
ms.add_kv(LLM_KV_EXPERT_SHARED_COUNT, uint32_t(1));
|
||||
ms.add_kv(LLM_KV_EXPERT_GATING_FUNC, uint32_t(2)); // sigmoid
|
||||
ms.add_kv(LLM_KV_EXPERT_GROUP_SCALE, 1.0f);
|
||||
ms.add_kv(LLM_KV_EXPERTS_PER_GROUP, uint32_t(1));
|
||||
}
|
||||
|
||||
ms.add_kv(LLM_KV_POSNET_EMBEDDING_LENGTH, n_embd);
|
||||
ms.add_kv(LLM_KV_POSNET_BLOCK_COUNT, n_layer);
|
||||
ms.add_kv(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, n_embd);
|
||||
ms.add_kv(LLM_KV_CONVNEXT_BLOCK_COUNT, n_layer);
|
||||
ms.add_kv(LLM_KV_XIELU_ALPHA_N, 1.0f);
|
||||
ms.add_kv(LLM_KV_XIELU_ALPHA_P, 1.0f);
|
||||
ms.add_kv(LLM_KV_XIELU_BETA, 1.0f);
|
||||
ms.add_kv(LLM_KV_XIELU_EPS, 1.0e-7f);
|
||||
ms.add_kv(LLM_KV_SSM_INNER_SIZE, arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE ? 64 : 2*n_embd);
|
||||
ms.add_kv(LLM_KV_SSM_CONV_KERNEL, uint32_t(4));
|
||||
ms.add_kv(LLM_KV_SSM_STATE_SIZE, uint32_t(32));
|
||||
ms.add_kv(LLM_KV_SSM_TIME_STEP_RANK, n_head);
|
||||
ms.add_kv(LLM_KV_SSM_GROUP_COUNT, arch == LLM_ARCH_PLAMO2 ? 0 : uint32_t(2));
|
||||
ms.add_kv(LLM_KV_KDA_HEAD_DIM, uint32_t(128));
|
||||
ms.add_kv(LLM_KV_WKV_HEAD_SIZE, n_embd/n_head);
|
||||
ms.add_kv(LLM_KV_SHORTCONV_L_CACHE, uint32_t(3));
|
||||
|
||||
for (uint32_t il = 0; il < n_layer; il++) {
|
||||
ggml_tensor t;
|
||||
memset(&t, 0, sizeof(ggml_tensor));
|
||||
t.type = GGML_TYPE_F16;
|
||||
ggml_format_name(&t, "conv%" PRIu32 "d.weight", il);
|
||||
gguf_add_tensor(ms.gguf_ctx, &t);
|
||||
ggml_format_name(&t, "posnet.%" PRIu32 ".conv1.weight", il);
|
||||
gguf_add_tensor(ms.gguf_ctx, &t);
|
||||
ggml_format_name(&t, "posnet.%" PRIu32 ".conv2.weight", il);
|
||||
gguf_add_tensor(ms.gguf_ctx, &t);
|
||||
ggml_format_name(&t, "convnext.%" PRIu32 ".dw.weight", il);
|
||||
gguf_add_tensor(ms.gguf_ctx, &t);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static std::pair<llama_model_ptr, llama_context_ptr> get_model_and_ctx(
|
||||
struct gguf_context * gguf_ctx, const size_t seed, const std::vector<ggml_backend_dev_t> & devs) {
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
std::vector<ggml_backend_dev_t> devs_copy = devs;
|
||||
devs_copy.push_back(nullptr);
|
||||
model_params.devices = devs_copy.data();
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
ctx_params.n_ctx = 0;
|
||||
ctx_params.n_threads = 4;
|
||||
ctx_params.n_threads_batch = 4;
|
||||
|
||||
size_t tmp = seed;
|
||||
llama_model_ptr model(llama_model_init_from_user(gguf_ctx, set_tensor_data, &tmp, model_params));
|
||||
if (!model) {
|
||||
throw std::runtime_error("failed to create llama model");
|
||||
}
|
||||
llama_context_ptr lctx(llama_init_from_model(model.get(), ctx_params));
|
||||
if (!lctx) {
|
||||
throw std::runtime_error("failed to create llama context");
|
||||
}
|
||||
return std::make_pair(std::move(model), std::move(lctx));
|
||||
}
|
||||
|
||||
// Evaluates `tokens` as one batch (sequence 0, positions 0..n-1, output requested for every
// token) and returns the logits of all positions concatenated position by position,
// i.e. tokens.size()*n_vocab floats.
//
// model:  model the context belongs to, used to query the vocabulary size.
// lctx:   context to evaluate with; its context size must be >= tokens.size().
// tokens: tokens to evaluate.
// encode: if true, the batch is passed to llama_encode before it is passed to llama_decode.
//
// Throws std::runtime_error if encoding or decoding fails or if the logits of a position
// cannot be retrieved. The batch is released on every path.
static std::vector<float> get_logits(
        llama_model * model, llama_context * lctx, const std::vector<llama_token> & tokens, bool encode = false) {
    const uint32_t n_vocab  = llama_vocab_n_tokens(llama_model_get_vocab(model));
    const uint32_t n_ctx    = llama_n_ctx(lctx);
    const uint32_t n_tokens = tokens.size();
    GGML_ASSERT(n_tokens <= n_ctx);

    llama_batch batch = llama_batch_init(n_ctx, 0, 1);
    for (uint32_t pos = 0; pos < n_tokens; pos++) {
        common_batch_add(batch, tokens[pos], pos, {0}, true);
    }
    batch.n_tokens = n_tokens;

    // Collect the failure reason instead of throwing right away so that the batch is freed exactly once below.
    const char * error = nullptr;
    std::vector<float> ret;

    if (encode && llama_encode(lctx, batch)) {
        error = "failed to encode batch";
    } else if (llama_decode(lctx, batch)) {
        error = "failed to decode batch";
    } else {
        ret.reserve(size_t(n_tokens)*n_vocab); // widen before multiplying to avoid 32 bit wrap-around
        for (uint32_t i = 0; i < n_tokens; i++) {
            const float * logits_ith = llama_get_logits_ith(lctx, i);
            if (logits_ith == nullptr) {
                // llama_get_logits_ith reports invalid/unavailable outputs via a null pointer
                error = "failed to get logits";
                break;
            }
            ret.insert(ret.end(), logits_ith, logits_ith + n_vocab);
        }
    }

    llama_batch_free(batch);
    if (error != nullptr) {
        throw std::runtime_error(error);
    }
    return ret;
}
|
||||
|
||||
static bool moe_mandatory(const llm_arch arch) {
|
||||
switch (arch) {
|
||||
case LLM_ARCH_LLAMA4:
|
||||
case LLM_ARCH_GROK:
|
||||
case LLM_ARCH_QWEN2MOE:
|
||||
case LLM_ARCH_QWEN3MOE:
|
||||
case LLM_ARCH_QWEN3NEXT:
|
||||
case LLM_ARCH_QWEN3VLMOE:
|
||||
case LLM_ARCH_QWEN35MOE:
|
||||
case LLM_ARCH_PHIMOE:
|
||||
case LLM_ARCH_DBRX:
|
||||
case LLM_ARCH_OLMOE:
|
||||
case LLM_ARCH_ARCTIC:
|
||||
case LLM_ARCH_DEEPSEEK:
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
case LLM_ARCH_GLM4_MOE:
|
||||
case LLM_ARCH_GLM_DSA:
|
||||
case LLM_ARCH_EXAONE_MOE:
|
||||
case LLM_ARCH_BAILINGMOE:
|
||||
case LLM_ARCH_BAILINGMOE2:
|
||||
case LLM_ARCH_DOTS1:
|
||||
case LLM_ARCH_AFMOE:
|
||||
case LLM_ARCH_ERNIE4_5:
|
||||
case LLM_ARCH_ERNIE4_5_MOE:
|
||||
case LLM_ARCH_HUNYUAN_MOE:
|
||||
case LLM_ARCH_OPENAI_MOE:
|
||||
case LLM_ARCH_LFM2MOE:
|
||||
case LLM_ARCH_SMALLTHINKER:
|
||||
case LLM_ARCH_LLADA_MOE:
|
||||
case LLM_ARCH_GROVEMOE:
|
||||
case LLM_ARCH_MINIMAX_M2:
|
||||
case LLM_ARCH_RND1:
|
||||
case LLM_ARCH_PADDLEOCR:
|
||||
case LLM_ARCH_MIMO2:
|
||||
case LLM_ARCH_KIMI_LINEAR:
|
||||
case LLM_ARCH_STEP35:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static bool moe_implemented(const llm_arch arch) {
|
||||
if (moe_mandatory(arch)) {
|
||||
return true;
|
||||
}
|
||||
switch (arch) {
|
||||
case LLM_ARCH_LLAMA:
|
||||
case LLM_ARCH_REFACT:
|
||||
case LLM_ARCH_MINICPM:
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_MISTRAL3:
|
||||
case LLM_ARCH_LLAMA_EMBED:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, const std::string & dir) {
|
||||
GGML_ABORT("llama_model_save_to_file is broken");
|
||||
struct user_data_t {
|
||||
struct {
|
||||
ggml_log_callback callback;
|
||||
void * user_data;
|
||||
} original_logger;
|
||||
ggml_log_level min_level; // prints below this log level go to debug log
|
||||
};
|
||||
user_data_t ud;
|
||||
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
||||
ud.min_level = log_level;
|
||||
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
||||
const user_data_t * ud = (const user_data_t *) user_data;
|
||||
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
||||
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
||||
}, &ud);
|
||||
|
||||
for (const llm_arch & arch : llm_arch_all()) {
|
||||
if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
|
||||
continue;
|
||||
}
|
||||
if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
|
||||
continue; // These models don't have usable implementations.
|
||||
}
|
||||
for (bool moe : {false, true}) {
|
||||
if (moe && !moe_implemented(arch)) {
|
||||
continue;
|
||||
}
|
||||
if (!moe && moe_mandatory(arch)) {
|
||||
continue;
|
||||
}
|
||||
gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
|
||||
auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), seed, {});
|
||||
const std::string path = dir + "/" + llm_arch_name(arch) + (moe ? "-moe.gguf" : "-dense.gguf");
|
||||
LOG_INF("%s: Saving %s model (%s) to %s...\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense", path.c_str());
|
||||
llama_model_save_to_file(model_and_ctx.first.get(), path.c_str());
|
||||
}
|
||||
}
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int test_backends(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level) {
|
||||
struct user_data_t {
|
||||
struct {
|
||||
ggml_log_callback callback;
|
||||
void * user_data;
|
||||
} original_logger;
|
||||
ggml_log_level min_level; // prints below this log level go to debug log
|
||||
};
|
||||
user_data_t ud;
|
||||
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
||||
ud.min_level = log_level;
|
||||
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
||||
const user_data_t * ud = (const user_data_t *) user_data;
|
||||
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
||||
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
||||
}, &ud);
|
||||
|
||||
const std::vector<llama_token> tokens = get_tokens(128, 128, seed);
|
||||
|
||||
bool all_ok = true;
|
||||
common_log_flush(common_log_main());
|
||||
printf("|%15s|%30s|%6s|%8s|%6s|\n", "Model arch.", "Device", "Config", "NMSE", "Status");
|
||||
printf("|---------------|------------------------------|------|--------|------|\n");
|
||||
for (const llm_arch & arch : llm_arch_all()) {
|
||||
if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
|
||||
continue;
|
||||
}
|
||||
if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
|
||||
continue; // These models don't have usable implementations.
|
||||
}
|
||||
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
|
||||
continue; // FIXME CUDA backend crashes.
|
||||
}
|
||||
if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
|
||||
continue; // FIXME Embedding (?) models produce inconsistent results.
|
||||
}
|
||||
if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
|
||||
continue; // FIXME RWKV models hang indefinitely.
|
||||
}
|
||||
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
|
||||
arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
|
||||
continue; // TODO vocab
|
||||
}
|
||||
if (arch == LLM_ARCH_PLM) {
|
||||
continue; // TODO tensor shapes
|
||||
}
|
||||
|
||||
// FIXME some models are segfaulting with WebGPU:
|
||||
#ifdef GGML_USE_WEBGPU
|
||||
if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
|
||||
continue;
|
||||
}
|
||||
#endif // GGML_USE_WEBGPU
|
||||
|
||||
const bool encode = arch == LLM_ARCH_T5;
|
||||
for (bool moe : {false, true}) {
|
||||
if (moe && !moe_implemented(arch)) {
|
||||
continue;
|
||||
}
|
||||
if (!moe && moe_mandatory(arch)) {
|
||||
continue;
|
||||
}
|
||||
gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
|
||||
auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), seed, {});
|
||||
const std::vector<float> logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
continue;
|
||||
}
|
||||
auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), seed, {dev});
|
||||
const std::vector<float> logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
|
||||
const double nmse_val = nmse(logits_cpu, logits_dev);
|
||||
const bool ok = nmse_val <= 1e-4;
|
||||
all_ok = all_ok && ok;
|
||||
char nmse_str[10];
|
||||
snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val);
|
||||
printf("|%15s|%30s|%6s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev),
|
||||
moe ? "MoE" : "Dense", nmse_str, ok ? "\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m");
|
||||
}
|
||||
}
|
||||
}
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
return all_ok ? 0 : 1;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// FIXME these tests are disabled in the CI for macOS-latest-cmake-arm64 because they are segfaulting
|
||||
common_init();
|
||||
std::random_device rd;
|
||||
|
||||
llm_arch arch = LLM_ARCH_UNKNOWN;
|
||||
size_t seed = rd();
|
||||
ggml_log_level log_level = GGML_LOG_LEVEL_ERROR;
|
||||
std::string out;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if (strcmp(argv[i], "-a") == 0 || strcmp(argv[i], "--arch") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
const std::string arch_name = argv[++i];
|
||||
arch = llm_arch_from_string(arch_name);
|
||||
if (arch == LLM_ARCH_UNKNOWN) {
|
||||
LOG_ERR("%s: unkown LLM architecture: %s\n", __func__, arch_name.c_str());
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
usage(argv);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (strcmp(argv[i], "-s") == 0 || strcmp(argv[i], "--seed") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
seed = std::stoull(argv[++i]);
|
||||
} else {
|
||||
usage(argv);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) {
|
||||
log_level = GGML_LOG_LEVEL_INFO;
|
||||
continue;
|
||||
}
|
||||
if (strcmp(argv[i], "-o") == 0 || strcmp(argv[i], "--out") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
out = argv[++i];
|
||||
} else {
|
||||
usage(argv);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
if (!out.empty()) {
|
||||
return save_models(arch, seed, log_level, out);
|
||||
}
|
||||
return test_backends(arch, seed, log_level);
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "encountered runtime error: %s\n", err.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
|
@ -38,4 +38,5 @@ else()
|
|||
add_subdirectory(export-lora)
|
||||
endif()
|
||||
add_subdirectory(fit-params)
|
||||
add_subdirectory(results)
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -0,0 +1,8 @@
|
|||
# Executable target for the llama-results tool, built from results.cpp.
set(TARGET llama-results)
add_executable(${TARGET} results.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

# Only install the executable when LLAMA_TOOLS_INSTALL is enabled.
if(LLAMA_TOOLS_INSTALL)
    install(TARGETS ${TARGET} RUNTIME)
endif()
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
# Results
|
||||
|
||||
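The `llama-results` tool writes the logits that a model produces for a prompt to a file. When run with `--check`, it instead compares the freshly computed logits against such a file (e.g. one written with a previous commit) to detect whether the outputs of the model have changed.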
|
||||
Example usage:
|
||||
|
||||
``` sh
|
||||
llama-results --model model.gguf --output results.gguf --prompt "People die when they are killed." # writes results to file
|
||||
llama-results --model model.gguf --output results.gguf --prompt "People die when they are killed." --check # compares results vs file
|
||||
```
|
||||
|
||||
The metric by which the results are compared is the normalized mean squared error (NMSE) with a tolerance of $10^{-6}$.
|
||||
|
|
@ -0,0 +1,181 @@
|
|||
#include "ggml-cpp.h"
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "arg.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// normalized mean squared error = mse(a, b) / mse(a, 0)
|
||||
static double nmse(const std::vector<float> & a, const std::vector<float> & b) {
|
||||
GGML_ASSERT(a.size() == b.size());
|
||||
double mse_a_b = 0.0;
|
||||
double mse_a_0 = 0.0;
|
||||
|
||||
for (size_t i = 0; i < a.size(); i++) {
|
||||
float a_i = a[i];
|
||||
float b_i = b[i];
|
||||
|
||||
mse_a_b += (a_i - b_i) * (a_i - b_i);
|
||||
mse_a_0 += a_i * a_i;
|
||||
}
|
||||
|
||||
return mse_a_b / mse_a_0;
|
||||
}
|
||||
|
||||
// Evaluates `tokens` as one batch (sequence 0, positions 0..n-1, output requested for every
// token) and returns the logits of all positions concatenated position by position,
// i.e. tokens.size()*n_vocab floats.
//
// model:  model the context belongs to, used to query the vocabulary size.
// lctx:   context to evaluate with; its context size must be >= tokens.size().
// tokens: tokens to evaluate.
//
// Throws std::runtime_error if decoding fails or if the logits of a position cannot be
// retrieved. The batch is released on every path.
static std::vector<float> get_logits(
        llama_model * model, llama_context * lctx, const std::vector<llama_token> & tokens) {
    const uint32_t n_vocab  = llama_vocab_n_tokens(llama_model_get_vocab(model));
    const uint32_t n_ctx    = llama_n_ctx(lctx);
    const uint32_t n_tokens = tokens.size();
    GGML_ASSERT(n_tokens <= n_ctx);

    llama_batch batch = llama_batch_init(n_ctx, 0, 1);
    for (uint32_t pos = 0; pos < n_tokens; pos++) {
        common_batch_add(batch, tokens[pos], pos, {0}, true);
    }
    batch.n_tokens = n_tokens;

    // Collect the failure reason instead of throwing right away so that the batch is freed exactly once below.
    const char * error = nullptr;
    std::vector<float> ret;

    if (llama_decode(lctx, batch)) {
        error = "failed to decode batch";
    } else {
        ret.reserve(size_t(n_tokens)*n_vocab); // widen before multiplying to avoid 32 bit wrap-around
        for (uint32_t i = 0; i < n_tokens; i++) {
            const float * logits_ith = llama_get_logits_ith(lctx, i);
            if (logits_ith == nullptr) {
                // llama_get_logits_ith reports invalid/unavailable outputs via a null pointer
                error = "failed to get logits";
                break;
            }
            ret.insert(ret.end(), logits_ith, logits_ith + n_vocab);
        }
    }

    llama_batch_free(batch);
    if (error != nullptr) {
        throw std::runtime_error(error);
    }
    return ret;
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
params.escape = false;
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RESULTS)) {
|
||||
return 1;
|
||||
}
|
||||
if (params.out_file.empty()) {
|
||||
LOG_ERR("%s: an output file must be specified", __func__);
|
||||
return 1;
|
||||
}
|
||||
common_init();
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
common_init_result_ptr llama_init = common_init_from_params(params);
|
||||
struct llama_model * model = llama_init->model();
|
||||
struct llama_context * lctx = llama_init->context();
|
||||
if (model == nullptr) {
|
||||
LOG_ERR("%s: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
const uint32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
|
||||
|
||||
const std::vector<llama_token> tokens_calc = common_tokenize(lctx, params.prompt, true);
|
||||
const std::vector<float> logits_calc = get_logits(model, lctx, tokens_calc);
|
||||
GGML_ASSERT(logits_calc.size() == tokens_calc.size()*n_vocab);
|
||||
|
||||
struct gguf_init_params gguf_params = {
|
||||
/*.no_alloc =*/ true,
|
||||
/*.ctx =*/ nullptr,
|
||||
};
|
||||
gguf_context_ptr gguf_ctx_model(gguf_init_from_file(params.model.path.c_str(), gguf_params));
|
||||
|
||||
if (params.check) {
|
||||
LOG_INF("%s: loading results from %s...\n", __func__, params.out_file.c_str());
|
||||
gguf_context_ptr gguf_ctx;
|
||||
{
|
||||
struct gguf_init_params gguf_params = {
|
||||
/*no_alloc =*/ true,
|
||||
/*ctx =*/ nullptr,
|
||||
};
|
||||
gguf_ctx.reset(gguf_init_from_file(params.out_file.c_str(), gguf_params));
|
||||
}
|
||||
const std::string path_model_disk = gguf_get_val_str(gguf_ctx.get(), gguf_find_key(gguf_ctx.get(), "path_model"));
|
||||
GGML_ASSERT(path_model_disk == params.model.path); // TODO better checks
|
||||
|
||||
auto load_tensor_data = [&](const std::string & name, void * dst, const size_t size){
|
||||
const int64_t tid = gguf_find_tensor(gguf_ctx.get(), name.c_str());
|
||||
const size_t offset = gguf_get_data_offset(gguf_ctx.get()) + gguf_get_tensor_offset(gguf_ctx.get(), tid);
|
||||
GGML_ASSERT(size == gguf_get_tensor_size(gguf_ctx.get(), tid));
|
||||
|
||||
FILE * file = ggml_fopen(params.out_file.c_str(), "rb");
|
||||
if (file == nullptr) {
|
||||
throw std::runtime_error("failed to open results file");
|
||||
}
|
||||
if (fseek(file, offset, SEEK_SET) != 0) {
|
||||
throw std::runtime_error("fseek failed");
|
||||
}
|
||||
const size_t nbytes_read = fread(dst, 1, size, file);
|
||||
if (nbytes_read != size) {
|
||||
throw std::runtime_error("fread failed");
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<llama_token> tokens_disk(tokens_calc.size());
|
||||
load_tensor_data("tokens", tokens_disk.data(), tokens_disk.size()*sizeof(llama_token));
|
||||
GGML_ASSERT(tokens_disk.size() == tokens_calc.size());
|
||||
for (size_t i = 0; i < tokens_calc.size(); i++) {
|
||||
GGML_ASSERT(tokens_disk[i] == tokens_calc[i]);
|
||||
}
|
||||
|
||||
std::vector<float> logits_disk(logits_calc.size());
|
||||
load_tensor_data("logits", logits_disk.data(), logits_disk.size()*sizeof(float));
|
||||
const double nmse_val = nmse(logits_disk, logits_calc);
|
||||
LOG_INF("%s: NMSE=%.3e\n", __func__, nmse_val);
|
||||
|
||||
if (nmse_val > 1e-6) {
|
||||
printf("\033[1;31mFAIL\033[0m\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
ggml_context_ptr ggml_ctx_calc;
|
||||
{
|
||||
const size_t size_tokens = tokens_calc.size()*sizeof(llama_token) + ggml_tensor_overhead();
|
||||
const size_t size_logits = logits_calc.size()*sizeof(float) + ggml_tensor_overhead();
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ size_tokens + size_logits,
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
ggml_ctx_calc.reset(ggml_init(params));
|
||||
}
|
||||
|
||||
gguf_context_ptr gguf_ctx(gguf_init_empty());
|
||||
gguf_set_val_str(gguf_ctx.get(), "path_model", params.model.path.c_str());
|
||||
{
|
||||
ggml_tensor * t_tokens = ggml_new_tensor_1d(ggml_ctx_calc.get(), GGML_TYPE_I32, tokens_calc.size());
|
||||
ggml_set_name(t_tokens, "tokens");
|
||||
int32_t * tokens_data = (int32_t *) t_tokens->data;
|
||||
for (uint32_t i = 0; i < tokens_calc.size(); i++) {
|
||||
tokens_data[i] = tokens_calc[i];
|
||||
}
|
||||
gguf_add_tensor(gguf_ctx.get(), t_tokens);
|
||||
}
|
||||
{
|
||||
ggml_tensor * t_logits = ggml_new_tensor_2d(ggml_ctx_calc.get(), GGML_TYPE_F32, tokens_calc.size(), n_vocab);
|
||||
ggml_set_name(t_logits, "logits");
|
||||
float * logits_data = ggml_get_data_f32(t_logits);
|
||||
for (uint32_t i = 0; i < tokens_calc.size(); i++) {
|
||||
const float * logits_ith = llama_get_logits_ith(lctx, i);
|
||||
for (uint32_t j = 0; j < n_vocab; j++) {
|
||||
logits_data[i*n_vocab + j] = logits_ith[j];
|
||||
}
|
||||
}
|
||||
gguf_add_tensor(gguf_ctx.get(), t_logits);
|
||||
}
|
||||
LOG_INF("%s: writing results to %s...\n", __func__, params.out_file.c_str());
|
||||
gguf_write_to_file(gguf_ctx.get(), params.out_file.c_str(), /*only_meta =*/ false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue