Merge 60312f6a46 into 1fb2290a51
This commit is contained in:
commit
7d732af8e5
|
|
@ -77,6 +77,7 @@ extern "C" {
|
|||
};
|
||||
|
||||
GGML_API struct gguf_context * gguf_init_empty(void);
|
||||
GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
|
||||
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
|
||||
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
|
||||
|
||||
|
|
@ -189,6 +190,7 @@ extern "C" {
|
|||
//
|
||||
|
||||
// write the entire context to a binary file
|
||||
GGML_API bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta);
|
||||
GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||
|
||||
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
||||
|
|
|
|||
|
|
@ -773,6 +773,5 @@ inline bool ggml_check_edges(const struct ggml_cgraph * cgraph,
|
|||
|
||||
// expose GGUF internals for test code
|
||||
GGML_API size_t gguf_type_size(enum gguf_type type);
|
||||
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
|
||||
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
|
||||
#endif // __cplusplus
|
||||
|
|
|
|||
|
|
@ -394,7 +394,11 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
|
|||
return true;
|
||||
}
|
||||
|
||||
struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
|
||||
struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
|
||||
if (!file) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
const struct gguf_reader gr(file);
|
||||
struct gguf_context * ctx = new gguf_context;
|
||||
|
||||
|
|
@ -848,7 +852,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
struct gguf_context * result = gguf_init_from_file_impl(file, params);
|
||||
struct gguf_context * result = gguf_init_from_file_ptr(file, params);
|
||||
fclose(file);
|
||||
return result;
|
||||
}
|
||||
|
|
@ -1508,6 +1512,19 @@ void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & bu
|
|||
gguf_write_out(ctx, gw, only_meta);
|
||||
}
|
||||
|
||||
bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta) {
|
||||
GGML_ASSERT(file);
|
||||
|
||||
try {
|
||||
gguf_writer_file gw(file);
|
||||
gguf_write_out(ctx, gw, only_meta);
|
||||
} catch (const std::runtime_error& ex) {
|
||||
GGML_LOG_ERROR("%s: failed to write GGUF data: %s\n", __func__, ex.what());
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
|
||||
FILE * file = ggml_fopen(fname, "wb");
|
||||
|
||||
|
|
@ -1516,17 +1533,13 @@ bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo
|
|||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
gguf_writer_file gw(file);
|
||||
gguf_write_out(ctx, gw, only_meta);
|
||||
} catch (const std::runtime_error& ex) {
|
||||
GGML_LOG_ERROR("%s: failed to write GGUF data into '%s': %s\n", __func__, fname, ex.what());
|
||||
fclose(file);
|
||||
return false;
|
||||
const bool success = gguf_write_to_file_ptr(ctx, file, only_meta);
|
||||
if (!success) {
|
||||
GGML_LOG_ERROR("%s: failed to write GGUF data into '%s'\n", __func__, fname);
|
||||
}
|
||||
|
||||
fclose(file);
|
||||
return true;
|
||||
return success;
|
||||
}
|
||||
|
||||
size_t gguf_get_meta_size(const struct gguf_context * ctx) {
|
||||
|
|
|
|||
|
|
@ -465,6 +465,11 @@ extern "C" {
|
|||
const char * path_model,
|
||||
struct llama_model_params params);
|
||||
|
||||
// Load a model from an open FILE pointer
|
||||
LLAMA_API struct llama_model * llama_model_load_from_file_ptr(
|
||||
FILE * file,
|
||||
struct llama_model_params params);
|
||||
|
||||
// Load a model from multiple splits (support custom naming scheme)
|
||||
// The paths must be in the correct order
|
||||
LLAMA_API struct llama_model * llama_model_load_from_splits(
|
||||
|
|
|
|||
|
|
@ -544,6 +544,10 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
|||
case LLM_ARCH_CLIP:
|
||||
return {};
|
||||
case LLM_ARCH_LLAMA:
|
||||
case LLM_ARCH_REFACT:
|
||||
case LLM_ARCH_MINICPM:
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_DECI:
|
||||
case LLM_ARCH_MISTRAL3:
|
||||
case LLM_ARCH_LLAMA_EMBED:
|
||||
|
|
@ -744,11 +748,9 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
|||
LLM_TENSOR_ATTN_Q_NORM,
|
||||
LLM_TENSOR_ATTN_K_NORM,
|
||||
};
|
||||
case LLM_ARCH_REFACT:
|
||||
case LLM_ARCH_QWEN2:
|
||||
case LLM_ARCH_QWEN2VL:
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_ERNIE4_5:
|
||||
case LLM_ARCH_PADDLEOCR:
|
||||
case LLM_ARCH_SMOLLM3:
|
||||
|
|
@ -759,6 +761,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
|||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_OUTPUT,
|
||||
LLM_TENSOR_ROPE_FREQS,
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_Q,
|
||||
LLM_TENSOR_ATTN_K,
|
||||
|
|
@ -1232,29 +1235,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
|||
LLM_TENSOR_FFN_DOWN,
|
||||
LLM_TENSOR_FFN_UP,
|
||||
};
|
||||
case LLM_ARCH_MINICPM:
|
||||
return {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_OUTPUT,
|
||||
LLM_TENSOR_ROPE_FREQS,
|
||||
LLM_TENSOR_ROPE_FACTORS_LONG,
|
||||
LLM_TENSOR_ROPE_FACTORS_SHORT,
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_Q,
|
||||
LLM_TENSOR_ATTN_K,
|
||||
LLM_TENSOR_ATTN_V,
|
||||
LLM_TENSOR_ATTN_OUT,
|
||||
LLM_TENSOR_ATTN_ROT_EMBD,
|
||||
LLM_TENSOR_FFN_GATE_INP,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
LLM_TENSOR_FFN_GATE,
|
||||
LLM_TENSOR_FFN_DOWN,
|
||||
LLM_TENSOR_FFN_UP,
|
||||
LLM_TENSOR_FFN_GATE_EXP,
|
||||
LLM_TENSOR_FFN_DOWN_EXP,
|
||||
LLM_TENSOR_FFN_UP_EXP,
|
||||
};
|
||||
case LLM_ARCH_MINICPM3:
|
||||
return {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
|
|
@ -1442,6 +1422,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
|||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_ROPE_FREQS,
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_Q,
|
||||
LLM_TENSOR_ATTN_K,
|
||||
|
|
@ -1657,7 +1638,9 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
|||
LLM_TENSOR_ROPE_FREQS,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_OUTPUT,
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_QKV,
|
||||
LLM_TENSOR_ATTN_Q,
|
||||
LLM_TENSOR_ATTN_K,
|
||||
LLM_TENSOR_ATTN_V,
|
||||
|
|
@ -2061,30 +2044,12 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
|||
LLM_TENSOR_FFN_DOWN,
|
||||
LLM_TENSOR_FFN_UP,
|
||||
};
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
return {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_OUTPUT,
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_Q,
|
||||
LLM_TENSOR_ATTN_K,
|
||||
LLM_TENSOR_ATTN_V,
|
||||
LLM_TENSOR_ATTN_OUT,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
LLM_TENSOR_FFN_GATE_INP,
|
||||
LLM_TENSOR_FFN_GATE_EXPS,
|
||||
LLM_TENSOR_FFN_DOWN_EXPS,
|
||||
LLM_TENSOR_FFN_UP_EXPS,
|
||||
LLM_TENSOR_FFN_GATE_SHEXP,
|
||||
LLM_TENSOR_FFN_DOWN_SHEXP,
|
||||
LLM_TENSOR_FFN_UP_SHEXP,
|
||||
};
|
||||
case LLM_ARCH_GRANITE_HYBRID:
|
||||
return {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_OUTPUT,
|
||||
LLM_TENSOR_ROPE_FREQS,
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_SSM_IN,
|
||||
LLM_TENSOR_SSM_CONV1D,
|
||||
|
|
@ -2412,6 +2377,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
|||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_OUTPUT,
|
||||
LLM_TENSOR_ROPE_FREQS,
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_QKV,
|
||||
LLM_TENSOR_ATTN_OUT,
|
||||
|
|
@ -2789,7 +2755,12 @@ std::string LLM_TN_IMPL::str() const {
|
|||
}
|
||||
|
||||
if (model_tensors.find(tensor) == model_tensors.end()) {
|
||||
return LLM_TENSOR_NAMES.at(tensor);
|
||||
const char * name = LLM_TENSOR_NAMES.at(tensor);
|
||||
if (suffix != nullptr || bid != -1 || xid != -1) {
|
||||
LLAMA_LOG_WARN("%s: cannot properly format tensor name %s with suffix=%s bid=%d xid=%d\n",
|
||||
__func__, name, suffix, bid, xid);
|
||||
}
|
||||
return name;
|
||||
}
|
||||
|
||||
std::string name = ::format(LLM_TENSOR_NAMES.at(tensor), bid, xid);
|
||||
|
|
|
|||
|
|
@ -86,6 +86,14 @@ struct llama_file::impl {
|
|||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
impl(FILE * file) : owns_fp(false) {
|
||||
fp = file;
|
||||
fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
size_t tell() const {
|
||||
LARGE_INTEGER li;
|
||||
li.QuadPart = 0;
|
||||
|
|
@ -159,7 +167,7 @@ struct llama_file::impl {
|
|||
}
|
||||
|
||||
~impl() {
|
||||
if (fp) {
|
||||
if (fp && owns_fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
}
|
||||
|
|
@ -209,6 +217,13 @@ struct llama_file::impl {
|
|||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
impl(FILE * file) : fname("(file*)"), owns_fp(false) {
|
||||
fp = file;
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
size_t tell() const {
|
||||
if (fd == -1) {
|
||||
long ret = std::ftell(fp);
|
||||
|
|
@ -353,7 +368,7 @@ struct llama_file::impl {
|
|||
~impl() {
|
||||
if (fd != -1) {
|
||||
close(fd);
|
||||
} else {
|
||||
} else if (owns_fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
}
|
||||
|
|
@ -369,10 +384,14 @@ struct llama_file::impl {
|
|||
|
||||
FILE * fp{};
|
||||
size_t size{};
|
||||
bool owns_fp = true;
|
||||
};
|
||||
|
||||
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
|
||||
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
|
||||
|
||||
llama_file::llama_file(FILE * file) : pimpl(std::make_unique<impl>(file)) {}
|
||||
|
||||
llama_file::~llama_file() = default;
|
||||
|
||||
size_t llama_file::tell() const { return pimpl->tell(); }
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
|||
|
||||
struct llama_file {
|
||||
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
|
||||
llama_file(FILE * file);
|
||||
~llama_file();
|
||||
|
||||
size_t tell() const;
|
||||
|
|
|
|||
|
|
@ -511,6 +511,7 @@ llama_model_loader::llama_model_loader(
|
|||
void * set_tensor_data_ud,
|
||||
const std::string & fname,
|
||||
std::vector<std::string> & splits,
|
||||
FILE * file,
|
||||
bool use_mmap,
|
||||
bool use_direct_io,
|
||||
bool check_tensors,
|
||||
|
|
@ -658,6 +659,36 @@ llama_model_loader::llama_model_loader(
|
|||
|
||||
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
|
||||
}
|
||||
} else if (file != nullptr) {
|
||||
struct ggml_context * ctx = NULL;
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx,
|
||||
};
|
||||
|
||||
metadata_ptr.reset(gguf_init_from_file_ptr(file, params));
|
||||
metadata = metadata_ptr.get();
|
||||
if (metadata == nullptr) {
|
||||
throw std::runtime_error(format("%s: failed to load model from file pointer", __func__));
|
||||
}
|
||||
|
||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
|
||||
files.emplace_back(new llama_file(file));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
// Save tensors data offset info of the main file.
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
std::string tensor_name = std::string(cur->name);
|
||||
// make sure there is no duplicated tensor names
|
||||
if (weights_map.find(tensor_name) != weights_map.end()) {
|
||||
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
||||
}
|
||||
n_elements += ggml_nelements(cur);
|
||||
n_bytes += ggml_nbytes(cur);
|
||||
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
|
||||
}
|
||||
} else {
|
||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
|
|
@ -669,7 +700,7 @@ llama_model_loader::llama_model_loader(
|
|||
fver = (enum llama_fver) gguf_get_version(metadata);
|
||||
|
||||
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
||||
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
|
||||
__func__, n_kv, n_tensors, fname.empty() ? "(file*)" : fname.c_str(), llama_file_version_name(fver));
|
||||
|
||||
// determine file type based on the number of tensors for each quantization and print meta data
|
||||
// TODO: make optional
|
||||
|
|
|
|||
|
|
@ -125,6 +125,7 @@ struct llama_model_loader {
|
|||
void * set_tensor_data_ud,
|
||||
const std::string & fname,
|
||||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||
FILE * file,
|
||||
bool use_mmap,
|
||||
bool use_direct_io,
|
||||
bool check_tensors,
|
||||
|
|
|
|||
|
|
@ -1,7 +1,9 @@
|
|||
#include "llama-model-saver.h"
|
||||
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include "llama-arch.h"
|
||||
#include "llama.h"
|
||||
#include "llama-hparams.h"
|
||||
#include "llama-model.h"
|
||||
|
|
@ -10,8 +12,33 @@
|
|||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
bool llama_model_saver_supports_arch(llm_arch arch) {
|
||||
switch (arch) {
|
||||
case LLM_ARCH_QWEN3NEXT:
|
||||
case LLM_ARCH_QWEN35:
|
||||
case LLM_ARCH_QWEN35MOE:
|
||||
case LLM_ARCH_PLAMO3:
|
||||
case LLM_ARCH_GEMMA3:
|
||||
case LLM_ARCH_GEMMA3N:
|
||||
case LLM_ARCH_COHERE2:
|
||||
case LLM_ARCH_OLMO2:
|
||||
case LLM_ARCH_BITNET:
|
||||
case LLM_ARCH_T5:
|
||||
case LLM_ARCH_EXAONE_MOE:
|
||||
case LLM_ARCH_AFMOE:
|
||||
case LLM_ARCH_APERTUS:
|
||||
case LLM_ARCH_MIMO2:
|
||||
case LLM_ARCH_STEP35:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
llama_model_saver::llama_model_saver(const struct llama_model * model) :
|
||||
gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {}
|
||||
gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {
|
||||
GGML_ASSERT(llama_model_saver_supports_arch(model->arch));
|
||||
}
|
||||
|
||||
llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx) :
|
||||
gguf_ctx(gguf_ctx == nullptr ? gguf_init_empty() : gguf_ctx), gguf_ctx_owned(gguf_ctx == nullptr), model(nullptr), llm_kv(arch) {}
|
||||
|
|
@ -105,7 +132,10 @@ void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
|
|||
return;
|
||||
}
|
||||
if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) {
|
||||
GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME
|
||||
const std::string tensor_name = tensor->name;
|
||||
GGML_ASSERT(
|
||||
tensor_name == "rope_freqs.weight" || tensor_name == "rope_factors_long.weight" ||
|
||||
tensor_name == "rope_factors_short.weight"); // FIXME
|
||||
return;
|
||||
}
|
||||
gguf_add_tensor(gguf_ctx, tensor);
|
||||
|
|
@ -127,6 +157,7 @@ void llama_model_saver::add_kv_from_model() {
|
|||
tokens[id] = token_data.text;
|
||||
scores[id] = token_data.score;
|
||||
|
||||
// FIXME should this be treated as flags?
|
||||
switch(token_data.attr) {
|
||||
case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
|
||||
case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
|
||||
|
|
@ -134,6 +165,9 @@ void llama_model_saver::add_kv_from_model() {
|
|||
case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
|
||||
case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
|
||||
case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
|
||||
// case LLAMA_TOKEN_ATTR_NORMALIZED: ???
|
||||
// case LLAMA_TOKEN_ATTR_LSTRIP: ???
|
||||
// case LLAMA_TOKEN_ATTR_RSTRIP: ???
|
||||
case LLAMA_TOKEN_ATTR_UNDEFINED:
|
||||
default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
|
||||
}
|
||||
|
|
@ -144,6 +178,19 @@ void llama_model_saver::add_kv_from_model() {
|
|||
add_kv(LLM_KV_GENERAL_ARCHITECTURE, model->arch_name());
|
||||
// add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
|
||||
// add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
|
||||
// add_kv(LLM_KV_GENERAL_FILE_TYPE, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SAMPLING_SEQUENCE, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SAMPLING_TOP_K, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SAMPLING_TOP_P, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SAMPLING_MIN_P, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SAMPLING_TEMP, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU, ???);
|
||||
// add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA, ???);
|
||||
add_kv(LLM_KV_GENERAL_NAME, model->name);
|
||||
// add_kv(LLM_KV_GENERAL_AUTHOR, ???);
|
||||
// add_kv(LLM_KV_GENERAL_VERSION, ???);
|
||||
|
|
@ -163,17 +210,31 @@ void llama_model_saver::add_kv_from_model() {
|
|||
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
|
||||
add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
|
||||
add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
|
||||
add_kv(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp);
|
||||
add_kv(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp);
|
||||
add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
|
||||
// add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???);
|
||||
add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert);
|
||||
add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
|
||||
add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
add_kv(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups);
|
||||
add_kv(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used);
|
||||
add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
add_kv(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm);
|
||||
add_kv(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
||||
add_kv(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
|
||||
add_kv(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
|
||||
add_kv(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers);
|
||||
add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers);
|
||||
add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers);
|
||||
add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type));
|
||||
add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id);
|
||||
add_kv(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer);
|
||||
add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping);
|
||||
add_kv(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping);
|
||||
add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping);
|
||||
add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm);
|
||||
add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers);
|
||||
|
|
@ -181,6 +242,9 @@ void llama_model_saver::add_kv_from_model() {
|
|||
add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
|
||||
add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
|
||||
add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
||||
add_kv(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count);
|
||||
add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
|
||||
// add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???);
|
||||
|
||||
add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
|
||||
add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
|
||||
|
|
@ -188,22 +252,39 @@ void llama_model_saver::add_kv_from_model() {
|
|||
add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
|
||||
add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full);
|
||||
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full);
|
||||
add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
|
||||
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
|
||||
add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
add_kv(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
|
||||
add_kv(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
|
||||
add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
||||
add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
||||
add_kv(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay);
|
||||
add_kv(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr);
|
||||
add_kv(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
|
||||
add_kv(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate);
|
||||
add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
|
||||
add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
// add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, ???);
|
||||
add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
||||
add_kv(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale);
|
||||
add_kv(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length);
|
||||
add_kv(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale);
|
||||
add_kv(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl);
|
||||
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl);
|
||||
add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
|
||||
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
|
||||
add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
|
||||
add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
|
||||
add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k);
|
||||
|
||||
const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
|
||||
|
||||
add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full);
|
||||
add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa);
|
||||
add_kv(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections);
|
||||
add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
|
||||
add_kv(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
|
||||
// add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
|
||||
add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
|
||||
add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor);
|
||||
|
|
@ -211,6 +292,10 @@ void llama_model_saver::add_kv_from_model() {
|
|||
add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn);
|
||||
add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned);
|
||||
add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
|
||||
add_kv(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor);
|
||||
add_kv(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor);
|
||||
add_kv(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast);
|
||||
add_kv(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow);
|
||||
|
||||
// TODO: implement split file support
|
||||
// add_kv(LLM_KV_SPLIT_NO, ???);
|
||||
|
|
@ -221,8 +306,11 @@ void llama_model_saver::add_kv_from_model() {
|
|||
add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
||||
add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
||||
add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
||||
add_kv(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
|
||||
add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms);
|
||||
|
||||
add_kv(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda);
|
||||
|
||||
add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
|
||||
|
||||
add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model());
|
||||
|
|
@ -260,15 +348,39 @@ void llama_model_saver::add_kv_from_model() {
|
|||
// TODO: implement LoRA support
|
||||
// add_kv(LLM_KV_ADAPTER_TYPE, ???);
|
||||
// add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???);
|
||||
// add_kv(LLM_KV_ADAPTER_LORA_TASK_NAME, ???);
|
||||
// add_kv(LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, ???);
|
||||
// add_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, ???);
|
||||
|
||||
add_kv(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
|
||||
add_kv(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
|
||||
|
||||
add_kv(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
|
||||
add_kv(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
|
||||
|
||||
add_kv(LLM_KV_CLASSIFIER_OUTPUT_LABELS, model->classifier_labels);
|
||||
|
||||
add_kv(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
|
||||
|
||||
add_kv(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n);
|
||||
add_kv(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p);
|
||||
add_kv(LLM_KV_XIELU_BETA, hparams.xielu_beta);
|
||||
add_kv(LLM_KV_XIELU_EPS, hparams.xielu_eps);
|
||||
|
||||
// deprecated
|
||||
// add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???);
|
||||
// add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???);
|
||||
// add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???);
|
||||
|
||||
add_kv(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in);
|
||||
add_kv(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out);
|
||||
add_kv(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in);
|
||||
add_kv(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out);
|
||||
}
|
||||
|
||||
void llama_model_saver::add_tensors_from_model() {
|
||||
if (std::string(model->output->name) != std::string(model->tok_embd->name)) {
|
||||
if (model->output != nullptr &&
|
||||
std::string(model->output->name) != std::string(model->tok_embd->name)) {
|
||||
add_tensor(model->tok_embd); // some models use the same tensor for tok_embd and output
|
||||
}
|
||||
add_tensor(model->type_embd);
|
||||
|
|
@ -297,3 +409,6 @@ void llama_model_saver::save(const std::string & path_model) {
|
|||
gguf_write_to_file(gguf_ctx, path_model.c_str(), false);
|
||||
}
|
||||
|
||||
void llama_model_saver::save(FILE * file) {
|
||||
gguf_write_to_file_ptr(gguf_ctx, file, false);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,6 +6,9 @@
|
|||
|
||||
#include <vector>
|
||||
|
||||
// FIXME temporary function for better error messages
|
||||
bool llama_model_saver_supports_arch(llm_arch arch);
|
||||
|
||||
struct llama_model_saver {
|
||||
struct gguf_context * gguf_ctx = nullptr;
|
||||
const bool gguf_ctx_owned;
|
||||
|
|
@ -37,4 +40,5 @@ struct llama_model_saver {
|
|||
void add_tensors_from_model();
|
||||
|
||||
void save(const std::string & path_model);
|
||||
void save(FILE * file);
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1624,7 +1624,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
|
||||
// (optional) temperature tuning - used by mistral-large
|
||||
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false); // FIXME why not use temperature_length?
|
||||
|
||||
hparams.f_attn_temp_offset = 0.0f;
|
||||
|
||||
|
|
|
|||
|
|
@ -859,7 +859,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
|
||||
std::vector<std::string> splits = {};
|
||||
llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
|
||||
fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||
fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
llama_model model(llama_model_default_params());
|
||||
|
|
|
|||
|
|
@ -828,7 +828,7 @@ int64_t llama_time_us(void) {
|
|||
|
||||
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
||||
static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
|
||||
const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
|
||||
const std::string & fname, std::vector<std::string> & splits, FILE * file, llama_model & model, llama_model_params & params) {
|
||||
// loading time will be recalculated after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
model.t_load_us = 0;
|
||||
|
|
@ -837,7 +837,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens
|
|||
model.t_start_us = tm.t_start_us;
|
||||
|
||||
try {
|
||||
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
|
||||
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io,
|
||||
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||
|
||||
ml.print_info();
|
||||
|
|
@ -889,8 +889,24 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||
void * set_tensor_data_ud,
|
||||
const std::string & path_model,
|
||||
std::vector<std::string> & splits,
|
||||
FILE * file,
|
||||
struct llama_model_params params) {
|
||||
GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
|
||||
{
|
||||
int n_sources_defined = 0;
|
||||
if (metadata != nullptr) {
|
||||
n_sources_defined++;
|
||||
}
|
||||
if (!path_model.empty()) {
|
||||
n_sources_defined++;
|
||||
}
|
||||
if (file != nullptr) {
|
||||
n_sources_defined++;
|
||||
}
|
||||
if (n_sources_defined != 1) {
|
||||
LLAMA_LOG_ERROR("%s: exactly one out metadata, path_model, and file must be defined\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
ggml_time_init();
|
||||
|
||||
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
|
||||
|
|
@ -1011,7 +1027,7 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||
props.memory_free/1024/1024);
|
||||
}
|
||||
|
||||
const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
|
||||
const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, *model, params);
|
||||
GGML_ASSERT(status <= 0);
|
||||
if (status < 0) {
|
||||
if (status == -1) {
|
||||
|
|
@ -1037,7 +1053,7 @@ struct llama_model * llama_model_init_from_user(
|
|||
std::vector<std::string> splits = {};
|
||||
params.use_mmap = false;
|
||||
params.use_extra_bufts = false;
|
||||
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
|
||||
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*file*/ nullptr, params);
|
||||
}
|
||||
// deprecated
|
||||
struct llama_model * llama_load_model_from_file(
|
||||
|
|
@ -1050,7 +1066,7 @@ struct llama_model * llama_model_load_from_file(
|
|||
const char * path_model,
|
||||
struct llama_model_params params) {
|
||||
std::vector<std::string> splits = {};
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*file*/ nullptr, params);
|
||||
}
|
||||
|
||||
struct llama_model * llama_model_load_from_splits(
|
||||
|
|
@ -1066,7 +1082,17 @@ struct llama_model * llama_model_load_from_splits(
|
|||
for (size_t i = 0; i < n_paths; ++i) {
|
||||
splits.push_back(paths[i]);
|
||||
}
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*file*/ nullptr, params);
|
||||
}
|
||||
|
||||
struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params) {
|
||||
if (!file) {
|
||||
LLAMA_LOG_ERROR("%s: file is NULL\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
std::string path_model;
|
||||
std::vector<std::string> splits = {};
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, file, params);
|
||||
}
|
||||
|
||||
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
|
||||
|
|
|
|||
|
|
@ -742,7 +742,7 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
|
|||
/*ctx =*/ hft >= offset_has_data ? &ctx : nullptr,
|
||||
};
|
||||
|
||||
struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
|
||||
struct gguf_context * gguf_ctx = gguf_init_from_file_ptr(file, gguf_params);
|
||||
|
||||
if (expect_context_not_null(hft)) {
|
||||
printf("%s: - context_not_null: ", __func__);
|
||||
|
|
@ -1125,19 +1125,15 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
|
|||
GGML_ASSERT(file);
|
||||
#endif // _WIN32
|
||||
|
||||
{
|
||||
std::vector<int8_t> buf;
|
||||
gguf_write_to_buf(gguf_ctx_0, buf, only_meta);
|
||||
GGML_ASSERT(fwrite(buf.data(), 1, buf.size(), file) == buf.size());
|
||||
rewind(file);
|
||||
}
|
||||
gguf_write_to_file_ptr(gguf_ctx_0, file, only_meta);
|
||||
rewind(file);
|
||||
|
||||
struct ggml_context * ctx_1 = nullptr;
|
||||
struct gguf_init_params gguf_params = {
|
||||
/*no_alloc =*/ false,
|
||||
/*ctx =*/ only_meta ? nullptr : &ctx_1,
|
||||
};
|
||||
struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params);
|
||||
struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);
|
||||
|
||||
printf("%s: same_version: ", __func__);
|
||||
if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) {
|
||||
|
|
|
|||
|
|
@ -90,6 +90,7 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
|
|||
n_embd = 64;
|
||||
n_head = 1;
|
||||
n_ff = 96;
|
||||
n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded
|
||||
} else if (arch == LLM_ARCH_DEEPSEEK2
|
||||
|| arch == LLM_ARCH_GLM_DSA
|
||||
|| arch == LLM_ARCH_KIMI_LINEAR
|
||||
|
|
@ -101,8 +102,6 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
|
|||
n_layer = 3;
|
||||
} else if (arch == LLM_ARCH_CHAMELEON) {
|
||||
n_vocab = 10240;
|
||||
} else if (arch == LLM_ARCH_GEMMA3N) {
|
||||
n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded
|
||||
}
|
||||
|
||||
const uint32_t n_embd_head = n_embd / n_head;
|
||||
|
|
@ -231,9 +230,15 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
|
|||
return ret;
|
||||
}
|
||||
|
||||
static bool silent_model_load_progress(float /*progress*/, void * /*user_data*/) {
|
||||
return true;
|
||||
}
|
||||
|
||||
static std::pair<llama_model_ptr, llama_context_ptr> get_model_and_ctx(
|
||||
struct gguf_context * gguf_ctx, const size_t seed, const std::vector<ggml_backend_dev_t> & devs) {
|
||||
struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector<ggml_backend_dev_t> & devs) {
|
||||
GGML_ASSERT((gguf_ctx == nullptr) != (file == nullptr));
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.progress_callback = silent_model_load_progress;
|
||||
std::vector<ggml_backend_dev_t> devs_copy = devs;
|
||||
devs_copy.push_back(nullptr);
|
||||
model_params.devices = devs_copy.data();
|
||||
|
|
@ -244,7 +249,9 @@ static std::pair<llama_model_ptr, llama_context_ptr> get_model_and_ctx(
|
|||
ctx_params.n_threads_batch = 4;
|
||||
|
||||
size_t tmp = seed;
|
||||
llama_model_ptr model(llama_model_init_from_user(gguf_ctx, set_tensor_data, &tmp, model_params));
|
||||
llama_model_ptr model(gguf_ctx != nullptr ?
|
||||
llama_model_init_from_user(gguf_ctx, set_tensor_data, &tmp, model_params) :
|
||||
llama_model_load_from_file_ptr(file, model_params));
|
||||
if (!model) {
|
||||
throw std::runtime_error("failed to create llama model");
|
||||
}
|
||||
|
|
@ -351,7 +358,6 @@ static bool moe_implemented(const llm_arch arch) {
|
|||
}
|
||||
|
||||
static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, const std::string & dir) {
|
||||
GGML_ABORT("llama_model_save_to_file is broken");
|
||||
struct user_data_t {
|
||||
struct {
|
||||
ggml_log_callback callback;
|
||||
|
|
@ -376,6 +382,16 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
|
|||
if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
|
||||
continue; // These models don't have usable implementations.
|
||||
}
|
||||
if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
|
||||
continue; // FIXME
|
||||
}
|
||||
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
|
||||
arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
|
||||
continue; // TODO vocab
|
||||
}
|
||||
if (arch == LLM_ARCH_PLM) {
|
||||
continue; // TODO tensor shapes
|
||||
}
|
||||
for (bool moe : {false, true}) {
|
||||
if (moe && !moe_implemented(arch)) {
|
||||
continue;
|
||||
|
|
@ -383,8 +399,12 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
|
|||
if (!moe && moe_mandatory(arch)) {
|
||||
continue;
|
||||
}
|
||||
if (!llama_model_saver_supports_arch(arch)) {
|
||||
LOG_INF("%s: %s model (%s) is unsupported, skipping\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense");
|
||||
continue;
|
||||
}
|
||||
gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
|
||||
auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), seed, {});
|
||||
auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {});
|
||||
const std::string path = dir + "/" + llm_arch_name(arch) + (moe ? "-moe.gguf" : "-dense.gguf");
|
||||
LOG_INF("%s: Saving %s model (%s) to %s...\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense", path.c_str());
|
||||
llama_model_save_to_file(model_and_ctx.first.get(), path.c_str());
|
||||
|
|
@ -416,8 +436,8 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
|
|||
|
||||
bool all_ok = true;
|
||||
common_log_flush(common_log_main());
|
||||
printf("|%15s|%30s|%6s|%8s|%6s|\n", "Model arch.", "Device", "Config", "NMSE", "Status");
|
||||
printf("|---------------|------------------------------|------|--------|------|\n");
|
||||
printf("|%15s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
|
||||
printf("|---------------|------------------------------|------|---------------|---------|\n");
|
||||
for (const llm_arch & arch : llm_arch_all()) {
|
||||
if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
|
||||
continue;
|
||||
|
|
@ -458,22 +478,50 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
|
|||
continue;
|
||||
}
|
||||
gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
|
||||
auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), seed, {});
|
||||
auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {});
|
||||
const std::vector<float> logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
continue;
|
||||
}
|
||||
auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), seed, {dev});
|
||||
auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {dev});
|
||||
std::string config_name = moe ? "MoE" : "Dense";
|
||||
const std::vector<float> logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
|
||||
const double nmse_val = nmse(logits_cpu, logits_dev);
|
||||
const bool ok = nmse_val <= 1e-4;
|
||||
all_ok = all_ok && ok;
|
||||
char nmse_str[10];
|
||||
snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val);
|
||||
printf("|%15s|%30s|%6s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev),
|
||||
moe ? "MoE" : "Dense", nmse_str, ok ? "\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m");
|
||||
std::string status_nmse = "\033[1;32mOK\033[0m";
|
||||
if (nmse_val > 1e-4) {
|
||||
all_ok = false;
|
||||
status_nmse = "\033[1;31mFAIL\033[0m";
|
||||
}
|
||||
|
||||
std::string status_roundtrip = "\033[1;33mSKIP\033[0m";
|
||||
FILE * file = tmpfile(); // Can be null on Windows without administrator privileges.
|
||||
if (file != nullptr && llama_model_saver_supports_arch(arch)) {
|
||||
llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get());
|
||||
ms.add_kv_from_model();
|
||||
ms.add_tensors_from_model();
|
||||
ms.save(file);
|
||||
rewind(file);
|
||||
|
||||
auto model_and_ctx_roundtrip = get_model_and_ctx(nullptr, file, seed, {dev});
|
||||
const std::vector<float> logits_roundtrip = get_logits(
|
||||
model_and_ctx_roundtrip.first.get(), model_and_ctx_roundtrip.second.get(), tokens, encode);
|
||||
status_roundtrip = "\033[1;32mOK\033[0m";
|
||||
GGML_ASSERT(logits_roundtrip.size() == logits_dev.size());
|
||||
for (size_t i = 0; i < logits_roundtrip.size(); i++) {
|
||||
if (logits_roundtrip[i] != logits_dev[i]) {
|
||||
all_ok = false;
|
||||
status_roundtrip = "\033[1;31mFAIL\033[0m";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("|%15s|%30s|%6s|%15s (%8s)|%20s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev),
|
||||
config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue