llama : add fd-based model loading via llama_model_load_from_fd
This commit is contained in:
parent
182acfe5c5
commit
158239a2b1
|
|
@ -78,6 +78,7 @@ extern "C" {
|
|||
|
||||
GGML_API struct gguf_context * gguf_init_empty(void);
|
||||
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
|
||||
GGML_API struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params);
|
||||
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
|
||||
|
||||
GGML_API void gguf_free(struct gguf_context * ctx);
|
||||
|
|
|
|||
|
|
@ -15,6 +15,10 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#define GGUF_MAX_STRING_LENGTH (1024*1024*1024)
|
||||
#define GGUF_MAX_ARRAY_ELEMENTS (1024*1024*1024)
|
||||
|
||||
|
|
@ -853,6 +857,34 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
return result;
|
||||
}
|
||||
|
||||
#ifndef _WIN32
|
||||
struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) {
|
||||
const int fd_duped = dup(fd);
|
||||
if (fd_duped < 0) {
|
||||
GGML_LOG_ERROR("%s: failed to dup fd %d: %s\n", __func__, fd, strerror(errno));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
FILE * file = fdopen(fd_duped, "rb");
|
||||
if (!file) {
|
||||
close(fd_duped);
|
||||
GGML_LOG_ERROR("%s: failed to fdopen fd %d: %s\n", __func__, fd, strerror(errno));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
struct gguf_context * result = gguf_init_from_file_impl(file, params);
|
||||
fclose(file);
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
// Windows stub: fd-based loading is unavailable on this platform; always fails.
struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) {
    GGML_UNUSED(fd);
    GGML_UNUSED(params);
    GGML_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__);
    return nullptr;
}
|
||||
#endif
|
||||
|
||||
void gguf_free(struct gguf_context * ctx) {
|
||||
if (ctx == nullptr) {
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -464,6 +464,10 @@ extern "C" {
|
|||
const char * path_model,
|
||||
struct llama_model_params params);
|
||||
|
||||
// Load a model from a POSIX file descriptor
|
||||
// Not supported on Windows
|
||||
LLAMA_API struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params);
|
||||
|
||||
// Load a model from multiple splits (support custom naming scheme)
|
||||
// The paths must be in the correct order
|
||||
LLAMA_API struct llama_model * llama_model_load_from_splits(
|
||||
|
|
|
|||
|
|
@ -86,6 +86,10 @@ struct llama_file::impl {
|
|||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
// Windows stub: constructing a llama_file from a POSIX fd is not supported here;
// always throws. The POSIX implementation lives in the non-Windows impl.
impl(int /*fd_src*/) {
    throw std::runtime_error("fd-based loading is not supported on Windows");
}
|
||||
|
||||
size_t tell() const {
|
||||
LARGE_INTEGER li;
|
||||
li.QuadPart = 0;
|
||||
|
|
@ -209,6 +213,25 @@ struct llama_file::impl {
|
|||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
impl(int fd_src) : fname("(fd:" + std::to_string(fd_src) + ")") {
|
||||
init_from_fd(fd_src);
|
||||
}
|
||||
|
||||
void init_from_fd(int fd_src) {
|
||||
const int fd_duped = dup(fd_src);
|
||||
if (fd_duped < 0) {
|
||||
throw std::runtime_error(format("llama_file: failed to dup fd %d: %s", fd_src, strerror(errno)));
|
||||
}
|
||||
fp = fdopen(fd_duped, "rb");
|
||||
if (!fp) {
|
||||
close(fd_duped);
|
||||
throw std::runtime_error(format("llama_file: failed to fdopen fd %d: %s", fd_src, strerror(errno)));
|
||||
}
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
size_t tell() const {
|
||||
if (fd == -1) {
|
||||
long ret = std::ftell(fp);
|
||||
|
|
@ -373,6 +396,9 @@ struct llama_file::impl {
|
|||
|
||||
// Open a file by path; mode is a stdio mode string, use_direct_io requests unbuffered I/O.
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
    pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}

// Construct from a borrowed POSIX fd (the impl dup()s it); throws on Windows builds.
llama_file::llama_file(int fd) : pimpl(std::make_unique<impl>(fd)) {}

llama_file::~llama_file() = default;

// Forward to the platform-specific impl.
size_t llama_file::tell() const { return pimpl->tell(); }
|
||||
|
|
|
|||
|
|
@ -15,12 +15,13 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
|||
|
||||
struct llama_file {
|
||||
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
|
||||
llama_file(int fd);
|
||||
~llama_file();
|
||||
|
||||
size_t tell() const;
|
||||
size_t size() const;
|
||||
|
||||
int file_id() const; // fileno overload
|
||||
int file_id() const;
|
||||
|
||||
void seek(size_t offset, int whence) const;
|
||||
|
||||
|
|
|
|||
|
|
@ -510,6 +510,7 @@ llama_model_loader::llama_model_loader(
|
|||
void * set_tensor_data_ud,
|
||||
const std::string & fname,
|
||||
std::vector<std::string> & splits,
|
||||
int fd,
|
||||
bool use_mmap,
|
||||
bool use_direct_io,
|
||||
bool check_tensors,
|
||||
|
|
@ -657,6 +658,36 @@ llama_model_loader::llama_model_loader(
|
|||
|
||||
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
|
||||
}
|
||||
} else if (fd >= 0) {
|
||||
struct ggml_context * ctx = NULL;
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx,
|
||||
};
|
||||
|
||||
metadata_ptr.reset(gguf_init_from_fd(fd, params));
|
||||
metadata = metadata_ptr.get();
|
||||
if (metadata == nullptr) {
|
||||
throw std::runtime_error(format("%s: failed to load model from fd %d", __func__, fd));
|
||||
}
|
||||
|
||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
|
||||
contexts.emplace_back(ctx);
|
||||
files.emplace_back(new llama_file(fd));
|
||||
|
||||
// Save tensors data offset info of the main file.
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
std::string tensor_name = std::string(cur->name);
|
||||
// make sure there is no duplicated tensor names
|
||||
if (weights_map.find(tensor_name) != weights_map.end()) {
|
||||
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
||||
}
|
||||
n_elements += ggml_nelements(cur);
|
||||
n_bytes += ggml_nbytes(cur);
|
||||
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
|
||||
}
|
||||
} else {
|
||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
|
|
@ -668,7 +699,7 @@ llama_model_loader::llama_model_loader(
|
|||
fver = (enum llama_fver) gguf_get_version(metadata);
|
||||
|
||||
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
||||
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
|
||||
__func__, n_kv, n_tensors, fname.empty() ? "(fd)" : fname.c_str(), llama_file_version_name(fver));
|
||||
|
||||
// determine file type based on the number of tensors for each quantization and print meta data
|
||||
// TODO: make optional
|
||||
|
|
|
|||
|
|
@ -125,6 +125,7 @@ struct llama_model_loader {
|
|||
void * set_tensor_data_ud,
|
||||
const std::string & fname,
|
||||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||
int fd,
|
||||
bool use_mmap,
|
||||
bool use_direct_io,
|
||||
bool check_tensors,
|
||||
|
|
|
|||
|
|
@ -859,7 +859,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
|
||||
std::vector<std::string> splits = {};
|
||||
llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
|
||||
fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||
fname_inp, splits, /*fd*/ -1, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
llama_model model(llama_model_default_params());
|
||||
|
|
|
|||
|
|
@ -828,7 +828,7 @@ int64_t llama_time_us(void) {
|
|||
|
||||
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
||||
static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
|
||||
const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
|
||||
const std::string & fname, std::vector<std::string> & splits, int fd, llama_model & model, llama_model_params & params) {
|
||||
// loading time will be recalculated after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
model.t_load_us = 0;
|
||||
|
|
@ -837,7 +837,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens
|
|||
model.t_start_us = tm.t_start_us;
|
||||
|
||||
try {
|
||||
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
|
||||
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, fd, params.use_mmap, params.use_direct_io,
|
||||
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||
|
||||
ml.print_info();
|
||||
|
|
@ -889,8 +889,12 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||
void * set_tensor_data_ud,
|
||||
const std::string & path_model,
|
||||
std::vector<std::string> & splits,
|
||||
int fd,
|
||||
struct llama_model_params params) {
|
||||
GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
|
||||
if (metadata == nullptr && path_model.empty() && fd < 0) {
|
||||
LLAMA_LOG_ERROR("%s: no model source provided\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
ggml_time_init();
|
||||
|
||||
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
|
||||
|
|
@ -1011,7 +1015,7 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||
props.memory_free/1024/1024);
|
||||
}
|
||||
|
||||
const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
|
||||
const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, fd, *model, params);
|
||||
GGML_ASSERT(status <= 0);
|
||||
if (status < 0) {
|
||||
if (status == -1) {
|
||||
|
|
@ -1037,7 +1041,7 @@ struct llama_model * llama_model_init_from_user(
|
|||
std::vector<std::string> splits = {};
|
||||
params.use_mmap = false;
|
||||
params.use_extra_bufts = false;
|
||||
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
|
||||
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*fd*/ -1, params);
|
||||
}
|
||||
// deprecated
|
||||
struct llama_model * llama_load_model_from_file(
|
||||
|
|
@ -1050,7 +1054,7 @@ struct llama_model * llama_model_load_from_file(
|
|||
const char * path_model,
|
||||
struct llama_model_params params) {
|
||||
std::vector<std::string> splits = {};
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*fd*/ -1, params);
|
||||
}
|
||||
|
||||
struct llama_model * llama_model_load_from_splits(
|
||||
|
|
@ -1066,7 +1070,20 @@ struct llama_model * llama_model_load_from_splits(
|
|||
for (size_t i = 0; i < n_paths; ++i) {
|
||||
splits.push_back(paths[i]);
|
||||
}
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*fd*/ -1, params);
|
||||
}
|
||||
|
||||
struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params) {
|
||||
#ifdef _WIN32
|
||||
LLAMA_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__);
|
||||
GGML_UNUSED(fd);
|
||||
GGML_UNUSED(params);
|
||||
return nullptr;
|
||||
#else
|
||||
std::string path_model;
|
||||
std::vector<std::string> splits = {};
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, fd, params);
|
||||
#endif
|
||||
}
|
||||
|
||||
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
|
||||
|
|
|
|||
|
|
@ -240,6 +240,7 @@ llama_build_and_test(test-gguf.cpp)
|
|||
llama_build_and_test(test-backend-ops.cpp)
|
||||
|
||||
llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
|
||||
llama_build_and_test(test-model-load-fd.cpp LABEL "model")
|
||||
llama_build_and_test(test-autorelease.cpp LABEL "model")
|
||||
llama_build_and_test(test-backend-sampler.cpp LABEL "model")
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,47 @@
|
|||
#include "llama.h"
|
||||
#include "get-model.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
#ifdef _WIN32
|
||||
int main(int /*argc*/, char ** /*argv*/) {
|
||||
fprintf(stderr, "skipping on Windows\n");
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
#else
|
||||
# include <fcntl.h>
|
||||
# include <unistd.h>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
auto * model_path = get_model_or_exit(argc, argv);
|
||||
|
||||
llama_backend_init();
|
||||
|
||||
const int fd = open(model_path, O_RDONLY);
|
||||
if (fd < 0) {
|
||||
fprintf(stderr, "failed to open %s\n", model_path);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
auto params = llama_model_default_params();
|
||||
params.use_mmap = true;
|
||||
params.vocab_only = true;
|
||||
|
||||
struct llama_model * model = llama_model_load_from_fd(fd, params);
|
||||
close(fd);
|
||||
|
||||
if (model == nullptr) {
|
||||
fprintf(stderr, "load from fd failed\n");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
|
||||
fprintf(stderr, "loaded %d tokens from fd\n", n_vocab);
|
||||
|
||||
llama_model_free(model);
|
||||
llama_backend_free();
|
||||
|
||||
return n_vocab > 0 ? EXIT_SUCCESS : EXIT_FAILURE;
|
||||
}
|
||||
#endif
|
||||
Loading…
Reference in New Issue