diff --git a/common/arg.cpp b/common/arg.cpp
index 9aa5689ccd..10aa1b5e4f 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2671,7 +2671,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RESULTS}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
+                    LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
diff --git a/common/common.h b/common/common.h
index c5645bba46..ee7a2d805e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -105,6 +105,7 @@ enum llama_example {
     LLAMA_EXAMPLE_FINETUNE,
     LLAMA_EXAMPLE_FIT_PARAMS,
     LLAMA_EXAMPLE_RESULTS,
+    LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
 
     LLAMA_EXAMPLE_COUNT,
 };
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 0be9493910..bbdad1800e 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -7,6 +7,7 @@
 #include "llama-memory.h"
 #include "llama-mmap.h"
 #include "llama-model.h"
+#include "llama-ext.h"
 
 #include <cinttypes>
 #include <cmath>
@@ -3129,6 +3130,26 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
     return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
 }
 
+// Reserve a compute graph for the given batch shape. When the context has a memory
+// module, a full memory context is used for the reservation.
+// Returns nullptr if that memory context cannot be created.
+struct ggml_cgraph * llama_graph_reserve(
+        struct llama_context * ctx,
+        uint32_t n_tokens,
+        uint32_t n_seqs,
+        uint32_t n_outputs) {
+    auto * memory = ctx->get_memory();
+    llama_memory_context_ptr mctx;
+    if (memory) {
+        mctx = memory->init_full();
+        if (!mctx) {
+            // do not hand a missing memory context to graph_reserve
+            return nullptr;
+        }
+    }
+    return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
+}
+
 // llama adapter API
 
 int32_t llama_set_adapters_lora(
diff --git a/src/llama-ext.h b/src/llama-ext.h
new file mode 100644
index 0000000000..13ced783b4
--- /dev/null
+++ b/src/llama-ext.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "llama-context.h"
+#include "ggml.h"
+
+#include <stdint.h>
+
+// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
+// Returns NULL if the graph could not be reserved.
+LLAMA_API struct ggml_cgraph * llama_graph_reserve(
+        struct llama_context * ctx,
+        uint32_t n_tokens,
+        uint32_t n_seqs,
+        uint32_t n_outputs);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index bb0f0ef0ed..9582164b58 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -260,6 +260,7 @@ endif()
 set(LLAMA_TEST_NAME test-mtmd-c-api)
 llama_build_and_test(test-mtmd-c-api.c)
 target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
+unset(LLAMA_TEST_NAME)
 
 # GGUF model data fetcher library for tests that need real model metadata
 # Only compile when cpp-httplib has SSL support (CPPHTTPLIB_OPENSSL_SUPPORT)
@@ -284,4 +285,5 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
 
 llama_build_and_test(test-alloc.cpp)
 target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
-
+llama_build(export-graph-ops.cpp)
+target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
diff --git a/tests/export-graph-ops.cpp b/tests/export-graph-ops.cpp
new file mode 100644
index 0000000000..754089d068
--- /dev/null
+++ b/tests/export-graph-ops.cpp
@@ -0,0 +1,189 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "../src/llama-ext.h"
+#include "ggml.h"
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <set>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <vector>
+
+// Type, shape (ne) and byte strides (nb) of one source tensor of a graph node.
+struct input_tensor {
+    ggml_type type;
+    std::array<int64_t, 4> ne;
+    std::array<size_t, 4> nb;
+
+    input_tensor(ggml_type type, int64_t * ne, size_t * nb): type(type) {
+        memcpy(this->ne.data(), ne, 4 * sizeof(int64_t));
+        memcpy(this->nb.data(), nb, 4 * sizeof(size_t));
+    }
+
+    bool operator<(const input_tensor &b) const {
+        return std::tie(type, ne, nb) <
+               std::tie(b.type, b.ne, b.nb);
+    }
+
+    // Writes "type ne0 ne1 ne2 ne3 nb0 nb1 nb2 nb3 " (note the trailing space).
+    void serialize(std::ostream& out) const {
+        out << type << ' ';
+        for (size_t i = 0; i < 4; i++) {
+            out << ne[i] << ' ';
+        }
+        for (size_t i = 0; i < 4; i++) {
+            out << nb[i] << ' ';
+        }
+    }
+};
+
+// One exported graph node: op, result type/shape, raw op_params and its sources.
+struct test_object {
+    ggml_op op;
+    ggml_type type;
+    std::array<int64_t, 4> ne;
+    std::vector<int32_t> op_params;
+    std::vector<input_tensor> sources;
+    std::string name;
+
+    // Writes one line:
+    //   op type ne[4] n_params params... n_sources sources... name
+    // An empty name is written as "-".
+    void serialize(std::ostream& out) const {
+        out << op << ' ' << type << ' ';
+        for (size_t i = 0; i < 4; i++) {
+            out << ne[i] << ' ';
+        }
+
+        out << op_params.size() << ' ';
+        for (size_t i = 0; i < op_params.size(); i++) {
+            out << op_params[i] << ' ';
+        }
+
+        out << sources.size() << ' ';
+        for (size_t s = 0; s < sources.size(); s++) {
+            sources[s].serialize(out);
+        }
+
+        if (!name.empty()) {
+            out << name;
+        } else {
+            out << '-';
+        }
+
+        out << '\n';
+    }
+
+    // The name is not part of the ordering, so nodes that differ only by name
+    // collapse into a single entry of the std::set they are stored in.
+    bool operator<(const test_object &b) const {
+        return std::tie(op, type, ne, op_params, sources) <
+               std::tie(b.op, b.type, b.ne, b.op_params, b.sources);
+    }
+};
+
+// Adds every node of cgraph to tests, skipping GGML_OP_NONE and the view-type ops
+// (VIEW, RESHAPE, PERMUTE, TRANSPOSE). label only tags the log line.
+static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests) {
+    int n_nodes = ggml_graph_n_nodes(cgraph);
+    int n_skipped = 0;
+    int n_before = (int) tests.size();
+    for (int i = 0; i < n_nodes; i++) {
+        ggml_tensor * node = ggml_graph_node(cgraph, i);
+
+        if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) {
+            n_skipped++;
+            continue;
+        }
+
+        test_object test;
+
+        test.op = node->op;
+        test.type = node->type;
+        memcpy(&test.ne, node->ne, 4 * sizeof(int64_t));
+
+        test.op_params.resize(GGML_MAX_OP_PARAMS / sizeof(int32_t));
+        memcpy(test.op_params.data(), node->op_params, GGML_MAX_OP_PARAMS);
+
+        for (size_t s = 0; s < GGML_MAX_SRC; s++) {
+            if (node->src[s] == nullptr) {
+                break;
+            }
+
+            test.sources.emplace_back(node->src[s]->type, node->src[s]->ne, node->src[s]->nb);
+        }
+
+        test.name = node->name;
+        tests.insert(test);
+    }
+
+    int n_new = (int) tests.size() - n_before;
+    LOG_INF("%s: %d unique ops, %d total nodes, %d skipped (view ops)\n",
+        label, n_new, n_nodes, n_skipped);
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+    params.out_file = "tests.txt";
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) {
+        return 1;
+    }
+
+    common_init();
+
+    // Load CPU-only
+    ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    params.devices = { cpu_device, nullptr };
+    params.fit_params = false;
+    params.n_gpu_layers = 0;
+
+    params.warmup = false;
+
+    auto init_result = common_init_from_params(params);
+
+    llama_context * ctx = init_result->context();
+    if (ctx == nullptr) {
+        // without a context there is no graph to export
+        LOG_ERR("failed to create llama context\n");
+        return 1;
+    }
+
+    const uint32_t n_seqs = llama_n_seq_max(ctx);
+    const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));
+
+    std::set<test_object> tests;
+
+    auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
+    if (!gf_pp) {
+        throw std::runtime_error("failed to reserve prompt processing graph");
+    }
+    extract_graph_ops(gf_pp, "pp", tests);
+
+    auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
+    if (!gf_tg) {
+        throw std::runtime_error("failed to reserve token generation graph");
+    }
+    extract_graph_ops(gf_tg, "tg", tests);
+
+    LOG_INF("%d unique ops total\n", (int) tests.size());
+
+    std::ofstream f(params.out_file);
+
+    if (!f.is_open()) {
+        throw std::runtime_error("Unable to open output file");
+    }
+
+    for (const auto& test : tests) {
+        test.serialize(f);
+    }
+
+    return 0;
+}
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index e9f2e8ace4..abf914faa1 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -31,10 +31,12 @@
 #include <cstring>
 #include <ctime>
 #include <future>
+#include <fstream>
 #include <memory>
 #include <random>
 #include <regex>
 #include <set>
+#include <sstream>
 #include <string>
 #include <string_view>
 #include <thread>
@@ -6648,6 +6650,248 @@ struct test_diag : public test_case {
     }
 };
 
+// Deserializable generic test case
+// Type, shape (ne) and byte strides (nb) of one source tensor.
+struct input_tensor {
+    ggml_type type;
+    std::array<int64_t, 4> ne;
+    std::array<size_t, 4> nb; // strides (0 = use default contiguous strides)
+};
+
+// True when src carries explicit strides that differ from the packed layout
+// computed from its type and shape.
+static bool is_non_contiguous(const input_tensor & src) {
+    if (src.nb[0] == 0) {
+        return false;
+    }
+    const size_t default_nb0 = ggml_type_size(src.type);
+    const size_t default_nb1 = default_nb0 * (src.ne[0] / ggml_blck_size(src.type));
+    const size_t default_nb2 = default_nb1 * src.ne[1];
+    const size_t default_nb3 = default_nb2 * src.ne[2];
+    return src.nb[0] != default_nb0 ||
+        src.nb[1] != default_nb1 ||
+        src.nb[2] != default_nb2 ||
+        src.nb[3] != default_nb3;
+}
+
+// Formats sources as "type[ne0,ne1,ne2,ne3]", plus "nb[...]" for non-contiguous ones.
+static std::string var_to_str(const std::vector<input_tensor>& sources) {
+    std::ostringstream oss;
+    bool first = true;
+    for (const auto& src : sources) {
+        if (!first) oss << ",";
+        oss << ggml_type_name(src.type) << "[" << src.ne[0] << "," << src.ne[1] << "," << src.ne[2] << "," << src.ne[3] << "]";
+        if (is_non_contiguous(src)) {
+            oss << "nb[" << src.nb[0] << "," << src.nb[1] << "," << src.nb[2] << "," << src.nb[3] << "]";
+        }
+        first = false;
+    }
+    return oss.str();
+}
+
+// Formats op_params as "[index:value,...]", listing only the non-zero entries.
+static std::string var_to_str(const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)>& params) {
+    std::ostringstream oss;
+    oss << "[";
+    bool first = true;
+    for (size_t i = 0; i < params.size(); ++i) {
+        if (params[i] != 0) {
+            if (!first) oss << ",";
+            oss << i << ":" << params[i];
+            first = false;
+        }
+    }
+    oss << "]";
+    return oss.str();
+}
+
+
+// Test case built from a serialized graph node: the op, result type/shape,
+// raw op_params and source tensor descriptions are taken as given.
+struct test_generic_op : public test_case {
+    const ggml_op op;
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params;
+
+    const std::vector<input_tensor> sources;
+    const std::string name;
+
+    std::string vars() override {
+        if (name.empty()) {
+            return VARS_TO_STR4(type, ne, op_params, sources);
+        }
+
+        return VARS_TO_STR5(name, type, ne, op_params, sources);
+    }
+
+    test_generic_op(ggml_op op, ggml_type type, std::array<int64_t, 4> ne,
+                    std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params,
+                    std::vector<input_tensor> sources, std::string name = "")
+        : op(op), type(type), ne(ne), op_params(op_params), sources(sources), name(std::move(name)) {}
+
+    // Builds the sources (as strided views over a 1D backing tensor when they are
+    // non-contiguous) and a result tensor whose op, src and op_params are set directly.
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        const size_t source_count = std::min(sources.size(), (size_t)GGML_MAX_SRC);
+
+        std::array<ggml_tensor *, GGML_MAX_SRC> source_tensors;
+        for (size_t i = 0; i < source_count; ++i) {
+            const input_tensor& src = sources[i];
+
+            if (is_non_contiguous(src)) {
+                size_t total_size;
+                const size_t blck_size = ggml_blck_size(src.type);
+                if (blck_size == 1) {
+                    total_size = ggml_type_size(src.type);
+                    for (int d = 0; d < 4; d++) {
+                        total_size += (src.ne[d] - 1) * src.nb[d];
+                    }
+                } else {
+                    total_size = src.ne[0] * src.nb[0] / blck_size;
+                    for (int d = 1; d < 4; d++) {
+                        total_size += (src.ne[d] - 1) * src.nb[d];
+                    }
+                }
+
+                // Convert bytes to elements, padded to block size for quantized types
+                const size_t type_size = ggml_type_size(src.type);
+                size_t backing_elements = (total_size * blck_size + type_size - 1) / type_size;
+                backing_elements = ((backing_elements + blck_size - 1) / blck_size) * blck_size;
+                ggml_tensor * backing = ggml_new_tensor_1d(ctx, src.type, backing_elements);
+                source_tensors[i] = ggml_view_4d(ctx, backing,
+                    src.ne[0], src.ne[1], src.ne[2], src.ne[3],
+                    src.nb[1], src.nb[2], src.nb[3], 0);
+                // nb[0] does not get set by view_4d, so set it manually
+                source_tensors[i]->nb[0] = src.nb[0];
+            } else {
+                source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]);
+            }
+        }
+
+        // Ops with an inplace flag create a view of src[0] as their output.
+        bool inplace = false;
+        if (op == GGML_OP_SET || op == GGML_OP_ACC) {
+            inplace = op_params[4] != 0;
+        } else if (op == GGML_OP_ADD_REL_POS) {
+            inplace = op_params[0] != 0;
+        }
+
+        ggml_tensor * out;
+        if (inplace && source_count > 0) {
+            out = ggml_view_tensor(ctx, source_tensors[0]);
+        } else {
+            out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        }
+        out->op = op;
+        for (size_t i = 0; i < source_count; ++i) {
+            out->src[i] = source_tensors[i];
+        }
+
+        memcpy(out->op_params, op_params.data(), GGML_MAX_OP_PARAMS);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    double max_nmse_err() override {
+        switch (op) {
+            case GGML_OP_MUL_MAT:
+            case GGML_OP_MUL_MAT_ID:
+            case GGML_OP_OUT_PROD:
+            case GGML_OP_CONV_TRANSPOSE_2D:
+            case GGML_OP_IM2COL:
+            case GGML_OP_CONV_2D:
+            case GGML_OP_CONV_3D:
+            case GGML_OP_SET_ROWS:
+            case GGML_OP_CPY:
+                return 5e-4;
+            case GGML_OP_SOFT_MAX:
+                return 1e-6;
+            case GGML_OP_RWKV_WKV7:
+                return 5e-3;
+            case GGML_OP_FLASH_ATTN_EXT:
+                {
+                    // Scale error with kv length to account for accumulating floating point error
+                    // (falls back to the base tolerance when fewer than two sources were given)
+                    const int64_t kv = sources.size() > 1 ? sources[1].ne[1] : 0;
+                    return 5e-4 * std::max(1.0, kv / 20000.0);
+                }
+            default:
+                return 1e-7;
+        }
+    }
+
+    // Fills the sources: I32/I64 sources of the ops handled below get op-specific
+    // index values, every other tensor gets uniform random data.
+    void initialize_tensors(ggml_context * ctx) override {
+        ggml_tensor * out = ggml_get_tensor(ctx, "out");
+
+        std::random_device rd;
+        std::default_random_engine rng(rd());
+
+        for (size_t i = 0; i < sources.size() && i < GGML_MAX_SRC; i++) {
+            ggml_tensor * t = out->src[i];
+            if (!t) {
+                break;
+            }
+
+            // FLASH_ATTN_EXT: src[3] is the KQ mask
+            if (op == GGML_OP_FLASH_ATTN_EXT && i == 3) {
+                init_tensor_kq_mask(t);
+                continue;
+            }
+
+            if (t->type == GGML_TYPE_I32 || t->type == GGML_TYPE_I64) {
+                if (op == GGML_OP_GET_ROWS || op == GGML_OP_GET_ROWS_BACK) {
+                    const int64_t num_rows = sources[0].ne[1];
+                    const int64_t nels = ggml_nelements(t);
+                    std::vector<int32_t> data(nels);
+                    std::uniform_int_distribution<int32_t> dist(0, num_rows - 1);
+                    for (int64_t k = 0; k < nels; k++) {
+                        data[k] = dist(rng);
+                    }
+                    ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
+                } else if (op == GGML_OP_SET_ROWS) {
+                    init_set_rows_row_ids(t, ne[1]);
+                } else if (op == GGML_OP_ROPE) {
+                    const int mode = op_params[2];
+                    const int64_t nels = (mode & GGML_ROPE_TYPE_MROPE) ? ne[2] * 4 : ne[2];
+                    std::vector<int32_t> data(nels);
+                    std::uniform_int_distribution<int32_t> dist(0, ne[2] - 1);
+                    for (int64_t k = 0; k < nels; k++) {
+                        data[k] = dist(rng);
+                    }
+                    ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
+                } else if (op == GGML_OP_MUL_MAT_ID || op == GGML_OP_ADD_ID) {
+                    const int64_t n_expert = (op == GGML_OP_MUL_MAT_ID) ? sources[0].ne[2] : sources[1].ne[1];
+                    for (int64_t r = 0; r < ggml_nrows(t); r++) {
+                        std::vector<int32_t> data(t->ne[0]);
+                        for (int32_t e = 0; e < t->ne[0]; e++) {
+                            data[e] = e % n_expert;
+                        }
+                        std::shuffle(data.begin(), data.end(), rng);
+                        ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
+                    }
+                } else if (op == GGML_OP_SSM_SCAN) {
+                    for (int64_t r = 0; r < ggml_nrows(t); r++) {
+                        std::vector<int32_t> data(t->ne[0]);
+                        for (int32_t e = 0; e < t->ne[0]; e++) {
+                            data[e] = e;
+                        }
+                        std::shuffle(data.begin(), data.end(), rng);
+                        ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
+                    }
+                } else {
+                    init_tensor_uniform(t);
+                }
+            } else {
+                init_tensor_uniform(t);
+            }
+        }
+    }
+};
+
 
 enum llm_norm_type {
     LLM_NORM,
@@ -8751,8 +8995,98 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     return test_cases;
 }
 
+// Reads test cases from a file written by llama-export-graph-ops, one per line:
+//   op type ne[4] n_params params... n_sources sources... name
+// where each source is "type ne[4] nb[4]" and a name of "-" means no name.
+// Blank lines are skipped; a line that cannot be parsed raises std::runtime_error.
+static std::vector<std::unique_ptr<test_case>> make_test_cases_from_file(const char * path) {
+    std::ifstream f(path);
+
+    if (!f.is_open()) {
+        throw std::runtime_error("Unable to read test file");
+    }
+
+    std::vector<std::unique_ptr<test_case>> test_cases;
+
+    std::string line;
+    size_t line_no = 0;
+
+    while (std::getline(f, line)) {
+        line_no++;
+
+        if (line.find_first_not_of(" \t\r") == std::string::npos) {
+            continue;
+        }
+
+        std::istringstream iss(line);
+
+        ggml_op op;
+        ggml_type type;
+        std::array<int64_t, 4> ne = {};
+        std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params = {};
+        std::string name;
+        uint64_t tmp = 0;
+
+        iss >> tmp;
+        op = (ggml_op)tmp;
+        iss >> tmp;
+        type = (ggml_type)tmp;
+
+        for (size_t i = 0; i < 4; i++) {
+            iss >> ne[i];
+        }
+
+        // Surplus op_params are read and dropped so the following fields stay aligned.
+        uint64_t n_params = 0;
+        iss >> n_params;
+        for (uint64_t i = 0; i < n_params && iss; i++) {
+            int32_t value = 0;
+            iss >> value;
+            if (i < op_params.size()) {
+                op_params[i] = value;
+            }
+        }
+
+        // Same for sources beyond GGML_MAX_SRC.
+        uint64_t n_src = 0;
+        iss >> n_src;
+
+        std::vector<input_tensor> sources;
+        for (uint64_t s = 0; s < n_src && iss; s++) {
+            input_tensor src = {};
+            iss >> tmp;
+            src.type = (ggml_type)tmp;
+
+            for (size_t d = 0; d < 4; d++) {
+                iss >> src.ne[d];
+            }
+            for (size_t d = 0; d < 4; d++) {
+                iss >> src.nb[d];
+            }
+
+            if (s < (uint64_t)GGML_MAX_SRC) {
+                sources.push_back(src);
+            }
+        }
+
+        iss >> name;
+
+        if (iss.fail()) {
+            throw std::runtime_error("Malformed test file line " + std::to_string(line_no));
+        }
+
+        if (name.length() == 1 && name[0] == '-') {
+            name = "";
+        }
+
+        test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources, std::move(name)));
+    }
+
+    return test_cases;
+}
+
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter,
-                         printer * output_printer) {
+                         printer * output_printer, const char * test_file_path) {
     auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) {
         if (params_filter == nullptr) {
             return;
@@ -8770,9 +9104,26 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         }
     };
 
+    std::vector<std::unique_ptr<test_case>> test_cases;
+
+    if (test_file_path == nullptr) {
+        switch (mode) {
+            case MODE_TEST:
+            case MODE_GRAD:
+            case MODE_SUPPORT:
+                test_cases = make_test_cases_eval();
+                break;
+            case MODE_PERF:
+                test_cases = make_test_cases_perf();
+                break;
+        }
+    } else {
+        test_cases = make_test_cases_from_file(test_file_path);
+    }
+
+    filter_test_cases(test_cases, params_filter);
+
     if (mode == MODE_TEST) {
-        auto test_cases = make_test_cases_eval();
-        filter_test_cases(test_cases, params_filter);
         ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
         if (backend_cpu == NULL) {
             test_operation_info info("", "", "CPU");
@@ -8812,8 +9163,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     }
 
     if (mode == MODE_GRAD) {
-        auto test_cases = make_test_cases_eval();
-        filter_test_cases(test_cases, params_filter);
         size_t n_ok = 0;
         for (auto & test : test_cases) {
             if (test->eval_grad(backend, op_names_filter, output_printer)) {
@@ -8826,8 +9175,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     }
 
     if (mode == MODE_PERF) {
-        auto test_cases = make_test_cases_perf();
-        filter_test_cases(test_cases, params_filter);
         for (auto & test : test_cases) {
             test->eval_perf(backend, op_names_filter, output_printer);
         }
@@ -8835,9 +9182,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     }
 
     if (mode == MODE_SUPPORT) {
-        auto test_cases = make_test_cases_eval();
-        filter_test_cases(test_cases, params_filter);
-
         // Filter out fusion cases
         test_cases.erase(
             std::remove_if(test_cases.begin(), test_cases.end(), [](const std::unique_ptr<test_case> & tc) {
@@ -8956,7 +9300,8 @@ static void show_test_coverage() {
 }
 
 static void usage(char ** argv) {
-    printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops] [--show-coverage]\n", argv[0]);
+    printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops]", argv[0]);
+    printf(" [--show-coverage] [--test-file <path>]\n");
     printf(" valid modes:\n");
     printf(" - test (default, compare with CPU backend for correctness)\n");
     printf(" - grad (compare gradients from backpropagation with method of finite differences)\n");
@@ -8967,6 +9312,7 @@ static void usage(char ** argv) {
     printf(" --output specifies output format (default: console, options: console, sql, csv)\n");
     printf(" --list-ops lists all available GGML operations\n");
     printf(" --show-coverage shows test coverage\n");
+    printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
 }
 
 int main(int argc, char ** argv) {
@@ -8975,6 +9321,7 @@ int main(int argc, char ** argv) {
     const char * op_names_filter = nullptr;
     const char * backend_filter = nullptr;
     const char * params_filter = nullptr;
+    const char * test_file_path = nullptr;
 
     for (int i = 1; i < argc; i++) {
         if (strcmp(argv[i], "test") == 0) {
@@ -9022,6 +9369,13 @@ int main(int argc, char ** argv) {
         } else if (strcmp(argv[i], "--show-coverage") == 0) {
             show_test_coverage();
             return 0;
+        } else if (strcmp(argv[i], "--test-file") == 0) {
+            if (i + 1 < argc) {
+                test_file_path = argv[++i];
+            } else {
+                usage(argv);
+                return 1;
+            }
         } else {
             usage(argv);
             return 1;
@@ -9074,7 +9428,7 @@ int main(int argc, char ** argv) {
                 false, "", ggml_backend_dev_description(dev),
                 total / 1024 / 1024, free / 1024 / 1024, true));
 
-        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get());
+        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_file_path);
 
         if (ok) {
             n_ok++;