Compare commits

...

3 Commits

Author SHA1 Message Date
z5269887 141eb5107f Update llama_model_quantize_params 2024-04-22 23:38:09 +08:00
z5269887 d6e453eb6c Split model correctly even if tensor id is out-of-order 2024-04-22 23:18:41 +08:00
jiez 6d66e609b5
Update examples/quantize/quantize.cpp
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-04-22 22:27:25 +08:00
3 changed files with 21 additions and 12 deletions

View File

@@ -354,7 +354,7 @@ int main(int argc, char ** argv) {
}
} else {
fname_out = argv[arg_idx];
if (params.keep_split && fname_out.find(suffix) !=std::string::npos) {
if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
}
arg_idx++;

View File

@@ -13534,7 +13534,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const size_t align = GGUF_DEFAULT_ALIGNMENT;
struct gguf_context * ctx_out = gguf_init_empty();
std::vector<gguf_context*> ctx_outs = {ctx_out};
// copy the KV pairs from the input file
gguf_set_kv (ctx_out, ml.meta);
@@ -13596,19 +13595,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
std::vector<no_init<uint8_t>> work;
std::vector<no_init<float>> f32_conv_buf;
uint16_t n_split = 1;
// Assume split index is continuous
if (params->keep_split) {
for (int i = 0; i < ml.n_tensors; ++i) {
n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
}
}
std::vector<gguf_context*> ctx_outs(n_split, NULL);
ctx_outs[0] = ctx_out;
// populate the original tensors so we get an initial meta data
for (int i = 0; i < ml.n_tensors; ++i) {
auto weight = ml.get_weight(i);
uint16_t i_split = params->keep_split ? weight->idx : 0;
struct ggml_tensor * tensor = weight->tensor;
if (weight->idx != (ctx_outs.size() - 1) && params->keep_split) {
ctx_out = gguf_init_empty();
ctx_outs.push_back(ctx_out);
if (ctx_outs[i_split] == NULL) {
ctx_outs[i_split] = gguf_init_empty();
}
gguf_add_tensor(ctx_out, tensor);
gguf_add_tensor(ctx_outs[i_split], tensor);
}
// Set split info if needed
uint16_t n_split = ctx_outs.size();
if (n_split > 1) {
for (int i = 0; i < ctx_outs.size(); ++i) {
gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
@@ -13629,8 +13637,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
fout.close();
}
};
auto new_ofstream = [&]() {
++cur_split;
auto new_ofstream = [&](int index = 0) {
cur_split = index;
GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
std::string fname = fname_out;
if (params->keep_split) {
char split_path[PATH_MAX] = {0};
@@ -13651,9 +13660,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
auto weight = ml.get_weight(i);
struct ggml_tensor * tensor = weight->tensor;
if (weight->idx != cur_split && params->keep_split) {
GGML_ASSERT(cur_split == weight->idx-1 && "Invalid split index found in weight");
close_ofstream();
new_ofstream();
new_ofstream(weight->idx);
}
const std::string name = ggml_get_name(tensor);
@@ -14176,6 +14184,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.quantize_output_tensor =*/ true,
/*.only_copy =*/ false,
/*.pure =*/ false,
/*.keep_split =*/ false,
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
};

View File

@@ -288,9 +288,9 @@ extern "C" {
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
bool keep_split; // quantize to the same number of shards
} llama_model_quantize_params;
// grammar types