// llama.cpp/ggml/src/ggml-qnn/npu/device/device.cpp


#include <AEEStdErr.h>
#include <HAP_compute_res.h>
#include <hexagon_types.h>

#include <memory>
#include <new>

#include "graph.hpp"
#include "hexagon_npu.h"
#include "op_impl.hpp"
#include "quants.hpp"
#include "remote.h"
#include "tensor.hpp"
#include "thread_pool.hpp"
#include "util.hpp"

namespace {

struct npu_device_context {
    std::unique_ptr<hexagon::default_thread_pool> thread_pool;
    std::unique_ptr<float[]>                      f16_to_f32_table;  // TODO: store vtcm?

    bool init() {
        if (!init_ltu()) {
            DEVICE_LOG_ERROR("Failed to initialize LTU");
            return false;
        }

        if (!init_thread_pool()) {
            DEVICE_LOG_ERROR("Failed to initialize thread pool");
            return false;
        }

        DEVICE_LOG_DEBUG("NPU device context initialized");
        return true;
    }

  private:
    bool init_ltu() {
        constexpr const size_t kLtuCount = 1U << 16;

        f16_to_f32_table = std::make_unique<float[]>(kLtuCount);
        if (!f16_to_f32_table) {
            DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table");
            return false;
        }

        hexagon::init_f16_f32_table(f16_to_f32_table.get(), kLtuCount);
        DEVICE_LOG_DEBUG("f16_to_f32 table initialized");
        return true;
    }

    bool init_thread_pool() {
        if (thread_pool) {
            DEVICE_LOG_DEBUG("Thread pool already initialized");
            return true;
        }

        auto pool = std::make_unique<hexagon::default_thread_pool>();
        if (!pool) {
            DEVICE_LOG_ERROR("Failed to create thread pool");
            return false;
        }

        thread_pool = std::move(pool);
        DEVICE_LOG_DEBUG("Thread pool initialized");
        return true;
    }
};

// Reinterprets an opaque tensor handle as the hexagon::tensor pointer stored in it.
// The parameter is the tensor handle type (it was previously declared with the graph
// handle type, which did not match any call site).
inline hexagon::tensor * tensor_from_handle(npu_device_tensor_handle_t h) {
    return reinterpret_cast<hexagon::tensor *>(h);
}

inline npu_device_graph_handle_t tensor_to_handle(hexagon::tensor * tensor) {
    return reinterpret_cast<npu_device_graph_handle_t>(tensor);
}

// Reinterprets an opaque graph handle as the hexagon::graph pointer stored in it.
// The parameter is the graph handle type (it was previously declared with the tensor
// handle type, which did not match any call site).
inline hexagon::graph * graph_from_handle(npu_device_graph_handle_t h) {
    return reinterpret_cast<hexagon::graph *>(h);
}

inline npu_device_tensor_handle_t graph_to_handle(hexagon::graph * graph) {
    return reinterpret_cast<npu_device_tensor_handle_t>(graph);
}

// Recovers the npu_device_context pointer carried inside a remote handle.
inline npu_device_context * device_context_from_handle(remote_handle64 h) {
    auto * const ctx = reinterpret_cast<npu_device_context *>(h);
    return ctx;
}

}  // namespace

int npu_device_open(const char * uri, remote_handle64 * h) {
    // TODO: should we have a device context here?
    auto * context = new (std::nothrow) npu_device_context();
    if (!context) {
        DEVICE_LOG_ERROR("Failed to allocate memory for the npu_device_context");
        return AEE_ENOMEMORY;
    }

    if (!context->init()) {
        DEVICE_LOG_ERROR("Failed to initialize npu_device_context");
        delete context;
        return AEE_EFAILED;
    }

    *h = reinterpret_cast<remote_handle64>(context);
    DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h);
    return AEE_SUCCESS;
}

// Destroys the npu_device_context encoded in `h`.
// Returns AEE_EINVHANDLE for a null handle, AEE_SUCCESS otherwise.
int npu_device_close(remote_handle64 h) {
    if (auto * ctx = device_context_from_handle(h)) {
        delete ctx;
        // Only the numeric handle value is printed here; the object itself is gone.
        DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h);
        return AEE_SUCCESS;
    }

    DEVICE_LOG_ERROR("Invalid npu_device_context handle");
    return AEE_EINVHANDLE;
}

// Reports the buffer alignment as sizeof(HVX_Vector).
//
// _h:        unused.
// alignment: receives the alignment in bytes; must not be null.
//
// Returns AEE_SUCCESS, or AEE_EINVARGS when `alignment` is null.
AEEResult npu_device_device_get_alignment(remote_handle64 _h, uint32_t * alignment) {
    NPU_UNUSED(_h);

    // Same argument validation style as npu_device_device_support_op.
    if (!alignment) {
        DEVICE_LOG_ERROR("npu_device_device_get_alignment: Invalid arguments");
        return AEE_EINVARGS;
    }

    *alignment = sizeof(HVX_Vector);
    return AEE_SUCCESS;
}

// Asks hexagon::support_op() whether `op` can run on the given source and destination
// tensor specs and stores the answer in `is_supported`.
//
// Returns AEE_EINVARGS when any pointer argument is null, AEE_SUCCESS otherwise.
AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tensor_spec * src0,
                                       const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst,
                                       npu_device_tensor_op op, boolean * is_supported) {
    NPU_UNUSED(_h);

    const bool has_all_args = src0 && src1 && dst && is_supported;
    if (!has_all_args) {
        DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments");
        return AEE_EINVARGS;
    }

    const auto & lhs_spec = *src0;
    const auto & rhs_spec = *src1;
    const auto & out_spec = *dst;
    *is_supported         = hexagon::support_op(lhs_spec, rhs_spec, out_spec, op);
    return AEE_SUCCESS;
}

// Allocates a hexagon::tensor built from `info` and returns it as an opaque handle.
//
// _h:            unused.
// info:          configuration passed to the hexagon::tensor constructor; must not be null.
// tensor_handle: receives the new handle; must not be null.
//
// Returns AEE_SUCCESS on success, AEE_EINVARGS when a pointer argument is null and
// AEE_ENOMEMORY when the tensor cannot be allocated.
AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_config * info,
                                 npu_device_tensor_handle_t * tensor_handle) {
    NPU_UNUSED(_h);

    // Both pointers are dereferenced below; reject null up front instead of crashing
    // or leaking a tensor that can never be handed back.
    if (!info || !tensor_handle) {
        DEVICE_LOG_ERROR("npu_device_tensor_init: Invalid arguments");
        return AEE_EINVARGS;
    }

    auto * tensor = new (std::nothrow) hexagon::tensor(*info);
    if (!tensor) {
        DEVICE_LOG_ERROR("Failed to allocate memory for the tensor");
        return AEE_ENOMEMORY;
    }

    *tensor_handle = tensor_to_handle(tensor);
    return AEE_SUCCESS;
}

// Forwards `config` to update_config() on the tensor behind `tensor_handle`.
// Returns AEE_EINVHANDLE when the handle or `config` is null, AEE_SUCCESS otherwise.
AEEResult npu_device_tensor_update_params(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
                                          const npu_device_tensor_update_config * config) {
    NPU_UNUSED(_h);

    auto *     target = tensor_from_handle(tensor_handle);
    const bool usable = target && config;
    if (usable) {
        target->update_config(*config);
        return AEE_SUCCESS;
    }

    return AEE_EINVHANDLE;
}

// Deletes the tensor behind `tensor_handle`.
// Returns AEE_EINVHANDLE for a null handle, AEE_SUCCESS otherwise.
AEEResult npu_device_tensor_free(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle) {
    NPU_UNUSED(_h);

    if (auto * doomed = tensor_from_handle(tensor_handle)) {
        delete doomed;
        return AEE_SUCCESS;
    }

    return AEE_EINVHANDLE;
}

// Allocates an empty hexagon::graph and returns it as an opaque handle.
//
// _h:           unused.
// graph_handle: receives the new handle; must not be null.
//
// Returns AEE_SUCCESS on success, AEE_EINVARGS when `graph_handle` is null and
// AEE_ENOMEMORY when the graph cannot be allocated.
AEEResult npu_device_graph_init(remote_handle64 _h, npu_device_graph_handle_t * graph_handle) {
    NPU_UNUSED(_h);

    // Check the out-parameter before allocating so a bad call cannot leak a graph.
    if (!graph_handle) {
        DEVICE_LOG_ERROR("npu_device_graph_init: Invalid arguments");
        return AEE_EINVARGS;
    }

    auto * graph = new (std::nothrow) hexagon::graph();
    if (!graph) {
        // Logged for parity with npu_device_tensor_init's allocation-failure path.
        DEVICE_LOG_ERROR("Failed to allocate memory for the graph");
        return AEE_ENOMEMORY;
    }

    *graph_handle = graph_to_handle(graph);
    return AEE_SUCCESS;
}

// Hands the list of tensor handles to set_tensor() on the graph behind `graph_handle`.
// Returns AEE_EINVHANDLE when the graph handle is null, the list pointer is null or
// the list length is not positive; AEE_SUCCESS otherwise.
AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handle_t graph_handle,
                                      const npu_device_tensor_handle_t * tensor_handles, int tensor_handlesLen) {
    NPU_UNUSED(_h);

    auto *     target_graph = graph_from_handle(graph_handle);
    const bool args_ok      = target_graph && tensor_handles && tensor_handlesLen > 0;
    if (!args_ok) {
        return AEE_EINVHANDLE;
    }

    target_graph->set_tensor(tensor_handles, tensor_handlesLen);
    return AEE_SUCCESS;
}

// Assigns the tensor list to the graph and then applies tensor_params[i] to the
// tensor behind tensor_handles[i], skipping entries whose handle is null.
//
// Returns AEE_EINVHANDLE when the graph handle or either array pointer is null, when
// the handle count is not positive, or when the two lengths differ; AEE_SUCCESS otherwise.
AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h, npu_device_graph_handle_t graph_handle,
                                                 const npu_device_tensor_handle_t *      tensor_handles,
                                                 int                                     tensor_handlesLen,
                                                 const npu_device_tensor_update_config * tensor_params,
                                                 int                                     tensor_paramsLen) {
    NPU_UNUSED(_h);

    auto *     target_graph = graph_from_handle(graph_handle);
    const bool args_ok      = target_graph && tensor_handles && tensor_handlesLen > 0 && tensor_params &&
                         tensor_handlesLen == tensor_paramsLen;
    if (!args_ok) {
        return AEE_EINVHANDLE;
    }

    target_graph->set_tensor(tensor_handles, tensor_handlesLen);

    for (int idx = 0; idx != tensor_handlesLen; ++idx) {
        auto * current = tensor_from_handle(tensor_handles[idx]);
        if (!current) {
            continue;
        }

        current->update_config(tensor_params[idx]);
    }

    return AEE_SUCCESS;
}

// Runs the graph behind `graph_handle` with the thread pool and f16 -> f32 table
// owned by the device context encoded in `_h`.
//
// Returns AEE_EINVHANDLE when either handle is null, AEE_EFAILED when
// hexagon::graph::compute() reports failure and AEE_SUCCESS otherwise.
AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
    auto * dev_ctx = device_context_from_handle(_h);
    if (!dev_ctx) {
        // Logged at error level, matching npu_device_close for the same condition.
        DEVICE_LOG_ERROR("Invalid npu_device_context handle");
        return AEE_EINVHANDLE;
    }

    auto * graph = graph_from_handle(graph_handle);
    if (!graph) {
        DEVICE_LOG_ERROR("Invalid graph handle");
        return AEE_EINVHANDLE;
    }

    if (!graph->compute(dev_ctx->thread_pool.get(), dev_ctx->f16_to_f32_table.get())) {
        // Make the failure visible in the device log instead of returning silently.
        DEVICE_LOG_ERROR("Failed to compute graph");
        return AEE_EFAILED;
    }

    return AEE_SUCCESS;
}

// Deletes the graph behind `graph_handle` and always returns AEE_SUCCESS,
// including for a null handle.
AEEResult npu_device_graph_free(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
    NPU_UNUSED(_h);

    // Deleting a null pointer is a no-op, so no explicit guard is needed.
    delete graph_from_handle(graph_handle);
    return AEE_SUCCESS;
}