// SPDX-License-Identifier: Apache-2.0
#pragma once

#include "kompute/Core.hpp"

#include "kompute/operations/OpAlgoDispatch.hpp"
#include "kompute/operations/OpBase.hpp"

namespace kp {

/**
 * Container of operations that can be sent to GPU as batch
 */
class Sequence : public std::enable_shared_from_this<Sequence>
{
  public:
    /**
     * Main constructor for sequence which requires core vulkan components to
     * generate all dependent resources.
     *
     * @param physicalDevice Vulkan physical device
     * @param device Vulkan logical device
     * @param computeQueue Vulkan compute queue
     * @param queueIndex Vulkan compute queue index in device
     * @param totalTimestamps Maximum number of timestamps to allocate
     */
    Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
             std::shared_ptr<vk::Device> device,
             std::shared_ptr<vk::Queue> computeQueue,
             uint32_t queueIndex,
             uint32_t totalTimestamps = 0);

    /**
     * Destructor for sequence which is responsible for cleaning all subsequent
     * owned operations.
     */
    ~Sequence();

    /**
     * Record function for operation to be added to the GPU queue in batch.
     * This template requires classes to be derived from the OpBase class.
     * This function also requires the Sequence to be recording, otherwise it
     * will not be able to add the operation.
     *
     * @param op Object derived from kp::OpBase that will be recorded by the
     * sequence which will be used when the operation is evaluated.
     * @return shared_ptr of the Sequence class itself
     */
    std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op);

    /**
     * Record function for operation to be added to the GPU queue in batch.
     * This template requires classes to be derived from the OpBase class.
     * This function also requires the Sequence to be recording, otherwise it
     * will not be able to add the operation.
     *
     * @param tensors Vector of tensors to use for the operation
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> record(
      std::vector<std::shared_ptr<Tensor>> tensors,
      TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(tensors,
                                     std::forward<TArgs>(params)...) };
        return this->record(op);
    }

    /**
     * Record function for operation to be added to the GPU queue in batch.
     * This template requires classes to be derived from the OpBase class.
     * This function also requires the Sequence to be recording, otherwise it
     * will not be able to add the operation.
     *
     * @param algorithm Algorithm to use for the record often used for OpAlgo
     * operations
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> record(std::shared_ptr<Algorithm> algorithm,
                                     TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(algorithm,
                                     std::forward<TArgs>(params)...) };
        return this->record(op);
    }

    /**
     * Eval sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job synchronously (with a barrier).
     *
     * @return shared_ptr of the Sequence class itself
     */
    std::shared_ptr<Sequence> eval();

    /**
     * Resets all the recorded and stored operations, records the operation
     * provided and submits into the gpu as a submit job synchronously (with a
     * barrier).
     *
     * @return shared_ptr of the Sequence class itself
     */
    std::shared_ptr<Sequence> eval(std::shared_ptr<OpBase> op);
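    // Illustrative only, not part of the upstream header: a hedged sketch of
    // how the record/eval entry points above are commonly chained. "mgr",
    // "tensor" and "algorithm" are hypothetical objects assumed to have been
    // created through a kp::Manager:
    //
    //   std::shared_ptr<kp::Sequence> sq = mgr.sequence();
    //   sq->record<kp::OpTensorSyncDevice>({ tensor })
    //     ->record<kp::OpAlgoDispatch>(algorithm)
    //     ->record<kp::OpTensorSyncLocal>({ tensor })
    //     ->eval();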
    /**
     * Resets all the recorded and stored operations, records the operation
     * constructed from the provided tensors and template parameters, and
     * submits it into the gpu as a submit job synchronously (with a barrier).
     *
     * @param tensors Vector of tensors to use for the operation
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> eval(std::vector<std::shared_ptr<Tensor>> tensors,
                                   TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(tensors,
                                     std::forward<TArgs>(params)...) };
        return this->eval(op);
    }

    /**
     * Resets all the recorded and stored operations, records the operation
     * constructed from the provided Vulkan buffers, and submits it into the
     * gpu as a submit job synchronously (with a barrier).
     *
     * @param primaryBuffer Primary buffer to use for the operation
     * @param stagingBuffer Staging buffer to use for the operation
     * @param size Size in bytes of the buffers
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> eval(vk::Buffer* primaryBuffer,
                                   vk::Buffer* stagingBuffer,
                                   vk::DeviceSize size,
                                   TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(primaryBuffer,
                                     stagingBuffer,
                                     size,
                                     std::forward<TArgs>(params)...) };
        return this->eval(op);
    }

    /**
     * Resets all the recorded and stored operations, records the operation
     * constructed from the provided algorithm and template parameters, and
     * submits it into the gpu as a submit job synchronously (with a barrier).
     *
     * @param algorithm Algorithm to use for the record often used for OpAlgo
     * operations
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> eval(std::shared_ptr<Algorithm> algorithm,
                                   TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(algorithm,
                                     std::forward<TArgs>(params)...) };
        return this->eval(op);
    }

    /**
     * Eval Async sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job without a barrier. evalAwait()
     * must ALWAYS be called after to ensure the sequence is terminated
     * correctly.
     *
     * @return shared_ptr of the Sequence class itself
     */
    std::shared_ptr<Sequence> evalAsync();

    /**
     * Clears current operations to record the provided one in the vector of
     * operations into the gpu as a submit job without a barrier. evalAwait()
     * must ALWAYS be called after to ensure the sequence is terminated
     * correctly.
     *
     * @return shared_ptr of the Sequence class itself
     */
    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<OpBase> op);

    /**
     * Clears current operations to record the operation constructed from the
     * provided tensors and template parameters, and submits it into the gpu
     * as a submit job without a barrier. evalAwait() must ALWAYS be called
     * after to ensure the sequence is terminated correctly.
     *
     * @param tensors Vector of tensors to use for the operation
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> evalAsync(
      std::vector<std::shared_ptr<Tensor>> tensors,
      TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(tensors,
                                     std::forward<TArgs>(params)...) };
        return this->evalAsync(op);
    }

    /**
     * Clears current operations to record the operation constructed from the
     * provided algorithm and template parameters, and submits it into the gpu
     * as a submit job without a barrier. evalAwait() must ALWAYS be called
     * after to ensure the sequence is terminated correctly.
     *
     * @param algorithm Algorithm to use for the record often used for OpAlgo
     * operations
     * @param TArgs Template parameters that are used to initialise operation
     * which allows for extensible configurations on initialisation.
     * @return shared_ptr of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<Algorithm> algorithm,
                                        TArgs&&... params)
    {
        std::shared_ptr<T> op{ new T(algorithm,
                                     std::forward<TArgs>(params)...) };
        return this->evalAsync(op);
    }

    /**
     * Eval Await waits for the fence to finish processing and then once it
     * finishes, it runs the postEval of all operations.
     *
     * @param waitFor Number of nanoseconds to wait before timing out.
     * @return shared_ptr of the Sequence class itself
     */
    std::shared_ptr<Sequence> evalAwait(uint64_t waitFor = UINT64_MAX);
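    // Illustrative only, not part of the upstream header: a hedged sketch of
    // the asynchronous flow described above. "sq" and "algorithm" are
    // hypothetical objects assumed to come from a kp::Manager:
    //
    //   sq->evalAsync<kp::OpAlgoDispatch>(algorithm); // submit, no blocking
    //   // ... overlap CPU work here while the GPU executes ...
    //   sq->evalAwait(); // waits on the fence, then runs postEval of all ops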
    /**
     * Clear function clears all operations currently recorded and starts
     * recording again.
     */
    void clear();

    /**
     * Return the timestamps that were latched at the beginning and
     * after each operation during the last eval() call.
     */
    std::vector<std::uint64_t> getTimestamps();

    /**
     * Begins recording commands for commands to be submitted into the command
     * buffer.
     */
    void begin();

    /**
     * Ends the recording and stops recording commands when the record command
     * is sent.
     */
    void end();

    /**
     * Returns true if the sequence is currently in recording mode.
     *
     * @return Boolean stating if recording ongoing.
     */
    bool isRecording() const;

    /**
     * Returns true if the sequence has been initialised; this is based on the
     * GPU resources being referenced.
     *
     * @return Boolean stating if is initialized
     */
    bool isInit() const;

    /**
     * Clears command buffer and triggers re-record of all the current
     * operations saved, which is useful if the underlying kp::Tensors or
     * kp::Algorithms are modified and need to be re-recorded.
     */
    void rerecord();

    /**
     * Returns true if the sequence is currently running - mostly used for
     * async workloads.
     *
     * @return Boolean stating if currently running.
     */
    bool isRunning() const;

    /**
     * Destroys and frees the GPU resources which include the buffer and memory
     * and sets the sequence as init=False.
     */
    void destroy();

  private:
    // -------------- NEVER OWNED RESOURCES
    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
    std::shared_ptr<vk::Device> mDevice = nullptr;
    std::shared_ptr<vk::Queue> mComputeQueue = nullptr;
    uint32_t mQueueIndex = -1;

    // -------------- OPTIONALLY OWNED RESOURCES
    std::shared_ptr<vk::CommandPool> mCommandPool = nullptr;
    bool mFreeCommandPool = false;
    std::shared_ptr<vk::CommandBuffer> mCommandBuffer = nullptr;
    bool mFreeCommandBuffer = false;

    // -------------- ALWAYS OWNED RESOURCES
    vk::Fence mFence;
    std::vector<std::shared_ptr<OpBase>> mOperations{};
    std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;

    // State
    bool mRecording = false;
    bool mIsRunning = false;

    // Create functions
    void createCommandPool();
    void createCommandBuffer();
    void createTimestampQueryPool(uint32_t totalTimestamps);
};

} // End namespace kp
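// Illustrative only, not part of the upstream header: a hedged sketch of
// collecting per-operation timestamps, assuming the sequence was created with
// totalTimestamps > 0 (e.g. via a hypothetical mgr.sequence(0, numTimestamps))
// and that the device supports timestamp queries:
//
//   sq->record<kp::OpAlgoDispatch>(algorithm)->eval();
//   std::vector<std::uint64_t> ts = sq->getTimestamps();
//   // ts[0] is latched at the start of the submission; each subsequent
//   // entry is latched after one recorded operation completes.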