Line data Source code
1 : // SPDX-License-Identifier: Apache-2.0 2 : #pragma once 3 : 4 : #include "kompute/Core.hpp" 5 : 6 : #include "kompute/operations/OpAlgoDispatch.hpp" 7 : #include "kompute/operations/OpBase.hpp" 8 : 9 : namespace kp { 10 : 11 : /** 12 : * Container of operations that can be sent to GPU as batch 13 : */ 14 : class Sequence : public std::enable_shared_from_this<Sequence> 15 : { 16 : public: 17 : /** 18 : * Main constructor for sequence which requires core vulkan components to 19 : * generate all dependent resources. 20 : * 21 : * @param physicalDevice Vulkan physical device 22 : * @param device Vulkan logical device 23 : * @param computeQueue Vulkan compute queue 24 : * @param queueIndex Vulkan compute queue index in device 25 : * @param totalTimestamps Maximum number of timestamps to allocate 26 : */ 27 : Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice, 28 : std::shared_ptr<vk::Device> device, 29 : std::shared_ptr<vk::Queue> computeQueue, 30 : uint32_t queueIndex, 31 : uint32_t totalTimestamps = 0); 32 : /** 33 : * Destructor for sequence which is responsible for cleaning all subsequent 34 : * owned operations. 35 : */ 36 : ~Sequence(); 37 : 38 : /** 39 : * Record function for operation to be added to the GPU queue in batch. This 40 : * template requires classes to be derived from the OpBase class. This 41 : * function also requires the Sequence to be recording, otherwise it will 42 : * not be able to add the operation. 43 : * 44 : * @param op Object derived from kp::BaseOp that will be recoreded by the 45 : * sequence which will be used when the operation is evaluated. 46 : * @return shared_ptr<Sequence> of the Sequence class itself 47 : */ 48 : std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op); 49 : 50 : /** 51 : * Record function for operation to be added to the GPU queue in batch. This 52 : * template requires classes to be derived from the OpBase class. This 53 : * function also requires the Sequence to be recording, otherwise it will 54 : * not be able to add the operation. 55 : * 56 : * @param tensors Vector of tensors to use for the operation 57 : * @param TArgs Template parameters that are used to initialise operation 58 : * which allows for extensible configurations on initialisation. 59 : * @return shared_ptr<Sequence> of the Sequence class itself 60 : */ 61 : template<typename T, typename... TArgs> 62 31 : std::shared_ptr<Sequence> record( 63 : std::vector<std::shared_ptr<Tensor>> tensors, 64 : TArgs&&... params) 65 : { 66 32 : std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) }; 67 62 : return this->record(op); 68 31 : } 69 : /** 70 : * Record function for operation to be added to the GPU queue in batch. This 71 : * template requires classes to be derived from the OpBase class. This 72 : * function also requires the Sequence to be recording, otherwise it will 73 : * not be able to add the operation. 74 : * 75 : * @param algorithm Algorithm to use for the record often used for OpAlgo 76 : * operations 77 : * @param TArgs Template parameters that are used to initialise operation 78 : * which allows for extensible configurations on initialisation. 79 : * @return shared_ptr<Sequence> of the Sequence class itself 80 : */ 81 : template<typename T, typename... TArgs> 82 24 : std::shared_ptr<Sequence> record(std::shared_ptr<Algorithm> algorithm, 83 : TArgs&&... params) 84 : { 85 24 : std::shared_ptr<T> op{ new T(algorithm, 86 2 : std::forward<TArgs>(params)...) }; 87 48 : return this->record(op); 88 24 : } 89 : 90 : /** 91 : * Eval sends all the recorded and stored operations in the vector of 92 : * operations into the gpu as a submit job synchronously (with a barrier). 93 : * 94 : * @return shared_ptr<Sequence> of the Sequence class itself 95 : */ 96 : std::shared_ptr<Sequence> eval(); 97 : 98 : /** 99 : * Resets all the recorded and stored operations, records the operation 100 : * provided and submits into the gpu as a submit job synchronously (with a 101 : * barrier). 102 : * 103 : * @return shared_ptr<Sequence> of the Sequence class itself 104 : */ 105 : std::shared_ptr<Sequence> eval(std::shared_ptr<OpBase> op); 106 : 107 : /** 108 : * Eval sends all the recorded and stored operations in the vector of 109 : * operations into the gpu as a submit job with a barrier. 110 : * 111 : * @param tensors Vector of tensors to use for the operation 112 : * @param TArgs Template parameters that are used to initialise operation 113 : * which allows for extensible configurations on initialisation. 114 : * @return shared_ptr<Sequence> of the Sequence class itself 115 : */ 116 : template<typename T, typename... TArgs> 117 53 : std::shared_ptr<Sequence> eval(std::vector<std::shared_ptr<Tensor>> tensors, 118 : TArgs&&... params) 119 : { 120 56 : std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) }; 121 104 : return this->eval(op); 122 52 : } 123 : /** 124 : * Eval sends all the recorded and stored operations in the vector of 125 : * operations into the gpu as a submit job with a barrier. 126 : * 127 : * @param algorithm Algorithm to use for the record often used for OpAlgo 128 : * operations 129 : * @param TArgs Template parameters that are used to initialise operation 130 : * which allows for extensible configurations on initialisation. 131 : * @return shared_ptr<Sequence> of the Sequence class itself 132 : */ 133 : template<typename T, typename... TArgs> 134 14 : std::shared_ptr<Sequence> eval(std::shared_ptr<Algorithm> algorithm, 135 : TArgs&&... params) 136 : { 137 14 : std::shared_ptr<T> op{ new T(algorithm, 138 9 : std::forward<TArgs>(params)...) }; 139 28 : return this->eval(op); 140 14 : } 141 : 142 : /** 143 : * Eval Async sends all the recorded and stored operations in the vector of 144 : * operations into the gpu as a submit job without a barrier. EvalAwait() 145 : * must ALWAYS be called after to ensure the sequence is terminated 146 : * correctly. 147 : * 148 : * @return Boolean stating whether execution was successful. 149 : */ 150 : std::shared_ptr<Sequence> evalAsync(); 151 : /** 152 : * Clears currnet operations to record provided one in the vector of 153 : * operations into the gpu as a submit job without a barrier. EvalAwait() 154 : * must ALWAYS be called after to ensure the sequence is terminated 155 : * correctly. 156 : * 157 : * @return Boolean stating whether execution was successful. 158 : */ 159 : std::shared_ptr<Sequence> evalAsync(std::shared_ptr<OpBase> op); 160 : /** 161 : * Eval sends all the recorded and stored operations in the vector of 162 : * operations into the gpu as a submit job with a barrier. 163 : * 164 : * @param tensors Vector of tensors to use for the operation 165 : * @param TArgs Template parameters that are used to initialise operation 166 : * which allows for extensible configurations on initialisation. 167 : * @return shared_ptr<Sequence> of the Sequence class itself 168 : */ 169 : template<typename T, typename... TArgs> 170 3 : std::shared_ptr<Sequence> evalAsync( 171 : std::vector<std::shared_ptr<Tensor>> tensors, 172 : TArgs&&... params) 173 : { 174 3 : std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) }; 175 6 : return this->evalAsync(op); 176 3 : } 177 : /** 178 : * Eval sends all the recorded and stored operations in the vector of 179 : * operations into the gpu as a submit job with a barrier. 180 : * 181 : * @param algorithm Algorithm to use for the record often used for OpAlgo 182 : * operations 183 : * @param TArgs Template parameters that are used to initialise operation 184 : * which allows for extensible configurations on initialisation. 185 : * @return shared_ptr<Sequence> of the Sequence class itself 186 : */ 187 : template<typename T, typename... TArgs> 188 4 : std::shared_ptr<Sequence> evalAsync(std::shared_ptr<Algorithm> algorithm, 189 : TArgs&&... params) 190 : { 191 4 : std::shared_ptr<T> op{ new T(algorithm, 192 : std::forward<TArgs>(params)...) }; 193 8 : return this->evalAsync(op); 194 4 : } 195 : 196 : /** 197 : * Eval Await waits for the fence to finish processing and then once it 198 : * finishes, it runs the postEval of all operations. 199 : * 200 : * @param waitFor Number of milliseconds to wait before timing out. 201 : * @return shared_ptr<Sequence> of the Sequence class itself 202 : */ 203 : std::shared_ptr<Sequence> evalAwait(uint64_t waitFor = UINT64_MAX); 204 : 205 : /** 206 : * Clear function clears all operations currently recorded and starts 207 : * recording again. 208 : */ 209 : void clear(); 210 : 211 : /** 212 : * Return the timestamps that were latched at the beginning and 213 : * after each operation during the last eval() call. 214 : */ 215 : std::vector<std::uint64_t> getTimestamps(); 216 : 217 : /** 218 : * Begins recording commands for commands to be submitted into the command 219 : * buffer. 220 : */ 221 : void begin(); 222 : 223 : /** 224 : * Ends the recording and stops recording commands when the record command 225 : * is sent. 226 : */ 227 : void end(); 228 : 229 : /** 230 : * Returns true if the sequence is currently in recording activated. 231 : * 232 : * @return Boolean stating if recording ongoing. 233 : */ 234 : bool isRecording() const; 235 : 236 : /** 237 : * Returns true if the sequence has been initialised, and it's based on the 238 : * GPU resources being referenced. 239 : * 240 : * @return Boolean stating if is initialized 241 : */ 242 : bool isInit() const; 243 : 244 : /** 245 : * Clears command buffer and triggers re-record of all the current 246 : * operations saved, which is useful if the underlying kp::Tensors or 247 : * kp::Algorithms are modified and need to be re-recorded. 248 : */ 249 : void rerecord(); 250 : 251 : /** 252 : * Returns true if the sequence is currently running - mostly used for async 253 : * workloads. 254 : * 255 : * @return Boolean stating if currently running. 256 : */ 257 : bool isRunning() const; 258 : 259 : /** 260 : * Destroys and frees the GPU resources which include the buffer and memory 261 : * and sets the sequence as init=False. 262 : */ 263 : void destroy(); 264 : 265 : private: 266 : // -------------- NEVER OWNED RESOURCES 267 : std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr; 268 : std::shared_ptr<vk::Device> mDevice = nullptr; 269 : std::shared_ptr<vk::Queue> mComputeQueue = nullptr; 270 : uint32_t mQueueIndex = -1; 271 : 272 : // -------------- OPTIONALLY OWNED RESOURCES 273 : std::shared_ptr<vk::CommandPool> mCommandPool = nullptr; 274 : bool mFreeCommandPool = false; 275 : std::shared_ptr<vk::CommandBuffer> mCommandBuffer = nullptr; 276 : bool mFreeCommandBuffer = false; 277 : 278 : // -------------- ALWAYS OWNED RESOURCES 279 : vk::Fence mFence; 280 : std::vector<std::shared_ptr<OpBase>> mOperations{}; 281 : std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr; 282 : 283 : // State 284 : bool mRecording = false; 285 : bool mIsRunning = false; 286 : 287 : // Create functions 288 : void createCommandPool(); 289 : void createCommandBuffer(); 290 : void createTimestampQueryPool(uint32_t totalTimestamps); 291 : }; 292 : 293 : } // End namespace kp