From 1dd382301330b7e2e7f1230515e063f7a3e4ceff Mon Sep 17 00:00:00 2001 From: Kelsi Date: Tue, 24 Mar 2026 14:09:16 -0700 Subject: [PATCH] perf: use second GPU queue for parallel texture/buffer uploads Request 2 queues from the graphics family when available (NVIDIA exposes 16, AMD 2+). Upload batches now submit to queue[1] while rendering uses queue[0], enabling parallel GPU transfers without queue-family ownership transfer barriers (same family). Falls back to single-queue path on GPUs with only 1 queue in the graphics family. Transfer command pool is separate to avoid contention. --- include/rendering/vk_context.hpp | 7 ++ src/rendering/vk_context.cpp | 158 +++++++++++++++++++++++++++---- 2 files changed, 144 insertions(+), 21 deletions(-) diff --git a/include/rendering/vk_context.hpp b/include/rendering/vk_context.hpp index fbc16e2a..c9926cf5 100644 --- a/include/rendering/vk_context.hpp +++ b/include/rendering/vk_context.hpp @@ -78,6 +78,7 @@ public: bool isNvidiaGpu() const { return gpuVendorId_ == 0x10DE; } VkQueue getGraphicsQueue() const { return graphicsQueue; } uint32_t getGraphicsQueueFamily() const { return graphicsQueueFamily; } + bool hasDedicatedTransferQueue() const { return hasDedicatedTransfer_; } VmaAllocator getAllocator() const { return allocator; } VkSurfaceKHR getSurface() const { return surface; } VkPipelineCache getPipelineCache() const { return pipelineCache_; } @@ -175,6 +176,12 @@ private: uint32_t graphicsQueueFamily = 0; uint32_t presentQueueFamily = 0; + // Dedicated transfer queue (second queue from same graphics family) + VkQueue transferQueue_ = VK_NULL_HANDLE; + VkCommandPool transferCommandPool_ = VK_NULL_HANDLE; + bool hasDedicatedTransfer_ = false; + uint32_t graphicsQueueFamilyQueueCount_ = 1; // queried in selectPhysicalDevice + // Swapchain VkSwapchainKHR swapchain = VK_NULL_HANDLE; VkFormat swapchainFormat = VK_FORMAT_UNDEFINED; diff --git a/src/rendering/vk_context.cpp b/src/rendering/vk_context.cpp index 
323af430..3314ff83 100644 --- a/src/rendering/vk_context.cpp +++ b/src/rendering/vk_context.cpp @@ -135,6 +135,7 @@ void VkContext::shutdown() { if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; } if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; } + if (transferCommandPool_) { vkDestroyCommandPool(device, transferCommandPool_, nullptr); transferCommandPool_ = VK_NULL_HANDLE; } // Persist pipeline cache to disk before tearing down the device. savePipelineCache(); @@ -328,11 +329,52 @@ bool VkContext::selectPhysicalDevice() { VK_VERSION_MINOR(props.apiVersion), ".", VK_VERSION_PATCH(props.apiVersion)); LOG_INFO("Depth resolve support: ", depthResolveSupported_ ? "YES" : "NO"); + // Probe queue families to see if the graphics family supports multiple queues + // (used in createLogicalDevice to request a second queue for parallel uploads). + auto queueFamilies = vkbPhysicalDevice_.get_queue_families(); + for (uint32_t i = 0; i < static_cast<uint32_t>(queueFamilies.size()); i++) { + if (queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { + graphicsQueueFamilyQueueCount_ = queueFamilies[i].queueCount; + LOG_INFO("Graphics queue family ", i, " supports ", graphicsQueueFamilyQueueCount_, " queue(s)"); + break; + } + } + return true; } bool VkContext::createLogicalDevice() { vkb::DeviceBuilder deviceBuilder{vkbPhysicalDevice_}; + + // If the graphics queue family supports >= 2 queues, request a second one + // for parallel texture/buffer uploads. Both queues share the same family + // so no queue-ownership-transfer barriers are needed. + const bool requestTransferQueue = (graphicsQueueFamilyQueueCount_ >= 2); + + if (requestTransferQueue) { + // Build a custom queue description list: 2 queues from the graphics + // family, 1 queue from every other family (so present etc. still work). 
+ auto families = vkbPhysicalDevice_.get_queue_families(); + uint32_t gfxFamily = UINT32_MAX; + for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) { + if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { + gfxFamily = i; + break; + } + } + + std::vector<vkb::CustomQueueDescription> queueDescs; + for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) { + if (i == gfxFamily) { + // Request 2 queues: [0] graphics, [1] transfer uploads + queueDescs.emplace_back(i, std::vector<float>{1.0f, 1.0f}); + } else { + queueDescs.emplace_back(i, std::vector<float>{1.0f}); + } + } + deviceBuilder.custom_queue_setup(queueDescs); + } + auto devRet = deviceBuilder.build(); if (!devRet) { LOG_ERROR("Failed to create Vulkan logical device: ", devRet.error().message()); @@ -342,22 +384,45 @@ bool VkContext::createLogicalDevice() { auto vkbDevice = devRet.value(); device = vkbDevice.device; - auto gqRet = vkbDevice.get_queue(vkb::QueueType::graphics); - if (!gqRet) { - LOG_ERROR("Failed to get graphics queue"); - return false; - } - graphicsQueue = gqRet.value(); - graphicsQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::graphics).value(); + if (requestTransferQueue) { + // With custom_queue_setup, we must retrieve queues manually. + auto families = vkbPhysicalDevice_.get_queue_families(); + uint32_t gfxFamily = UINT32_MAX; + for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) { + if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { + gfxFamily = i; + break; + } + } + graphicsQueueFamily = gfxFamily; + vkGetDeviceQueue(device, gfxFamily, 0, &graphicsQueue); + vkGetDeviceQueue(device, gfxFamily, 1, &transferQueue_); + hasDedicatedTransfer_ = true; - auto pqRet = vkbDevice.get_queue(vkb::QueueType::present); - if (!pqRet) { - // Fall back to graphics queue for presentation + // Present queue: try the graphics family first (most common), otherwise + // find a family that supports presentation. 
presentQueue = graphicsQueue; - presentQueueFamily = graphicsQueueFamily; + presentQueueFamily = gfxFamily; + + LOG_INFO("Dedicated transfer queue enabled (family ", gfxFamily, ", queue index 1)"); } else { - presentQueue = pqRet.value(); - presentQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::present).value(); + // Standard path — let vkb resolve queues. + auto gqRet = vkbDevice.get_queue(vkb::QueueType::graphics); + if (!gqRet) { + LOG_ERROR("Failed to get graphics queue"); + return false; + } + graphicsQueue = gqRet.value(); + graphicsQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::graphics).value(); + + auto pqRet = vkbDevice.get_queue(vkb::QueueType::present); + if (!pqRet) { + presentQueue = graphicsQueue; + presentQueueFamily = graphicsQueueFamily; + } else { + presentQueue = pqRet.value(); + presentQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::present).value(); + } } LOG_INFO("Vulkan logical device created"); @@ -588,6 +653,19 @@ bool VkContext::createCommandPools() { return false; } + // Separate command pool for the transfer queue (same family, different queue) + if (hasDedicatedTransfer_) { + VkCommandPoolCreateInfo transferPoolInfo{}; + transferPoolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + transferPoolInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + transferPoolInfo.queueFamilyIndex = graphicsQueueFamily; + + if (vkCreateCommandPool(device, &transferPoolInfo, nullptr, &transferCommandPool_) != VK_SUCCESS) { + LOG_ERROR("Failed to create transfer command pool"); + return false; + } + } + return true; } @@ -1709,7 +1787,21 @@ void VkContext::beginUploadBatch() { uploadBatchDepth_++; if (inUploadBatch_) return; // already in a batch (nested call) inUploadBatch_ = true; - batchCmd_ = beginSingleTimeCommands(); + + // Allocate from transfer pool if available, otherwise from immCommandPool. + VkCommandPool pool = hasDedicatedTransfer_ ? 
transferCommandPool_ : immCommandPool; + + VkCommandBufferAllocateInfo allocInfo{}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.commandPool = pool; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandBufferCount = 1; + vkAllocateCommandBuffers(device, &allocInfo, &batchCmd_); + + VkCommandBufferBeginInfo beginInfo{}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + vkBeginCommandBuffer(batchCmd_, &beginInfo); } void VkContext::endUploadBatch() { @@ -1719,10 +1811,12 @@ void VkContext::endUploadBatch() { inUploadBatch_ = false; + VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool; + if (batchStagingBuffers_.empty()) { // No GPU copies were recorded — skip the submit entirely. vkEndCommandBuffer(batchCmd_); - vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_); + vkFreeCommandBuffers(device, pool, 1, &batchCmd_); batchCmd_ = VK_NULL_HANDLE; return; } @@ -1739,7 +1833,10 @@ void VkContext::endUploadBatch() { submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; submitInfo.commandBufferCount = 1; submitInfo.pCommandBuffers = &batchCmd_; - vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence); + + // Submit to the dedicated transfer queue if available, otherwise graphics. + VkQueue targetQueue = hasDedicatedTransfer_ ? transferQueue_ : graphicsQueue; + vkQueueSubmit(targetQueue, 1, &submitInfo, fence); // Stash everything for later cleanup when fence signals InFlightBatch batch; @@ -1759,15 +1856,30 @@ void VkContext::endUploadBatchSync() { inUploadBatch_ = false; + VkCommandPool pool = hasDedicatedTransfer_ ? 
transferCommandPool_ : immCommandPool; + if (batchStagingBuffers_.empty()) { vkEndCommandBuffer(batchCmd_); - vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_); + vkFreeCommandBuffers(device, pool, 1, &batchCmd_); batchCmd_ = VK_NULL_HANDLE; return; } - // Synchronous path for load screens — submit and wait - endSingleTimeCommands(batchCmd_); + // Synchronous path for load screens — submit and wait on the target queue. + VkQueue targetQueue = hasDedicatedTransfer_ ? transferQueue_ : graphicsQueue; + + vkEndCommandBuffer(batchCmd_); + + VkSubmitInfo submitInfo{}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &batchCmd_; + + vkQueueSubmit(targetQueue, 1, &submitInfo, immFence); + vkWaitForFences(device, 1, &immFence, VK_TRUE, UINT64_MAX); + vkResetFences(device, 1, &immFence); + + vkFreeCommandBuffers(device, pool, 1, &batchCmd_); batchCmd_ = VK_NULL_HANDLE; for (auto& staging : batchStagingBuffers_) { @@ -1779,6 +1891,8 @@ void VkContext::endUploadBatchSync() { void VkContext::pollUploadBatches() { if (inFlightBatches_.empty()) return; + VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool; + for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ) { VkResult result = vkGetFenceStatus(device, it->fence); if (result == VK_SUCCESS) { @@ -1786,7 +1900,7 @@ void VkContext::pollUploadBatches() { for (auto& staging : it->stagingBuffers) { destroyBuffer(allocator, staging); } - vkFreeCommandBuffers(device, immCommandPool, 1, &it->cmd); + vkFreeCommandBuffers(device, pool, 1, &it->cmd); vkDestroyFence(device, it->fence, nullptr); it = inFlightBatches_.erase(it); } else { @@ -1796,12 +1910,14 @@ void VkContext::pollUploadBatches() { } void VkContext::waitAllUploads() { + VkCommandPool pool = hasDedicatedTransfer_ ? 
transferCommandPool_ : immCommandPool; + for (auto& batch : inFlightBatches_) { vkWaitForFences(device, 1, &batch.fence, VK_TRUE, UINT64_MAX); for (auto& staging : batch.stagingBuffers) { destroyBuffer(allocator, staging); } - vkFreeCommandBuffers(device, immCommandPool, 1, &batch.cmd); + vkFreeCommandBuffers(device, pool, 1, &batch.cmd); vkDestroyFence(device, batch.fence, nullptr); } inFlightBatches_.clear();