perf: use second GPU queue for parallel texture/buffer uploads

Request 2 queues from the graphics family when available (NVIDIA
exposes 16, AMD 2+). Upload batches now submit to queue[1] while
rendering uses queue[0], enabling parallel GPU transfers without
queue-family ownership transfer barriers (same family).

Falls back to the single-queue path on GPUs whose graphics family exposes
only 1 queue. Upload command buffers come from a dedicated transfer command
pool so they never contend with the immediate-submit pool.
This commit is contained in:
Kelsi 2026-03-24 14:09:16 -07:00
parent ed0cb0ad25
commit 1dd3823013
2 changed files with 144 additions and 21 deletions

View file

@ -78,6 +78,7 @@ public:
// True when the active GPU reports NVIDIA's PCI vendor ID (0x10DE).
bool isNvidiaGpu() const { return gpuVendorId_ == 0x10DE; }
// Queue used for rendering submissions (queue index 0 of the graphics family).
VkQueue getGraphicsQueue() const { return graphicsQueue; }
// Index of the queue family that graphicsQueue belongs to.
uint32_t getGraphicsQueueFamily() const { return graphicsQueueFamily; }
// True when a second queue from the graphics family was obtained for uploads.
bool hasDedicatedTransferQueue() const { return hasDedicatedTransfer_; }
// VMA allocator handle used for buffer/image allocations.
VmaAllocator getAllocator() const { return allocator; }
// Window surface the swapchain presents to.
VkSurfaceKHR getSurface() const { return surface; }
// Pipeline cache handle (persisted to disk on shutdown via savePipelineCache).
VkPipelineCache getPipelineCache() const { return pipelineCache_; }
@ -175,6 +176,12 @@ private:
uint32_t graphicsQueueFamily = 0;
uint32_t presentQueueFamily = 0;
// Dedicated transfer queue: second queue (index 1) from the same graphics
// family, so uploads need no queue-family ownership-transfer barriers.
VkQueue transferQueue_ = VK_NULL_HANDLE;
// Command pool used exclusively for transfer-queue command buffers, kept
// separate from the immediate-submit pool to avoid contention.
VkCommandPool transferCommandPool_ = VK_NULL_HANDLE;
// Set when the graphics family exposes >= 2 queues and queue 1 was retrieved.
bool hasDedicatedTransfer_ = false;
uint32_t graphicsQueueFamilyQueueCount_ = 1; // queried in selectPhysicalDevice
// Swapchain
VkSwapchainKHR swapchain = VK_NULL_HANDLE;
VkFormat swapchainFormat = VK_FORMAT_UNDEFINED;

View file

@ -135,6 +135,7 @@ void VkContext::shutdown() {
if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; }
if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; }
if (transferCommandPool_) { vkDestroyCommandPool(device, transferCommandPool_, nullptr); transferCommandPool_ = VK_NULL_HANDLE; }
// Persist pipeline cache to disk before tearing down the device.
savePipelineCache();
@ -328,11 +329,52 @@ bool VkContext::selectPhysicalDevice() {
VK_VERSION_MINOR(props.apiVersion), ".", VK_VERSION_PATCH(props.apiVersion));
LOG_INFO("Depth resolve support: ", depthResolveSupported_ ? "YES" : "NO");
// Probe queue families to see if the graphics family supports multiple queues
// (used in createLogicalDevice to request a second queue for parallel uploads).
auto queueFamilies = vkbPhysicalDevice_.get_queue_families();
for (uint32_t i = 0; i < static_cast<uint32_t>(queueFamilies.size()); i++) {
if (queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
graphicsQueueFamilyQueueCount_ = queueFamilies[i].queueCount;
LOG_INFO("Graphics queue family ", i, " supports ", graphicsQueueFamilyQueueCount_, " queue(s)");
break;
}
}
return true;
}
bool VkContext::createLogicalDevice() {
vkb::DeviceBuilder deviceBuilder{vkbPhysicalDevice_};
// If the graphics queue family supports >= 2 queues, request a second one
// for parallel texture/buffer uploads. Both queues share the same family
// so no queue-ownership-transfer barriers are needed.
const bool requestTransferQueue = (graphicsQueueFamilyQueueCount_ >= 2);
if (requestTransferQueue) {
// Build a custom queue description list: 2 queues from the graphics
// family, 1 queue from every other family (so present etc. still work).
auto families = vkbPhysicalDevice_.get_queue_families();
uint32_t gfxFamily = UINT32_MAX;
for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) {
if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
gfxFamily = i;
break;
}
}
std::vector<vkb::CustomQueueDescription> queueDescs;
for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) {
if (i == gfxFamily) {
// Request 2 queues: [0] graphics, [1] transfer uploads
queueDescs.emplace_back(i, std::vector<float>{1.0f, 1.0f});
} else {
queueDescs.emplace_back(i, std::vector<float>{1.0f});
}
}
deviceBuilder.custom_queue_setup(queueDescs);
}
auto devRet = deviceBuilder.build();
if (!devRet) {
LOG_ERROR("Failed to create Vulkan logical device: ", devRet.error().message());
@ -342,22 +384,45 @@ bool VkContext::createLogicalDevice() {
auto vkbDevice = devRet.value();
device = vkbDevice.device;
auto gqRet = vkbDevice.get_queue(vkb::QueueType::graphics);
if (!gqRet) {
LOG_ERROR("Failed to get graphics queue");
return false;
}
graphicsQueue = gqRet.value();
graphicsQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::graphics).value();
if (requestTransferQueue) {
// With custom_queue_setup, we must retrieve queues manually.
auto families = vkbPhysicalDevice_.get_queue_families();
uint32_t gfxFamily = UINT32_MAX;
for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) {
if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
gfxFamily = i;
break;
}
}
graphicsQueueFamily = gfxFamily;
vkGetDeviceQueue(device, gfxFamily, 0, &graphicsQueue);
vkGetDeviceQueue(device, gfxFamily, 1, &transferQueue_);
hasDedicatedTransfer_ = true;
auto pqRet = vkbDevice.get_queue(vkb::QueueType::present);
if (!pqRet) {
// Fall back to graphics queue for presentation
// Present queue: try the graphics family first (most common), otherwise
// find a family that supports presentation.
presentQueue = graphicsQueue;
presentQueueFamily = graphicsQueueFamily;
presentQueueFamily = gfxFamily;
LOG_INFO("Dedicated transfer queue enabled (family ", gfxFamily, ", queue index 1)");
} else {
presentQueue = pqRet.value();
presentQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::present).value();
// Standard path — let vkb resolve queues.
auto gqRet = vkbDevice.get_queue(vkb::QueueType::graphics);
if (!gqRet) {
LOG_ERROR("Failed to get graphics queue");
return false;
}
graphicsQueue = gqRet.value();
graphicsQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::graphics).value();
auto pqRet = vkbDevice.get_queue(vkb::QueueType::present);
if (!pqRet) {
presentQueue = graphicsQueue;
presentQueueFamily = graphicsQueueFamily;
} else {
presentQueue = pqRet.value();
presentQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::present).value();
}
}
LOG_INFO("Vulkan logical device created");
@ -588,6 +653,19 @@ bool VkContext::createCommandPools() {
return false;
}
// Separate command pool for the transfer queue (same family, different queue)
if (hasDedicatedTransfer_) {
VkCommandPoolCreateInfo transferPoolInfo{};
transferPoolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
transferPoolInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
transferPoolInfo.queueFamilyIndex = graphicsQueueFamily;
if (vkCreateCommandPool(device, &transferPoolInfo, nullptr, &transferCommandPool_) != VK_SUCCESS) {
LOG_ERROR("Failed to create transfer command pool");
return false;
}
}
return true;
}
@ -1709,7 +1787,21 @@ void VkContext::beginUploadBatch() {
uploadBatchDepth_++;
if (inUploadBatch_) return; // already in a batch (nested call)
inUploadBatch_ = true;
batchCmd_ = beginSingleTimeCommands();
// Allocate from transfer pool if available, otherwise from immCommandPool.
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
VkCommandBufferAllocateInfo allocInfo{};
allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
allocInfo.commandPool = pool;
allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
allocInfo.commandBufferCount = 1;
vkAllocateCommandBuffers(device, &allocInfo, &batchCmd_);
VkCommandBufferBeginInfo beginInfo{};
beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
vkBeginCommandBuffer(batchCmd_, &beginInfo);
}
void VkContext::endUploadBatch() {
@ -1719,10 +1811,12 @@ void VkContext::endUploadBatch() {
inUploadBatch_ = false;
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
if (batchStagingBuffers_.empty()) {
// No GPU copies were recorded — skip the submit entirely.
vkEndCommandBuffer(batchCmd_);
vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
vkFreeCommandBuffers(device, pool, 1, &batchCmd_);
batchCmd_ = VK_NULL_HANDLE;
return;
}
@ -1739,7 +1833,10 @@ void VkContext::endUploadBatch() {
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &batchCmd_;
vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence);
// Submit to the dedicated transfer queue if available, otherwise graphics.
VkQueue targetQueue = hasDedicatedTransfer_ ? transferQueue_ : graphicsQueue;
vkQueueSubmit(targetQueue, 1, &submitInfo, fence);
// Stash everything for later cleanup when fence signals
InFlightBatch batch;
@ -1759,15 +1856,30 @@ void VkContext::endUploadBatchSync() {
inUploadBatch_ = false;
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
if (batchStagingBuffers_.empty()) {
vkEndCommandBuffer(batchCmd_);
vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
vkFreeCommandBuffers(device, pool, 1, &batchCmd_);
batchCmd_ = VK_NULL_HANDLE;
return;
}
// Synchronous path for load screens — submit and wait
endSingleTimeCommands(batchCmd_);
// Synchronous path for load screens — submit and wait on the target queue.
VkQueue targetQueue = hasDedicatedTransfer_ ? transferQueue_ : graphicsQueue;
vkEndCommandBuffer(batchCmd_);
VkSubmitInfo submitInfo{};
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &batchCmd_;
vkQueueSubmit(targetQueue, 1, &submitInfo, immFence);
vkWaitForFences(device, 1, &immFence, VK_TRUE, UINT64_MAX);
vkResetFences(device, 1, &immFence);
vkFreeCommandBuffers(device, pool, 1, &batchCmd_);
batchCmd_ = VK_NULL_HANDLE;
for (auto& staging : batchStagingBuffers_) {
@ -1779,6 +1891,8 @@ void VkContext::endUploadBatchSync() {
void VkContext::pollUploadBatches() {
if (inFlightBatches_.empty()) return;
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ) {
VkResult result = vkGetFenceStatus(device, it->fence);
if (result == VK_SUCCESS) {
@ -1786,7 +1900,7 @@ void VkContext::pollUploadBatches() {
for (auto& staging : it->stagingBuffers) {
destroyBuffer(allocator, staging);
}
vkFreeCommandBuffers(device, immCommandPool, 1, &it->cmd);
vkFreeCommandBuffers(device, pool, 1, &it->cmd);
vkDestroyFence(device, it->fence, nullptr);
it = inFlightBatches_.erase(it);
} else {
@ -1796,12 +1910,14 @@ void VkContext::pollUploadBatches() {
}
void VkContext::waitAllUploads() {
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
for (auto& batch : inFlightBatches_) {
vkWaitForFences(device, 1, &batch.fence, VK_TRUE, UINT64_MAX);
for (auto& staging : batch.stagingBuffers) {
destroyBuffer(allocator, staging);
}
vkFreeCommandBuffers(device, immCommandPool, 1, &batch.cmd);
vkFreeCommandBuffers(device, pool, 1, &batch.cmd);
vkDestroyFence(device, batch.fence, nullptr);
}
inFlightBatches_.clear();