From 1dd382301330b7e2e7f1230515e063f7a3e4ceff Mon Sep 17 00:00:00 2001 From: Kelsi Date: Tue, 24 Mar 2026 14:09:16 -0700 Subject: [PATCH] perf: use second GPU queue for parallel texture/buffer uploads Request 2 queues from the graphics family when available (NVIDIA exposes 16, AMD 2+). Upload batches now submit to queue[1] while rendering uses queue[0], enabling parallel GPU transfers without queue-family ownership transfer barriers (same family). Falls back to single-queue path on GPUs with only 1 queue in the graphics family. Transfer command pool is separate to avoid contention. --- include/rendering/vk_context.hpp | 7 ++ src/rendering/vk_context.cpp | 158 +++++++++++++++++++++++++++---- 2 files changed, 144 insertions(+), 21 deletions(-) diff --git a/include/rendering/vk_context.hpp b/include/rendering/vk_context.hpp index fbc16e2a..c9926cf5 100644 --- a/include/rendering/vk_context.hpp +++ b/include/rendering/vk_context.hpp @@ -78,6 +78,7 @@ public: bool isNvidiaGpu() const { return gpuVendorId_ == 0x10DE; } VkQueue getGraphicsQueue() const { return graphicsQueue; } uint32_t getGraphicsQueueFamily() const { return graphicsQueueFamily; } + bool hasDedicatedTransferQueue() const { return hasDedicatedTransfer_; } VmaAllocator getAllocator() const { return allocator; } VkSurfaceKHR getSurface() const { return surface; } VkPipelineCache getPipelineCache() const { return pipelineCache_; } @@ -175,6 +176,12 @@ private: uint32_t graphicsQueueFamily = 0; uint32_t presentQueueFamily = 0; + // Dedicated transfer queue (second queue from same graphics family) + VkQueue transferQueue_ = VK_NULL_HANDLE; + VkCommandPool transferCommandPool_ = VK_NULL_HANDLE; + bool hasDedicatedTransfer_ = false; + uint32_t graphicsQueueFamilyQueueCount_ = 1; // queried in selectPhysicalDevice + // Swapchain VkSwapchainKHR swapchain = VK_NULL_HANDLE; VkFormat swapchainFormat = VK_FORMAT_UNDEFINED; diff --git a/src/rendering/vk_context.cpp b/src/rendering/vk_context.cpp index 
323af430..3314ff83 100644 --- a/src/rendering/vk_context.cpp +++ b/src/rendering/vk_context.cpp @@ -135,6 +135,7 @@ void VkContext::shutdown() { if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; } if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; } + if (transferCommandPool_) { vkDestroyCommandPool(device, transferCommandPool_, nullptr); transferCommandPool_ = VK_NULL_HANDLE; } // Persist pipeline cache to disk before tearing down the device. savePipelineCache(); @@ -328,11 +329,52 @@ bool VkContext::selectPhysicalDevice() { VK_VERSION_MINOR(props.apiVersion), ".", VK_VERSION_PATCH(props.apiVersion)); LOG_INFO("Depth resolve support: ", depthResolveSupported_ ? "YES" : "NO"); + // Probe queue families to see if the graphics family supports multiple queues + // (used in createLogicalDevice to request a second queue for parallel uploads). + auto queueFamilies = vkbPhysicalDevice_.get_queue_families(); + for (uint32_t i = 0; i < static_cast<uint32_t>(queueFamilies.size()); i++) { + if (queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { + graphicsQueueFamilyQueueCount_ = queueFamilies[i].queueCount; + LOG_INFO("Graphics queue family ", i, " supports ", graphicsQueueFamilyQueueCount_, " queue(s)"); + break; + } + } + return true; } bool VkContext::createLogicalDevice() { vkb::DeviceBuilder deviceBuilder{vkbPhysicalDevice_}; + + // If the graphics queue family supports >= 2 queues, request a second one + // for parallel texture/buffer uploads. Both queues share the same family + // so no queue-ownership-transfer barriers are needed. + const bool requestTransferQueue = (graphicsQueueFamilyQueueCount_ >= 2); + + if (requestTransferQueue) { + // Build a custom queue description list: 2 queues from the graphics + // family, 1 queue from every other family (so present etc. still work). 
+ auto families = vkbPhysicalDevice_.get_queue_families(); + uint32_t gfxFamily = UINT32_MAX; + for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) { + if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { + gfxFamily = i; + break; + } + } + + std::vector<vkb::CustomQueueDescription> queueDescs; + for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) { + if (i == gfxFamily) { + // Request 2 queues: [0] graphics, [1] transfer uploads + queueDescs.emplace_back(i, std::vector<float>{1.0f, 1.0f}); + } else { + queueDescs.emplace_back(i, std::vector<float>{1.0f}); + } + } + deviceBuilder.custom_queue_setup(queueDescs); + } + auto devRet = deviceBuilder.build(); if (!devRet) { LOG_ERROR("Failed to create Vulkan logical device: ", devRet.error().message()); @@ -342,22 +384,45 @@ bool VkContext::createLogicalDevice() { auto vkbDevice = devRet.value(); device = vkbDevice.device; - auto gqRet = vkbDevice.get_queue(vkb::QueueType::graphics); - if (!gqRet) { - LOG_ERROR("Failed to get graphics queue"); - return false; - } - graphicsQueue = gqRet.value(); - graphicsQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::graphics).value(); + if (requestTransferQueue) { + // With custom_queue_setup, we must retrieve queues manually. + auto families = vkbPhysicalDevice_.get_queue_families(); + uint32_t gfxFamily = UINT32_MAX; + for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) { + if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { + gfxFamily = i; + break; + } + } + graphicsQueueFamily = gfxFamily; + vkGetDeviceQueue(device, gfxFamily, 0, &graphicsQueue); + vkGetDeviceQueue(device, gfxFamily, 1, &transferQueue_); + hasDedicatedTransfer_ = true; - auto pqRet = vkbDevice.get_queue(vkb::QueueType::present); - if (!pqRet) { - // Fall back to graphics queue for presentation + // Present queue: try the graphics family first (most common), otherwise + // find a family that supports presentation. 
presentQueue = graphicsQueue; - presentQueueFamily = graphicsQueueFamily; + presentQueueFamily = gfxFamily; + + LOG_INFO("Dedicated transfer queue enabled (family ", gfxFamily, ", queue index 1)"); } else { - presentQueue = pqRet.value(); - presentQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::present).value(); + // Standard path — let vkb resolve queues. + auto gqRet = vkbDevice.get_queue(vkb::QueueType::graphics); + if (!gqRet) { + LOG_ERROR("Failed to get graphics queue"); + return false; + } + graphicsQueue = gqRet.value(); + graphicsQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::graphics).value(); + + auto pqRet = vkbDevice.get_queue(vkb::QueueType::present); + if (!pqRet) { + presentQueue = graphicsQueue; + presentQueueFamily = graphicsQueueFamily; + } else { + presentQueue = pqRet.value(); + presentQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::present).value(); + } } LOG_INFO("Vulkan logical device created"); @@ -588,6 +653,19 @@ bool VkContext::createCommandPools() { return false; } + // Separate command pool for the transfer queue (same family, different queue) + if (hasDedicatedTransfer_) { + VkCommandPoolCreateInfo transferPoolInfo{}; + transferPoolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + transferPoolInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + transferPoolInfo.queueFamilyIndex = graphicsQueueFamily; + + if (vkCreateCommandPool(device, &transferPoolInfo, nullptr, &transferCommandPool_) != VK_SUCCESS) { + LOG_ERROR("Failed to create transfer command pool"); + return false; + } + } + return true; } @@ -1709,7 +1787,21 @@ void VkContext::beginUploadBatch() { uploadBatchDepth_++; if (inUploadBatch_) return; // already in a batch (nested call) inUploadBatch_ = true; - batchCmd_ = beginSingleTimeCommands(); + + // Allocate from transfer pool if available, otherwise from immCommandPool. + VkCommandPool pool = hasDedicatedTransfer_ ? 
transferCommandPool_ : immCommandPool; + + VkCommandBufferAllocateInfo allocInfo{}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.commandPool = pool; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandBufferCount = 1; + vkAllocateCommandBuffers(device, &allocInfo, &batchCmd_); + + VkCommandBufferBeginInfo beginInfo{}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + vkBeginCommandBuffer(batchCmd_, &beginInfo); } void VkContext::endUploadBatch() { @@ -1719,10 +1811,12 @@ void VkContext::endUploadBatch() { inUploadBatch_ = false; + VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool; + if (batchStagingBuffers_.empty()) { // No GPU copies were recorded — skip the submit entirely. vkEndCommandBuffer(batchCmd_); - vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_); + vkFreeCommandBuffers(device, pool, 1, &batchCmd_); batchCmd_ = VK_NULL_HANDLE; return; } @@ -1739,7 +1833,10 @@ void VkContext::endUploadBatch() { submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; submitInfo.commandBufferCount = 1; submitInfo.pCommandBuffers = &batchCmd_; - vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence); + + // Submit to the dedicated transfer queue if available, otherwise graphics. + VkQueue targetQueue = hasDedicatedTransfer_ ? transferQueue_ : graphicsQueue; + vkQueueSubmit(targetQueue, 1, &submitInfo, fence); // Stash everything for later cleanup when fence signals InFlightBatch batch; @@ -1759,15 +1856,30 @@ void VkContext::endUploadBatchSync() { inUploadBatch_ = false; + VkCommandPool pool = hasDedicatedTransfer_ ? 
transferCommandPool_ : immCommandPool; + if (batchStagingBuffers_.empty()) { vkEndCommandBuffer(batchCmd_); - vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_); + vkFreeCommandBuffers(device, pool, 1, &batchCmd_); batchCmd_ = VK_NULL_HANDLE; return; } - // Synchronous path for load screens — submit and wait - endSingleTimeCommands(batchCmd_); + // Synchronous path for load screens — submit and wait on the target queue. + VkQueue targetQueue = hasDedicatedTransfer_ ? transferQueue_ : graphicsQueue; + + vkEndCommandBuffer(batchCmd_); + + VkSubmitInfo submitInfo{}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &batchCmd_; + + vkQueueSubmit(targetQueue, 1, &submitInfo, immFence); + vkWaitForFences(device, 1, &immFence, VK_TRUE, UINT64_MAX); + vkResetFences(device, 1, &immFence); + + vkFreeCommandBuffers(device, pool, 1, &batchCmd_); batchCmd_ = VK_NULL_HANDLE; for (auto& staging : batchStagingBuffers_) { @@ -1779,6 +1891,8 @@ void VkContext::endUploadBatchSync() { void VkContext::pollUploadBatches() { if (inFlightBatches_.empty()) return; + VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool; + for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ) { VkResult result = vkGetFenceStatus(device, it->fence); if (result == VK_SUCCESS) { @@ -1786,7 +1900,7 @@ void VkContext::pollUploadBatches() { for (auto& staging : it->stagingBuffers) { destroyBuffer(allocator, staging); } - vkFreeCommandBuffers(device, immCommandPool, 1, &it->cmd); + vkFreeCommandBuffers(device, pool, 1, &it->cmd); vkDestroyFence(device, it->fence, nullptr); it = inFlightBatches_.erase(it); } else { @@ -1796,12 +1910,14 @@ void VkContext::pollUploadBatches() { } void VkContext::waitAllUploads() { + VkCommandPool pool = hasDedicatedTransfer_ ? 
transferCommandPool_ : immCommandPool; + for (auto& batch : inFlightBatches_) { vkWaitForFences(device, 1, &batch.fence, VK_TRUE, UINT64_MAX); for (auto& staging : batch.stagingBuffers) { destroyBuffer(allocator, staging); } - vkFreeCommandBuffers(device, immCommandPool, 1, &batch.cmd); + vkFreeCommandBuffers(device, pool, 1, &batch.cmd); vkDestroyFence(device, batch.fence, nullptr); } inFlightBatches_.clear();