perf: use second GPU queue for parallel texture/buffer uploads

Request 2 queues from the graphics family when available (NVIDIA
exposes 16, AMD 2+). Upload batches now submit to queue[1] while
rendering uses queue[0], enabling parallel GPU transfers without
queue-family ownership transfer barriers (same family).

Falls back to the single-queue path on GPUs whose graphics family exposes
only 1 queue. Upload command buffers come from a dedicated transfer command
pool so they never contend with the immediate-submit pool.
This commit is contained in:
Kelsi 2026-03-24 14:09:16 -07:00
parent ed0cb0ad25
commit 1dd3823013
2 changed files with 144 additions and 21 deletions

View file

@ -78,6 +78,7 @@ public:
// True when the active GPU reports NVIDIA's PCI vendor ID (0x10DE).
bool isNvidiaGpu() const { return gpuVendorId_ == 0x10DE; }
// Queue used for rendering submissions (queue index 0 of the graphics family).
VkQueue getGraphicsQueue() const { return graphicsQueue; }
// Index of the queue family that graphicsQueue belongs to.
uint32_t getGraphicsQueueFamily() const { return graphicsQueueFamily; }
// True when a second queue from the graphics family was obtained for uploads.
bool hasDedicatedTransferQueue() const { return hasDedicatedTransfer_; }
// VMA allocator handle used for buffer/image allocations.
VmaAllocator getAllocator() const { return allocator; }
// Window surface the swapchain presents to.
VkSurfaceKHR getSurface() const { return surface; }
// Pipeline cache handle (persisted to disk on shutdown via savePipelineCache).
VkPipelineCache getPipelineCache() const { return pipelineCache_; }
@ -175,6 +176,12 @@ private:
uint32_t graphicsQueueFamily = 0;
uint32_t presentQueueFamily = 0;
// Dedicated transfer queue: second queue (index 1) from the same graphics
// family, so uploads need no queue-family ownership-transfer barriers.
VkQueue transferQueue_ = VK_NULL_HANDLE;
// Command pool used exclusively for transfer-queue command buffers, kept
// separate from the immediate-submit pool to avoid contention.
VkCommandPool transferCommandPool_ = VK_NULL_HANDLE;
// Set when the graphics family exposes >= 2 queues and queue 1 was retrieved.
bool hasDedicatedTransfer_ = false;
uint32_t graphicsQueueFamilyQueueCount_ = 1; // queried in selectPhysicalDevice
// Swapchain
VkSwapchainKHR swapchain = VK_NULL_HANDLE;
VkFormat swapchainFormat = VK_FORMAT_UNDEFINED;

View file

@ -135,6 +135,7 @@ void VkContext::shutdown() {
if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; }
if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; }
if (transferCommandPool_) { vkDestroyCommandPool(device, transferCommandPool_, nullptr); transferCommandPool_ = VK_NULL_HANDLE; }
// Persist pipeline cache to disk before tearing down the device.
savePipelineCache();
@ -328,11 +329,52 @@ bool VkContext::selectPhysicalDevice() {
VK_VERSION_MINOR(props.apiVersion), ".", VK_VERSION_PATCH(props.apiVersion));
LOG_INFO("Depth resolve support: ", depthResolveSupported_ ? "YES" : "NO");
// Probe queue families to see if the graphics family supports multiple queues
// (used in createLogicalDevice to request a second queue for parallel uploads).
auto queueFamilies = vkbPhysicalDevice_.get_queue_families();
for (uint32_t i = 0; i < static_cast<uint32_t>(queueFamilies.size()); i++) {
if (queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
graphicsQueueFamilyQueueCount_ = queueFamilies[i].queueCount;
LOG_INFO("Graphics queue family ", i, " supports ", graphicsQueueFamilyQueueCount_, " queue(s)");
break;
}
}
return true;
}
bool VkContext::createLogicalDevice() {
vkb::DeviceBuilder deviceBuilder{vkbPhysicalDevice_};
// If the graphics queue family supports >= 2 queues, request a second one
// for parallel texture/buffer uploads. Both queues share the same family
// so no queue-ownership-transfer barriers are needed.
const bool requestTransferQueue = (graphicsQueueFamilyQueueCount_ >= 2);
if (requestTransferQueue) {
// Build a custom queue description list: 2 queues from the graphics
// family, 1 queue from every other family (so present etc. still work).
auto families = vkbPhysicalDevice_.get_queue_families();
uint32_t gfxFamily = UINT32_MAX;
for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) {
if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
gfxFamily = i;
break;
}
}
std::vector<vkb::CustomQueueDescription> queueDescs;
for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) {
if (i == gfxFamily) {
// Request 2 queues: [0] graphics, [1] transfer uploads
queueDescs.emplace_back(i, std::vector<float>{1.0f, 1.0f});
} else {
queueDescs.emplace_back(i, std::vector<float>{1.0f});
}
}
deviceBuilder.custom_queue_setup(queueDescs);
}
auto devRet = deviceBuilder.build();
if (!devRet) {
LOG_ERROR("Failed to create Vulkan logical device: ", devRet.error().message());
@ -342,22 +384,45 @@ bool VkContext::createLogicalDevice() {
auto vkbDevice = devRet.value();
device = vkbDevice.device;
auto gqRet = vkbDevice.get_queue(vkb::QueueType::graphics);
if (!gqRet) {
LOG_ERROR("Failed to get graphics queue");
return false;
}
graphicsQueue = gqRet.value();
graphicsQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::graphics).value();
if (requestTransferQueue) {
// With custom_queue_setup, we must retrieve queues manually.
auto families = vkbPhysicalDevice_.get_queue_families();
uint32_t gfxFamily = UINT32_MAX;
for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) {
if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
gfxFamily = i;
break;
}
}
graphicsQueueFamily = gfxFamily;
vkGetDeviceQueue(device, gfxFamily, 0, &graphicsQueue);
vkGetDeviceQueue(device, gfxFamily, 1, &transferQueue_);
hasDedicatedTransfer_ = true;
auto pqRet = vkbDevice.get_queue(vkb::QueueType::present);
if (!pqRet) {
// Fall back to graphics queue for presentation
// Present queue: try the graphics family first (most common), otherwise
// find a family that supports presentation.
presentQueue = graphicsQueue;
presentQueueFamily = graphicsQueueFamily;
presentQueueFamily = gfxFamily;
LOG_INFO("Dedicated transfer queue enabled (family ", gfxFamily, ", queue index 1)");
} else {
presentQueue = pqRet.value();
presentQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::present).value();
// Standard path — let vkb resolve queues.
auto gqRet = vkbDevice.get_queue(vkb::QueueType::graphics);
if (!gqRet) {
LOG_ERROR("Failed to get graphics queue");
return false;
}
graphicsQueue = gqRet.value();
graphicsQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::graphics).value();
auto pqRet = vkbDevice.get_queue(vkb::QueueType::present);
if (!pqRet) {
presentQueue = graphicsQueue;
presentQueueFamily = graphicsQueueFamily;
} else {
presentQueue = pqRet.value();
presentQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::present).value();
}
}
LOG_INFO("Vulkan logical device created");
@ -588,6 +653,19 @@ bool VkContext::createCommandPools() {
return false;
}
// Separate command pool for the transfer queue (same family, different queue)
if (hasDedicatedTransfer_) {
VkCommandPoolCreateInfo transferPoolInfo{};
transferPoolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
transferPoolInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
transferPoolInfo.queueFamilyIndex = graphicsQueueFamily;
if (vkCreateCommandPool(device, &transferPoolInfo, nullptr, &transferCommandPool_) != VK_SUCCESS) {
LOG_ERROR("Failed to create transfer command pool");
return false;
}
}
return true;
}
@ -1709,7 +1787,21 @@ void VkContext::beginUploadBatch() {
uploadBatchDepth_++;
if (inUploadBatch_) return; // already in a batch (nested call)
inUploadBatch_ = true;
batchCmd_ = beginSingleTimeCommands();
// Allocate from transfer pool if available, otherwise from immCommandPool.
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
VkCommandBufferAllocateInfo allocInfo{};
allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
allocInfo.commandPool = pool;
allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
allocInfo.commandBufferCount = 1;
vkAllocateCommandBuffers(device, &allocInfo, &batchCmd_);
VkCommandBufferBeginInfo beginInfo{};
beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
vkBeginCommandBuffer(batchCmd_, &beginInfo);
}
void VkContext::endUploadBatch() {
@ -1719,10 +1811,12 @@ void VkContext::endUploadBatch() {
inUploadBatch_ = false;
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
if (batchStagingBuffers_.empty()) {
// No GPU copies were recorded — skip the submit entirely.
vkEndCommandBuffer(batchCmd_);
vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
vkFreeCommandBuffers(device, pool, 1, &batchCmd_);
batchCmd_ = VK_NULL_HANDLE;
return;
}
@ -1739,7 +1833,10 @@ void VkContext::endUploadBatch() {
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &batchCmd_;
vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence);
// Submit to the dedicated transfer queue if available, otherwise graphics.
VkQueue targetQueue = hasDedicatedTransfer_ ? transferQueue_ : graphicsQueue;
vkQueueSubmit(targetQueue, 1, &submitInfo, fence);
// Stash everything for later cleanup when fence signals
InFlightBatch batch;
@ -1759,15 +1856,30 @@ void VkContext::endUploadBatchSync() {
inUploadBatch_ = false;
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
if (batchStagingBuffers_.empty()) {
vkEndCommandBuffer(batchCmd_);
vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
vkFreeCommandBuffers(device, pool, 1, &batchCmd_);
batchCmd_ = VK_NULL_HANDLE;
return;
}
// Synchronous path for load screens — submit and wait
endSingleTimeCommands(batchCmd_);
// Synchronous path for load screens — submit and wait on the target queue.
VkQueue targetQueue = hasDedicatedTransfer_ ? transferQueue_ : graphicsQueue;
vkEndCommandBuffer(batchCmd_);
VkSubmitInfo submitInfo{};
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &batchCmd_;
vkQueueSubmit(targetQueue, 1, &submitInfo, immFence);
vkWaitForFences(device, 1, &immFence, VK_TRUE, UINT64_MAX);
vkResetFences(device, 1, &immFence);
vkFreeCommandBuffers(device, pool, 1, &batchCmd_);
batchCmd_ = VK_NULL_HANDLE;
for (auto& staging : batchStagingBuffers_) {
@ -1779,6 +1891,8 @@ void VkContext::endUploadBatchSync() {
void VkContext::pollUploadBatches() {
if (inFlightBatches_.empty()) return;
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ) {
VkResult result = vkGetFenceStatus(device, it->fence);
if (result == VK_SUCCESS) {
@ -1786,7 +1900,7 @@ void VkContext::pollUploadBatches() {
for (auto& staging : it->stagingBuffers) {
destroyBuffer(allocator, staging);
}
vkFreeCommandBuffers(device, immCommandPool, 1, &it->cmd);
vkFreeCommandBuffers(device, pool, 1, &it->cmd);
vkDestroyFence(device, it->fence, nullptr);
it = inFlightBatches_.erase(it);
} else {
@ -1796,12 +1910,14 @@ void VkContext::pollUploadBatches() {
}
void VkContext::waitAllUploads() {
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
for (auto& batch : inFlightBatches_) {
vkWaitForFences(device, 1, &batch.fence, VK_TRUE, UINT64_MAX);
for (auto& staging : batch.stagingBuffers) {
destroyBuffer(allocator, staging);
}
vkFreeCommandBuffers(device, immCommandPool, 1, &batch.cmd);
vkFreeCommandBuffers(device, pool, 1, &batch.cmd);
vkDestroyFence(device, batch.fence, nullptr);
}
inFlightBatches_.clear();