mirror of
https://github.com/Kelsidavis/WoWee.git
synced 2026-03-25 16:30:15 +00:00
perf: use second GPU queue for parallel texture/buffer uploads
Request 2 queues from the graphics family when available (NVIDIA exposes 16, AMD 2+). Upload batches now submit to queue[1] while rendering uses queue[0], enabling parallel GPU transfers without queue-family ownership transfer barriers (same family). Falls back to single-queue path on GPUs with only 1 queue in the graphics family. Transfer command pool is separate to avoid contention.
This commit is contained in:
parent
ed0cb0ad25
commit
1dd3823013
2 changed files with 144 additions and 21 deletions
|
|
@ -78,6 +78,7 @@ public:
|
|||
bool isNvidiaGpu() const { return gpuVendorId_ == 0x10DE; }
|
||||
VkQueue getGraphicsQueue() const { return graphicsQueue; }
|
||||
uint32_t getGraphicsQueueFamily() const { return graphicsQueueFamily; }
|
||||
bool hasDedicatedTransferQueue() const { return hasDedicatedTransfer_; }
|
||||
VmaAllocator getAllocator() const { return allocator; }
|
||||
VkSurfaceKHR getSurface() const { return surface; }
|
||||
VkPipelineCache getPipelineCache() const { return pipelineCache_; }
|
||||
|
|
@ -175,6 +176,12 @@ private:
|
|||
uint32_t graphicsQueueFamily = 0;
|
||||
uint32_t presentQueueFamily = 0;
|
||||
|
||||
// Dedicated transfer queue (second queue from same graphics family)
|
||||
VkQueue transferQueue_ = VK_NULL_HANDLE;
|
||||
VkCommandPool transferCommandPool_ = VK_NULL_HANDLE;
|
||||
bool hasDedicatedTransfer_ = false;
|
||||
uint32_t graphicsQueueFamilyQueueCount_ = 1; // queried in selectPhysicalDevice
|
||||
|
||||
// Swapchain
|
||||
VkSwapchainKHR swapchain = VK_NULL_HANDLE;
|
||||
VkFormat swapchainFormat = VK_FORMAT_UNDEFINED;
|
||||
|
|
|
|||
|
|
@ -135,6 +135,7 @@ void VkContext::shutdown() {
|
|||
|
||||
if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; }
|
||||
if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; }
|
||||
if (transferCommandPool_) { vkDestroyCommandPool(device, transferCommandPool_, nullptr); transferCommandPool_ = VK_NULL_HANDLE; }
|
||||
|
||||
// Persist pipeline cache to disk before tearing down the device.
|
||||
savePipelineCache();
|
||||
|
|
@ -328,11 +329,52 @@ bool VkContext::selectPhysicalDevice() {
|
|||
VK_VERSION_MINOR(props.apiVersion), ".", VK_VERSION_PATCH(props.apiVersion));
|
||||
LOG_INFO("Depth resolve support: ", depthResolveSupported_ ? "YES" : "NO");
|
||||
|
||||
// Probe queue families to see if the graphics family supports multiple queues
|
||||
// (used in createLogicalDevice to request a second queue for parallel uploads).
|
||||
auto queueFamilies = vkbPhysicalDevice_.get_queue_families();
|
||||
for (uint32_t i = 0; i < static_cast<uint32_t>(queueFamilies.size()); i++) {
|
||||
if (queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
|
||||
graphicsQueueFamilyQueueCount_ = queueFamilies[i].queueCount;
|
||||
LOG_INFO("Graphics queue family ", i, " supports ", graphicsQueueFamilyQueueCount_, " queue(s)");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool VkContext::createLogicalDevice() {
|
||||
vkb::DeviceBuilder deviceBuilder{vkbPhysicalDevice_};
|
||||
|
||||
// If the graphics queue family supports >= 2 queues, request a second one
|
||||
// for parallel texture/buffer uploads. Both queues share the same family
|
||||
// so no queue-ownership-transfer barriers are needed.
|
||||
const bool requestTransferQueue = (graphicsQueueFamilyQueueCount_ >= 2);
|
||||
|
||||
if (requestTransferQueue) {
|
||||
// Build a custom queue description list: 2 queues from the graphics
|
||||
// family, 1 queue from every other family (so present etc. still work).
|
||||
auto families = vkbPhysicalDevice_.get_queue_families();
|
||||
uint32_t gfxFamily = UINT32_MAX;
|
||||
for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) {
|
||||
if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
|
||||
gfxFamily = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<vkb::CustomQueueDescription> queueDescs;
|
||||
for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) {
|
||||
if (i == gfxFamily) {
|
||||
// Request 2 queues: [0] graphics, [1] transfer uploads
|
||||
queueDescs.emplace_back(i, std::vector<float>{1.0f, 1.0f});
|
||||
} else {
|
||||
queueDescs.emplace_back(i, std::vector<float>{1.0f});
|
||||
}
|
||||
}
|
||||
deviceBuilder.custom_queue_setup(queueDescs);
|
||||
}
|
||||
|
||||
auto devRet = deviceBuilder.build();
|
||||
if (!devRet) {
|
||||
LOG_ERROR("Failed to create Vulkan logical device: ", devRet.error().message());
|
||||
|
|
@ -342,22 +384,45 @@ bool VkContext::createLogicalDevice() {
|
|||
auto vkbDevice = devRet.value();
|
||||
device = vkbDevice.device;
|
||||
|
||||
auto gqRet = vkbDevice.get_queue(vkb::QueueType::graphics);
|
||||
if (!gqRet) {
|
||||
LOG_ERROR("Failed to get graphics queue");
|
||||
return false;
|
||||
}
|
||||
graphicsQueue = gqRet.value();
|
||||
graphicsQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::graphics).value();
|
||||
if (requestTransferQueue) {
|
||||
// With custom_queue_setup, we must retrieve queues manually.
|
||||
auto families = vkbPhysicalDevice_.get_queue_families();
|
||||
uint32_t gfxFamily = UINT32_MAX;
|
||||
for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) {
|
||||
if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
|
||||
gfxFamily = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
graphicsQueueFamily = gfxFamily;
|
||||
vkGetDeviceQueue(device, gfxFamily, 0, &graphicsQueue);
|
||||
vkGetDeviceQueue(device, gfxFamily, 1, &transferQueue_);
|
||||
hasDedicatedTransfer_ = true;
|
||||
|
||||
auto pqRet = vkbDevice.get_queue(vkb::QueueType::present);
|
||||
if (!pqRet) {
|
||||
// Fall back to graphics queue for presentation
|
||||
// Present queue: try the graphics family first (most common), otherwise
|
||||
// find a family that supports presentation.
|
||||
presentQueue = graphicsQueue;
|
||||
presentQueueFamily = graphicsQueueFamily;
|
||||
presentQueueFamily = gfxFamily;
|
||||
|
||||
LOG_INFO("Dedicated transfer queue enabled (family ", gfxFamily, ", queue index 1)");
|
||||
} else {
|
||||
presentQueue = pqRet.value();
|
||||
presentQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::present).value();
|
||||
// Standard path — let vkb resolve queues.
|
||||
auto gqRet = vkbDevice.get_queue(vkb::QueueType::graphics);
|
||||
if (!gqRet) {
|
||||
LOG_ERROR("Failed to get graphics queue");
|
||||
return false;
|
||||
}
|
||||
graphicsQueue = gqRet.value();
|
||||
graphicsQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::graphics).value();
|
||||
|
||||
auto pqRet = vkbDevice.get_queue(vkb::QueueType::present);
|
||||
if (!pqRet) {
|
||||
presentQueue = graphicsQueue;
|
||||
presentQueueFamily = graphicsQueueFamily;
|
||||
} else {
|
||||
presentQueue = pqRet.value();
|
||||
presentQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::present).value();
|
||||
}
|
||||
}
|
||||
|
||||
LOG_INFO("Vulkan logical device created");
|
||||
|
|
@ -588,6 +653,19 @@ bool VkContext::createCommandPools() {
|
|||
return false;
|
||||
}
|
||||
|
||||
// Separate command pool for the transfer queue (same family, different queue)
|
||||
if (hasDedicatedTransfer_) {
|
||||
VkCommandPoolCreateInfo transferPoolInfo{};
|
||||
transferPoolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
|
||||
transferPoolInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
|
||||
transferPoolInfo.queueFamilyIndex = graphicsQueueFamily;
|
||||
|
||||
if (vkCreateCommandPool(device, &transferPoolInfo, nullptr, &transferCommandPool_) != VK_SUCCESS) {
|
||||
LOG_ERROR("Failed to create transfer command pool");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -1709,7 +1787,21 @@ void VkContext::beginUploadBatch() {
|
|||
uploadBatchDepth_++;
|
||||
if (inUploadBatch_) return; // already in a batch (nested call)
|
||||
inUploadBatch_ = true;
|
||||
batchCmd_ = beginSingleTimeCommands();
|
||||
|
||||
// Allocate from transfer pool if available, otherwise from immCommandPool.
|
||||
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
|
||||
|
||||
VkCommandBufferAllocateInfo allocInfo{};
|
||||
allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
|
||||
allocInfo.commandPool = pool;
|
||||
allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
|
||||
allocInfo.commandBufferCount = 1;
|
||||
vkAllocateCommandBuffers(device, &allocInfo, &batchCmd_);
|
||||
|
||||
VkCommandBufferBeginInfo beginInfo{};
|
||||
beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
|
||||
beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
|
||||
vkBeginCommandBuffer(batchCmd_, &beginInfo);
|
||||
}
|
||||
|
||||
void VkContext::endUploadBatch() {
|
||||
|
|
@ -1719,10 +1811,12 @@ void VkContext::endUploadBatch() {
|
|||
|
||||
inUploadBatch_ = false;
|
||||
|
||||
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
|
||||
|
||||
if (batchStagingBuffers_.empty()) {
|
||||
// No GPU copies were recorded — skip the submit entirely.
|
||||
vkEndCommandBuffer(batchCmd_);
|
||||
vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
|
||||
vkFreeCommandBuffers(device, pool, 1, &batchCmd_);
|
||||
batchCmd_ = VK_NULL_HANDLE;
|
||||
return;
|
||||
}
|
||||
|
|
@ -1739,7 +1833,10 @@ void VkContext::endUploadBatch() {
|
|||
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
|
||||
submitInfo.commandBufferCount = 1;
|
||||
submitInfo.pCommandBuffers = &batchCmd_;
|
||||
vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence);
|
||||
|
||||
// Submit to the dedicated transfer queue if available, otherwise graphics.
|
||||
VkQueue targetQueue = hasDedicatedTransfer_ ? transferQueue_ : graphicsQueue;
|
||||
vkQueueSubmit(targetQueue, 1, &submitInfo, fence);
|
||||
|
||||
// Stash everything for later cleanup when fence signals
|
||||
InFlightBatch batch;
|
||||
|
|
@ -1759,15 +1856,30 @@ void VkContext::endUploadBatchSync() {
|
|||
|
||||
inUploadBatch_ = false;
|
||||
|
||||
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
|
||||
|
||||
if (batchStagingBuffers_.empty()) {
|
||||
vkEndCommandBuffer(batchCmd_);
|
||||
vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
|
||||
vkFreeCommandBuffers(device, pool, 1, &batchCmd_);
|
||||
batchCmd_ = VK_NULL_HANDLE;
|
||||
return;
|
||||
}
|
||||
|
||||
// Synchronous path for load screens — submit and wait
|
||||
endSingleTimeCommands(batchCmd_);
|
||||
// Synchronous path for load screens — submit and wait on the target queue.
|
||||
VkQueue targetQueue = hasDedicatedTransfer_ ? transferQueue_ : graphicsQueue;
|
||||
|
||||
vkEndCommandBuffer(batchCmd_);
|
||||
|
||||
VkSubmitInfo submitInfo{};
|
||||
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
|
||||
submitInfo.commandBufferCount = 1;
|
||||
submitInfo.pCommandBuffers = &batchCmd_;
|
||||
|
||||
vkQueueSubmit(targetQueue, 1, &submitInfo, immFence);
|
||||
vkWaitForFences(device, 1, &immFence, VK_TRUE, UINT64_MAX);
|
||||
vkResetFences(device, 1, &immFence);
|
||||
|
||||
vkFreeCommandBuffers(device, pool, 1, &batchCmd_);
|
||||
batchCmd_ = VK_NULL_HANDLE;
|
||||
|
||||
for (auto& staging : batchStagingBuffers_) {
|
||||
|
|
@ -1779,6 +1891,8 @@ void VkContext::endUploadBatchSync() {
|
|||
void VkContext::pollUploadBatches() {
|
||||
if (inFlightBatches_.empty()) return;
|
||||
|
||||
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
|
||||
|
||||
for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ) {
|
||||
VkResult result = vkGetFenceStatus(device, it->fence);
|
||||
if (result == VK_SUCCESS) {
|
||||
|
|
@ -1786,7 +1900,7 @@ void VkContext::pollUploadBatches() {
|
|||
for (auto& staging : it->stagingBuffers) {
|
||||
destroyBuffer(allocator, staging);
|
||||
}
|
||||
vkFreeCommandBuffers(device, immCommandPool, 1, &it->cmd);
|
||||
vkFreeCommandBuffers(device, pool, 1, &it->cmd);
|
||||
vkDestroyFence(device, it->fence, nullptr);
|
||||
it = inFlightBatches_.erase(it);
|
||||
} else {
|
||||
|
|
@ -1796,12 +1910,14 @@ void VkContext::pollUploadBatches() {
|
|||
}
|
||||
|
||||
void VkContext::waitAllUploads() {
|
||||
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
|
||||
|
||||
for (auto& batch : inFlightBatches_) {
|
||||
vkWaitForFences(device, 1, &batch.fence, VK_TRUE, UINT64_MAX);
|
||||
for (auto& staging : batch.stagingBuffers) {
|
||||
destroyBuffer(allocator, staging);
|
||||
}
|
||||
vkFreeCommandBuffers(device, immCommandPool, 1, &batch.cmd);
|
||||
vkFreeCommandBuffers(device, pool, 1, &batch.cmd);
|
||||
vkDestroyFence(device, batch.fence, nullptr);
|
||||
}
|
||||
inFlightBatches_.clear();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue