mirror of
https://github.com/Kelsidavis/WoWee.git
synced 2026-04-17 17:43:52 +00:00
perf: use second GPU queue for parallel texture/buffer uploads
Request two queues from the graphics queue family when available (NVIDIA exposes 16, AMD 2+). Upload batches now submit to queue[1] while rendering uses queue[0], enabling parallel GPU transfers without queue-family ownership-transfer barriers (both queues belong to the same family). Falls back to the single-queue path on GPUs that expose only one queue in the graphics family. The transfer command pool is separate from the immediate command pool to avoid contention.
This commit is contained in:
parent
ed0cb0ad25
commit
1dd3823013
2 changed files with 144 additions and 21 deletions
|
|
@ -78,6 +78,7 @@ public:
|
||||||
bool isNvidiaGpu() const { return gpuVendorId_ == 0x10DE; }
|
bool isNvidiaGpu() const { return gpuVendorId_ == 0x10DE; }
|
||||||
VkQueue getGraphicsQueue() const { return graphicsQueue; }
|
VkQueue getGraphicsQueue() const { return graphicsQueue; }
|
||||||
uint32_t getGraphicsQueueFamily() const { return graphicsQueueFamily; }
|
uint32_t getGraphicsQueueFamily() const { return graphicsQueueFamily; }
|
||||||
|
bool hasDedicatedTransferQueue() const { return hasDedicatedTransfer_; }
|
||||||
VmaAllocator getAllocator() const { return allocator; }
|
VmaAllocator getAllocator() const { return allocator; }
|
||||||
VkSurfaceKHR getSurface() const { return surface; }
|
VkSurfaceKHR getSurface() const { return surface; }
|
||||||
VkPipelineCache getPipelineCache() const { return pipelineCache_; }
|
VkPipelineCache getPipelineCache() const { return pipelineCache_; }
|
||||||
|
|
@ -175,6 +176,12 @@ private:
|
||||||
uint32_t graphicsQueueFamily = 0;
|
uint32_t graphicsQueueFamily = 0;
|
||||||
uint32_t presentQueueFamily = 0;
|
uint32_t presentQueueFamily = 0;
|
||||||
|
|
||||||
|
// Dedicated transfer queue (second queue from same graphics family)
|
||||||
|
VkQueue transferQueue_ = VK_NULL_HANDLE;
|
||||||
|
VkCommandPool transferCommandPool_ = VK_NULL_HANDLE;
|
||||||
|
bool hasDedicatedTransfer_ = false;
|
||||||
|
uint32_t graphicsQueueFamilyQueueCount_ = 1; // queried in selectPhysicalDevice
|
||||||
|
|
||||||
// Swapchain
|
// Swapchain
|
||||||
VkSwapchainKHR swapchain = VK_NULL_HANDLE;
|
VkSwapchainKHR swapchain = VK_NULL_HANDLE;
|
||||||
VkFormat swapchainFormat = VK_FORMAT_UNDEFINED;
|
VkFormat swapchainFormat = VK_FORMAT_UNDEFINED;
|
||||||
|
|
|
||||||
|
|
@ -135,6 +135,7 @@ void VkContext::shutdown() {
|
||||||
|
|
||||||
if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; }
|
if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; }
|
||||||
if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; }
|
if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; }
|
||||||
|
if (transferCommandPool_) { vkDestroyCommandPool(device, transferCommandPool_, nullptr); transferCommandPool_ = VK_NULL_HANDLE; }
|
||||||
|
|
||||||
// Persist pipeline cache to disk before tearing down the device.
|
// Persist pipeline cache to disk before tearing down the device.
|
||||||
savePipelineCache();
|
savePipelineCache();
|
||||||
|
|
@ -328,11 +329,52 @@ bool VkContext::selectPhysicalDevice() {
|
||||||
VK_VERSION_MINOR(props.apiVersion), ".", VK_VERSION_PATCH(props.apiVersion));
|
VK_VERSION_MINOR(props.apiVersion), ".", VK_VERSION_PATCH(props.apiVersion));
|
||||||
LOG_INFO("Depth resolve support: ", depthResolveSupported_ ? "YES" : "NO");
|
LOG_INFO("Depth resolve support: ", depthResolveSupported_ ? "YES" : "NO");
|
||||||
|
|
||||||
|
// Probe queue families to see if the graphics family supports multiple queues
|
||||||
|
// (used in createLogicalDevice to request a second queue for parallel uploads).
|
||||||
|
auto queueFamilies = vkbPhysicalDevice_.get_queue_families();
|
||||||
|
for (uint32_t i = 0; i < static_cast<uint32_t>(queueFamilies.size()); i++) {
|
||||||
|
if (queueFamilies[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
|
||||||
|
graphicsQueueFamilyQueueCount_ = queueFamilies[i].queueCount;
|
||||||
|
LOG_INFO("Graphics queue family ", i, " supports ", graphicsQueueFamilyQueueCount_, " queue(s)");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool VkContext::createLogicalDevice() {
|
bool VkContext::createLogicalDevice() {
|
||||||
vkb::DeviceBuilder deviceBuilder{vkbPhysicalDevice_};
|
vkb::DeviceBuilder deviceBuilder{vkbPhysicalDevice_};
|
||||||
|
|
||||||
|
// If the graphics queue family supports >= 2 queues, request a second one
|
||||||
|
// for parallel texture/buffer uploads. Both queues share the same family
|
||||||
|
// so no queue-ownership-transfer barriers are needed.
|
||||||
|
const bool requestTransferQueue = (graphicsQueueFamilyQueueCount_ >= 2);
|
||||||
|
|
||||||
|
if (requestTransferQueue) {
|
||||||
|
// Build a custom queue description list: 2 queues from the graphics
|
||||||
|
// family, 1 queue from every other family (so present etc. still work).
|
||||||
|
auto families = vkbPhysicalDevice_.get_queue_families();
|
||||||
|
uint32_t gfxFamily = UINT32_MAX;
|
||||||
|
for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) {
|
||||||
|
if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
|
||||||
|
gfxFamily = i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<vkb::CustomQueueDescription> queueDescs;
|
||||||
|
for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) {
|
||||||
|
if (i == gfxFamily) {
|
||||||
|
// Request 2 queues: [0] graphics, [1] transfer uploads
|
||||||
|
queueDescs.emplace_back(i, std::vector<float>{1.0f, 1.0f});
|
||||||
|
} else {
|
||||||
|
queueDescs.emplace_back(i, std::vector<float>{1.0f});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
deviceBuilder.custom_queue_setup(queueDescs);
|
||||||
|
}
|
||||||
|
|
||||||
auto devRet = deviceBuilder.build();
|
auto devRet = deviceBuilder.build();
|
||||||
if (!devRet) {
|
if (!devRet) {
|
||||||
LOG_ERROR("Failed to create Vulkan logical device: ", devRet.error().message());
|
LOG_ERROR("Failed to create Vulkan logical device: ", devRet.error().message());
|
||||||
|
|
@ -342,22 +384,45 @@ bool VkContext::createLogicalDevice() {
|
||||||
auto vkbDevice = devRet.value();
|
auto vkbDevice = devRet.value();
|
||||||
device = vkbDevice.device;
|
device = vkbDevice.device;
|
||||||
|
|
||||||
auto gqRet = vkbDevice.get_queue(vkb::QueueType::graphics);
|
if (requestTransferQueue) {
|
||||||
if (!gqRet) {
|
// With custom_queue_setup, we must retrieve queues manually.
|
||||||
LOG_ERROR("Failed to get graphics queue");
|
auto families = vkbPhysicalDevice_.get_queue_families();
|
||||||
return false;
|
uint32_t gfxFamily = UINT32_MAX;
|
||||||
}
|
for (uint32_t i = 0; i < static_cast<uint32_t>(families.size()); i++) {
|
||||||
graphicsQueue = gqRet.value();
|
if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
|
||||||
graphicsQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::graphics).value();
|
gfxFamily = i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
graphicsQueueFamily = gfxFamily;
|
||||||
|
vkGetDeviceQueue(device, gfxFamily, 0, &graphicsQueue);
|
||||||
|
vkGetDeviceQueue(device, gfxFamily, 1, &transferQueue_);
|
||||||
|
hasDedicatedTransfer_ = true;
|
||||||
|
|
||||||
auto pqRet = vkbDevice.get_queue(vkb::QueueType::present);
|
// Present queue: reuse the graphics queue. NOTE(review): no fallback search is
|
||||||
if (!pqRet) {
|
// done here — this assumes the graphics family supports presentation; confirm.
|
||||||
// Fall back to graphics queue for presentation
|
|
||||||
presentQueue = graphicsQueue;
|
presentQueue = graphicsQueue;
|
||||||
presentQueueFamily = graphicsQueueFamily;
|
presentQueueFamily = gfxFamily;
|
||||||
|
|
||||||
|
LOG_INFO("Dedicated transfer queue enabled (family ", gfxFamily, ", queue index 1)");
|
||||||
} else {
|
} else {
|
||||||
presentQueue = pqRet.value();
|
// Standard path — let vkb resolve queues.
|
||||||
presentQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::present).value();
|
auto gqRet = vkbDevice.get_queue(vkb::QueueType::graphics);
|
||||||
|
if (!gqRet) {
|
||||||
|
LOG_ERROR("Failed to get graphics queue");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
graphicsQueue = gqRet.value();
|
||||||
|
graphicsQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::graphics).value();
|
||||||
|
|
||||||
|
auto pqRet = vkbDevice.get_queue(vkb::QueueType::present);
|
||||||
|
if (!pqRet) {
|
||||||
|
presentQueue = graphicsQueue;
|
||||||
|
presentQueueFamily = graphicsQueueFamily;
|
||||||
|
} else {
|
||||||
|
presentQueue = pqRet.value();
|
||||||
|
presentQueueFamily = vkbDevice.get_queue_index(vkb::QueueType::present).value();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_INFO("Vulkan logical device created");
|
LOG_INFO("Vulkan logical device created");
|
||||||
|
|
@ -588,6 +653,19 @@ bool VkContext::createCommandPools() {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Separate command pool for the transfer queue (same family, different queue)
|
||||||
|
if (hasDedicatedTransfer_) {
|
||||||
|
VkCommandPoolCreateInfo transferPoolInfo{};
|
||||||
|
transferPoolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
|
||||||
|
transferPoolInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
|
||||||
|
transferPoolInfo.queueFamilyIndex = graphicsQueueFamily;
|
||||||
|
|
||||||
|
if (vkCreateCommandPool(device, &transferPoolInfo, nullptr, &transferCommandPool_) != VK_SUCCESS) {
|
||||||
|
LOG_ERROR("Failed to create transfer command pool");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1709,7 +1787,21 @@ void VkContext::beginUploadBatch() {
|
||||||
uploadBatchDepth_++;
|
uploadBatchDepth_++;
|
||||||
if (inUploadBatch_) return; // already in a batch (nested call)
|
if (inUploadBatch_) return; // already in a batch (nested call)
|
||||||
inUploadBatch_ = true;
|
inUploadBatch_ = true;
|
||||||
batchCmd_ = beginSingleTimeCommands();
|
|
||||||
|
// Allocate from transfer pool if available, otherwise from immCommandPool.
|
||||||
|
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
|
||||||
|
|
||||||
|
VkCommandBufferAllocateInfo allocInfo{};
|
||||||
|
allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
|
||||||
|
allocInfo.commandPool = pool;
|
||||||
|
allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
|
||||||
|
allocInfo.commandBufferCount = 1;
|
||||||
|
vkAllocateCommandBuffers(device, &allocInfo, &batchCmd_);
|
||||||
|
|
||||||
|
VkCommandBufferBeginInfo beginInfo{};
|
||||||
|
beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
|
||||||
|
beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
|
||||||
|
vkBeginCommandBuffer(batchCmd_, &beginInfo);
|
||||||
}
|
}
|
||||||
|
|
||||||
void VkContext::endUploadBatch() {
|
void VkContext::endUploadBatch() {
|
||||||
|
|
@ -1719,10 +1811,12 @@ void VkContext::endUploadBatch() {
|
||||||
|
|
||||||
inUploadBatch_ = false;
|
inUploadBatch_ = false;
|
||||||
|
|
||||||
|
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
|
||||||
|
|
||||||
if (batchStagingBuffers_.empty()) {
|
if (batchStagingBuffers_.empty()) {
|
||||||
// No GPU copies were recorded — skip the submit entirely.
|
// No GPU copies were recorded — skip the submit entirely.
|
||||||
vkEndCommandBuffer(batchCmd_);
|
vkEndCommandBuffer(batchCmd_);
|
||||||
vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
|
vkFreeCommandBuffers(device, pool, 1, &batchCmd_);
|
||||||
batchCmd_ = VK_NULL_HANDLE;
|
batchCmd_ = VK_NULL_HANDLE;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
@ -1739,7 +1833,10 @@ void VkContext::endUploadBatch() {
|
||||||
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
|
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
|
||||||
submitInfo.commandBufferCount = 1;
|
submitInfo.commandBufferCount = 1;
|
||||||
submitInfo.pCommandBuffers = &batchCmd_;
|
submitInfo.pCommandBuffers = &batchCmd_;
|
||||||
vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence);
|
|
||||||
|
// Submit to the dedicated transfer queue if available, otherwise graphics.
|
||||||
|
VkQueue targetQueue = hasDedicatedTransfer_ ? transferQueue_ : graphicsQueue;
|
||||||
|
vkQueueSubmit(targetQueue, 1, &submitInfo, fence);
|
||||||
|
|
||||||
// Stash everything for later cleanup when fence signals
|
// Stash everything for later cleanup when fence signals
|
||||||
InFlightBatch batch;
|
InFlightBatch batch;
|
||||||
|
|
@ -1759,15 +1856,30 @@ void VkContext::endUploadBatchSync() {
|
||||||
|
|
||||||
inUploadBatch_ = false;
|
inUploadBatch_ = false;
|
||||||
|
|
||||||
|
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
|
||||||
|
|
||||||
if (batchStagingBuffers_.empty()) {
|
if (batchStagingBuffers_.empty()) {
|
||||||
vkEndCommandBuffer(batchCmd_);
|
vkEndCommandBuffer(batchCmd_);
|
||||||
vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
|
vkFreeCommandBuffers(device, pool, 1, &batchCmd_);
|
||||||
batchCmd_ = VK_NULL_HANDLE;
|
batchCmd_ = VK_NULL_HANDLE;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Synchronous path for load screens — submit and wait
|
// Synchronous path for load screens — submit and wait on the target queue.
|
||||||
endSingleTimeCommands(batchCmd_);
|
VkQueue targetQueue = hasDedicatedTransfer_ ? transferQueue_ : graphicsQueue;
|
||||||
|
|
||||||
|
vkEndCommandBuffer(batchCmd_);
|
||||||
|
|
||||||
|
VkSubmitInfo submitInfo{};
|
||||||
|
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
|
||||||
|
submitInfo.commandBufferCount = 1;
|
||||||
|
submitInfo.pCommandBuffers = &batchCmd_;
|
||||||
|
|
||||||
|
vkQueueSubmit(targetQueue, 1, &submitInfo, immFence);
|
||||||
|
vkWaitForFences(device, 1, &immFence, VK_TRUE, UINT64_MAX);
|
||||||
|
vkResetFences(device, 1, &immFence);
|
||||||
|
|
||||||
|
vkFreeCommandBuffers(device, pool, 1, &batchCmd_);
|
||||||
batchCmd_ = VK_NULL_HANDLE;
|
batchCmd_ = VK_NULL_HANDLE;
|
||||||
|
|
||||||
for (auto& staging : batchStagingBuffers_) {
|
for (auto& staging : batchStagingBuffers_) {
|
||||||
|
|
@ -1779,6 +1891,8 @@ void VkContext::endUploadBatchSync() {
|
||||||
void VkContext::pollUploadBatches() {
|
void VkContext::pollUploadBatches() {
|
||||||
if (inFlightBatches_.empty()) return;
|
if (inFlightBatches_.empty()) return;
|
||||||
|
|
||||||
|
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
|
||||||
|
|
||||||
for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ) {
|
for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ) {
|
||||||
VkResult result = vkGetFenceStatus(device, it->fence);
|
VkResult result = vkGetFenceStatus(device, it->fence);
|
||||||
if (result == VK_SUCCESS) {
|
if (result == VK_SUCCESS) {
|
||||||
|
|
@ -1786,7 +1900,7 @@ void VkContext::pollUploadBatches() {
|
||||||
for (auto& staging : it->stagingBuffers) {
|
for (auto& staging : it->stagingBuffers) {
|
||||||
destroyBuffer(allocator, staging);
|
destroyBuffer(allocator, staging);
|
||||||
}
|
}
|
||||||
vkFreeCommandBuffers(device, immCommandPool, 1, &it->cmd);
|
vkFreeCommandBuffers(device, pool, 1, &it->cmd);
|
||||||
vkDestroyFence(device, it->fence, nullptr);
|
vkDestroyFence(device, it->fence, nullptr);
|
||||||
it = inFlightBatches_.erase(it);
|
it = inFlightBatches_.erase(it);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -1796,12 +1910,14 @@ void VkContext::pollUploadBatches() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void VkContext::waitAllUploads() {
|
void VkContext::waitAllUploads() {
|
||||||
|
VkCommandPool pool = hasDedicatedTransfer_ ? transferCommandPool_ : immCommandPool;
|
||||||
|
|
||||||
for (auto& batch : inFlightBatches_) {
|
for (auto& batch : inFlightBatches_) {
|
||||||
vkWaitForFences(device, 1, &batch.fence, VK_TRUE, UINT64_MAX);
|
vkWaitForFences(device, 1, &batch.fence, VK_TRUE, UINT64_MAX);
|
||||||
for (auto& staging : batch.stagingBuffers) {
|
for (auto& staging : batch.stagingBuffers) {
|
||||||
destroyBuffer(allocator, staging);
|
destroyBuffer(allocator, staging);
|
||||||
}
|
}
|
||||||
vkFreeCommandBuffers(device, immCommandPool, 1, &batch.cmd);
|
vkFreeCommandBuffers(device, pool, 1, &batch.cmd);
|
||||||
vkDestroyFence(device, batch.fence, nullptr);
|
vkDestroyFence(device, batch.fence, nullptr);
|
||||||
}
|
}
|
||||||
inFlightBatches_.clear();
|
inFlightBatches_.clear();
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue