Background BLP texture pre-decoding + deferred WMO normal maps (12x streaming perf)

Move CPU-heavy BLP texture decoding from main thread to background worker
threads for all hot paths: terrain M2 models, WMO doodad M2s, WMO textures,
creature models, and gameobject WMOs. Each renderer (M2, WMO, Character) now
accepts a pre-decoded BLP cache that loadTexture() checks before falling back
to synchronous decode.

Defer WMO normal/height map generation (3 per-pixel passes: luminance, box
blur, Sobel) during terrain streaming finalization — this was the dominant
remaining bottleneck after BLP pre-decoding.

Terrain streaming stalls: 1576ms → 124ms worst case.
This commit is contained in:
Kelsi 2026-03-07 15:46:56 -08:00
parent 0313bd8692
commit 7ac990cff4
13 changed files with 573 additions and 109 deletions

View file

@ -67,6 +67,14 @@ void VkContext::shutdown() {
frame = {};
}
// Clean up any in-flight async upload batches (device already idle)
for (auto& batch : inFlightBatches_) {
// Staging buffers: skip destroy — allocator is about to be torn down
vkDestroyFence(device, batch.fence, nullptr);
// Command buffer freed when pool is destroyed below
}
inFlightBatches_.clear();
if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; }
if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; }
@ -1447,17 +1455,94 @@ void VkContext::endUploadBatch() {
inUploadBatch_ = false;
// Submit all recorded commands with a single fence wait
if (batchStagingBuffers_.empty()) {
// No GPU copies were recorded — skip the submit entirely.
vkEndCommandBuffer(batchCmd_);
vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
batchCmd_ = VK_NULL_HANDLE;
return;
}
// Submit commands with a NEW fence — don't wait, let GPU work in parallel.
vkEndCommandBuffer(batchCmd_);
VkFenceCreateInfo fenceInfo{};
fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
VkFence fence = VK_NULL_HANDLE;
vkCreateFence(device, &fenceInfo, nullptr, &fence);
VkSubmitInfo submitInfo{};
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &batchCmd_;
vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence);
// Stash everything for later cleanup when fence signals
InFlightBatch batch;
batch.fence = fence;
batch.cmd = batchCmd_;
batch.stagingBuffers = std::move(batchStagingBuffers_);
inFlightBatches_.push_back(std::move(batch));
batchCmd_ = VK_NULL_HANDLE;
batchStagingBuffers_.clear();
}
void VkContext::endUploadBatchSync() {
if (uploadBatchDepth_ <= 0) return;
uploadBatchDepth_--;
if (uploadBatchDepth_ > 0) return;
inUploadBatch_ = false;
if (batchStagingBuffers_.empty()) {
vkEndCommandBuffer(batchCmd_);
vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
batchCmd_ = VK_NULL_HANDLE;
return;
}
// Synchronous path for load screens — submit and wait
endSingleTimeCommands(batchCmd_);
batchCmd_ = VK_NULL_HANDLE;
// Destroy all deferred staging buffers
for (auto& staging : batchStagingBuffers_) {
destroyBuffer(allocator, staging);
}
batchStagingBuffers_.clear();
}
void VkContext::pollUploadBatches() {
if (inFlightBatches_.empty()) return;
for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ) {
VkResult result = vkGetFenceStatus(device, it->fence);
if (result == VK_SUCCESS) {
// GPU finished — free resources
for (auto& staging : it->stagingBuffers) {
destroyBuffer(allocator, staging);
}
vkFreeCommandBuffers(device, immCommandPool, 1, &it->cmd);
vkDestroyFence(device, it->fence, nullptr);
it = inFlightBatches_.erase(it);
} else {
++it;
}
}
}
void VkContext::waitAllUploads() {
for (auto& batch : inFlightBatches_) {
vkWaitForFences(device, 1, &batch.fence, VK_TRUE, UINT64_MAX);
for (auto& staging : batch.stagingBuffers) {
destroyBuffer(allocator, staging);
}
vkFreeCommandBuffers(device, immCommandPool, 1, &batch.cmd);
vkDestroyFence(device, batch.fence, nullptr);
}
inFlightBatches_.clear();
}
void VkContext::deferStagingCleanup(AllocatedBuffer staging) {
batchStagingBuffers_.push_back(staging);
}