Background BLP texture pre-decoding + deferred WMO normal maps (12x streaming perf)

Move CPU-heavy BLP texture decoding from main thread to background worker threads for all hot paths: terrain M2 models, WMO doodad M2s, WMO textures, creature models, and gameobject WMOs. Each renderer (M2, WMO, Character) now accepts a pre-decoded BLP cache that loadTexture() checks before falling back to synchronous decode. Defer WMO normal/height map generation (3 per-pixel passes: luminance, box blur, Sobel) during terrain streaming finalization — this was the dominant remaining bottleneck after BLP pre-decoding. Terrain streaming stalls: 1576ms → 124ms worst case.
2026-03-25 00:20:16 +00:00 · 2026-03-07 15:46:56 -08:00 · 2026-03-07 15:46:56 -08:00 · 7ac990cff4
commit 7ac990cff4
parent 0313bd8692
13 changed files with 573 additions and 109 deletions
--- a/src/rendering/vk_context.cpp
+++ b/src/rendering/vk_context.cpp
@ -67,6 +67,14 @@ void VkContext::shutdown() {
        frame = {};
    }

+    // Clean up any in-flight async upload batches (device already idle)
+    for (auto& batch : inFlightBatches_) {
+        // Staging buffers: skip destroy — allocator is about to be torn down
+        vkDestroyFence(device, batch.fence, nullptr);
+        // Command buffer freed when pool is destroyed below
+    }
+    inFlightBatches_.clear();
+
    if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; }
    if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; }

@ -1447,17 +1455,94 @@ void VkContext::endUploadBatch() {

    inUploadBatch_ = false;

-    // Submit all recorded commands with a single fence wait
+    if (batchStagingBuffers_.empty()) {
+        // No GPU copies were recorded — skip the submit entirely.
+        vkEndCommandBuffer(batchCmd_);
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
+        batchCmd_ = VK_NULL_HANDLE;
+        return;
+    }
+
+    // Submit commands with a NEW fence — don't wait, let GPU work in parallel.
+    vkEndCommandBuffer(batchCmd_);
+
+    VkFenceCreateInfo fenceInfo{};
+    fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+    VkFence fence = VK_NULL_HANDLE;
+    vkCreateFence(device, &fenceInfo, nullptr, &fence);
+
+    VkSubmitInfo submitInfo{};
+    submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    submitInfo.commandBufferCount = 1;
+    submitInfo.pCommandBuffers = &batchCmd_;
+    vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence);
+
+    // Stash everything for later cleanup when fence signals
+    InFlightBatch batch;
+    batch.fence = fence;
+    batch.cmd = batchCmd_;
+    batch.stagingBuffers = std::move(batchStagingBuffers_);
+    inFlightBatches_.push_back(std::move(batch));
+
+    batchCmd_ = VK_NULL_HANDLE;
+    batchStagingBuffers_.clear();
+}
+
+void VkContext::endUploadBatchSync() {  // Blocking way to close an upload batch: cleanup happens before returning.
+    if (uploadBatchDepth_ <= 0) return;  // depth is not positive: nothing to end
+    uploadBatchDepth_--;
+    if (uploadBatchDepth_ > 0) return;  // still nested: only the outermost end proceeds
+
+    inUploadBatch_ = false;
+
+    if (batchStagingBuffers_.empty()) {  // no staging buffers were deferred: end and free the command buffer without submitting
+        vkEndCommandBuffer(batchCmd_);
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
+        batchCmd_ = VK_NULL_HANDLE;
+        return;
+    }
+
+    // Synchronous path for load screens — submit and wait (presumably endSingleTimeCommands blocks until the GPU is done; it is defined outside this hunk — confirm)
    endSingleTimeCommands(batchCmd_);
    batchCmd_ = VK_NULL_HANDLE;

-    // Destroy all deferred staging buffers
    for (auto& staging : batchStagingBuffers_) {
        destroyBuffer(allocator, staging);
    }
    batchStagingBuffers_.clear();
 }

+/// Non-blocking reclamation of async upload batches.
+///
+/// Queries the fence of every entry in inFlightBatches_ without waiting.
+/// A batch is kept only while its fence reports VK_NOT_READY; otherwise its
+/// staging buffers, command buffer and fence are released and the entry is
+/// removed from the list.
+void VkContext::pollUploadBatches() {
+    if (inFlightBatches_.empty()) return;
+
+    for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ) {
+        const VkResult status = vkGetFenceStatus(device, it->fence);
+        if (status == VK_NOT_READY) {
+            // GPU has not finished this batch yet — look again on a later poll.
+            ++it;
+            continue;
+        }
+
+        // VK_SUCCESS means the copies completed. Any error status (for
+        // example VK_ERROR_DEVICE_LOST) means the fence will never report
+        // success, so the batch is reclaimed now instead of being re-polled
+        // forever. This mirrors waitAllUploads(), which releases resources
+        // regardless of the wait result.
+        for (auto& staging : it->stagingBuffers) {
+            destroyBuffer(allocator, staging);
+        }
+        vkFreeCommandBuffers(device, immCommandPool, 1, &it->cmd);
+        vkDestroyFence(device, it->fence, nullptr);
+        it = inFlightBatches_.erase(it);
+    }
+}
+
+/// Blocks until every in-flight upload batch has been waited on, releasing
+/// each batch's staging buffers, command buffer and fence along the way.
+/// inFlightBatches_ is empty on return.
+void VkContext::waitAllUploads() {
+    if (inFlightBatches_.empty()) {
+        inFlightBatches_.clear();
+        return;
+    }
+
+    for (auto pending = inFlightBatches_.begin(); pending != inFlightBatches_.end(); ++pending) {
+        // No timeout: wait for as long as this batch's fence needs.
+        vkWaitForFences(device, 1, &pending->fence, VK_TRUE, UINT64_MAX);
+
+        // Release everything the batch was holding.
+        for (auto& buffer : pending->stagingBuffers) {
+            destroyBuffer(allocator, buffer);
+        }
+        vkFreeCommandBuffers(device, immCommandPool, 1, &pending->cmd);
+        vkDestroyFence(device, pending->fence, nullptr);
+    }
+
+    inFlightBatches_.clear();
+}
+
 // Hands a staging buffer to the current upload batch by appending it to
 // batchStagingBuffers_; it is not destroyed here.
 void VkContext::deferStagingCleanup(AllocatedBuffer staging) {
    batchStagingBuffers_.push_back(std::move(staging));
 }