Batch GPU uploads to eliminate per-upload fence waits (stutter fix)

Every uploadBuffer/VkTexture::upload call went through immediateSubmit, which performed a separate vkQueueSubmit + vkWaitForFences. Loading a single creature model with textures caused 4-8+ fence waits; terrain chunks caused 80+ per batch.

Added beginUploadBatch/endUploadBatch to VkContext: they record all upload commands into a single command buffer and submit once with one fence wait. Staging buffers are deferred for cleanup until after the batch completes.

Wrapped in batch mode:
- CharacterRenderer::loadModel (creature VB/IB + textures)
- M2Renderer::loadModel (doodad VB/IB + textures)
- TerrainRenderer::loadTerrain/loadTerrainIncremental (chunk geometry + textures)
- TerrainRenderer::uploadPreloadedTextures
- WMORenderer::loadModel (group geometry + textures)
2026-04-17 09:33:51 +00:00 · 2026-03-07 12:19:59 -08:00 · 2026-03-07 12:19:59 -08:00 · 16b4336700
commit 16b4336700
parent 884b72bc1c
8 changed files with 97 additions and 4 deletions
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@ -1247,6 +1247,10 @@ bool CharacterRenderer::loadModel(const pipeline::M2Model& model, uint32_t id) {
    M2ModelGPU gpuModel;
    gpuModel.data = model;

+    // Batch all GPU uploads (VB, IB, textures) into a single command buffer
+    // submission with one fence wait, instead of one fence wait per upload.
+    vkCtx_->beginUploadBatch();
+
    // Setup GPU buffers
    setupModelBuffers(gpuModel);

@ -1259,6 +1263,8 @@ bool CharacterRenderer::loadModel(const pipeline::M2Model& model, uint32_t id) {
        gpuModel.textureIds.push_back(texPtr);
    }

+    vkCtx_->endUploadBatch();
+
    models[id] = std::move(gpuModel);

    core::Logger::getInstance().debug("Loaded M2 model ", id, " (", model.vertices.size(),
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@ -1185,6 +1185,10 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
        }
    }

+    // Batch all GPU uploads (VB, IB, textures) into a single command buffer
+    // submission with one fence wait, instead of one fence wait per upload.
+    vkCtx_->beginUploadBatch();
+
    if (hasGeometry) {
        // Create VBO with interleaved vertex data
        // Format: position (3), normal (3), texcoord0 (2), texcoord1 (2), boneWeights (4), boneIndices (4 as float)
@ -1536,6 +1540,8 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
        }
    }

+    vkCtx_->endUploadBatch();
+
    // Allocate Vulkan descriptor sets and UBOs for each batch
    for (auto& bgpu : gpuModel.batches) {
        // Create combined UBO for M2Params (binding 1) + M2Material (binding 2)
--- a/src/rendering/terrain_renderer.cpp
+++ b/src/rendering/terrain_renderer.cpp
@ -326,6 +326,8 @@ bool TerrainRenderer::loadTerrain(const pipeline::TerrainMesh& mesh,
    }
    LOG_DEBUG("Loading terrain mesh: ", mesh.validChunkCount, " chunks");

+    vkCtx->beginUploadBatch();
+
    for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++) {
            const auto& chunk = mesh.getChunk(x, y);
@ -405,6 +407,8 @@ bool TerrainRenderer::loadTerrain(const pipeline::TerrainMesh& mesh,
        }
    }

+    vkCtx->endUploadBatch();
+
    LOG_DEBUG("Loaded ", chunks.size(), " terrain chunks to GPU");
    return !chunks.empty();
 }
@ -413,6 +417,10 @@ bool TerrainRenderer::loadTerrainIncremental(const pipeline::TerrainMesh& mesh,
                                              const std::vector<std::string>& texturePaths,
                                              int tileX, int tileY,
                                              int& chunkIndex, int maxChunksPerCall) {
+    // Batch all GPU uploads (VBs, IBs, textures) into a single command buffer
+    // submission with one fence wait, instead of one per buffer/texture.
+    vkCtx->beginUploadBatch();
+
    int uploaded = 0;
    while (chunkIndex < 256 && uploaded < maxChunksPerCall) {
        int cy = chunkIndex / 16;
@ -490,6 +498,8 @@ bool TerrainRenderer::loadTerrainIncremental(const pipeline::TerrainMesh& mesh,
        uploaded++;
    }

+    vkCtx->endUploadBatch();
+
    return chunkIndex >= 256;
 }

@ -580,6 +590,9 @@ void TerrainRenderer::uploadPreloadedTextures(
                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
        return key;
    };
+    // Batch all texture uploads into a single command buffer submission
+    vkCtx->beginUploadBatch();
+
    for (const auto& [path, blp] : textures) {
        std::string key = normalizeKey(path);
        if (textureCache.find(key) != textureCache.end()) continue;
@ -599,6 +612,8 @@ void TerrainRenderer::uploadPreloadedTextures(
        textureCacheBytes_ += e.approxBytes;
        textureCache[key] = std::move(e);
    }
+
+    vkCtx->endUploadBatch();
 }

 VkTexture* TerrainRenderer::createAlphaTexture(const std::vector<uint8_t>& alphaData) {
--- a/src/rendering/vk_context.cpp
+++ b/src/rendering/vk_context.cpp
@ -1423,10 +1423,44 @@ void VkContext::endSingleTimeCommands(VkCommandBuffer cmd) {
 }

 void VkContext::immediateSubmit(std::function<void(VkCommandBuffer cmd)>&& function) {
+    if (inUploadBatch_) {
+        // Batch mode: record into the shared batch command buffer only; the single submit + fence wait happens in endUploadBatch()
+        function(batchCmd_);
+        return;
+    }
    VkCommandBuffer cmd = beginSingleTimeCommands();
    function(cmd);
    endSingleTimeCommands(cmd);
 }

+void VkContext::beginUploadBatch() { // Opens an upload batch; every call must be paired with endUploadBatch()
+    uploadBatchDepth_++; // nesting counter: only the outermost begin/end pair starts and submits the batch
+    if (inUploadBatch_) return; // already in a batch (nested call)
+    inUploadBatch_ = true; // from here on immediateSubmit() records into batchCmd_ instead of submitting
+    batchCmd_ = beginSingleTimeCommands();
+}
+
+void VkContext::endUploadBatch() { // Closes one nesting level; only the outermost call submits and frees staging buffers
+    if (uploadBatchDepth_ <= 0) return; // unbalanced call with no open batch: ignore
+    uploadBatchDepth_--;
+    if (uploadBatchDepth_ > 0) return; // still inside an outer batch
+
+    inUploadBatch_ = false; // cleared before submitting, so later immediateSubmit() calls take the non-batched path
+
+    // Submit all recorded commands with a single fence wait
+    endSingleTimeCommands(batchCmd_);
+    batchCmd_ = VK_NULL_HANDLE;
+
+    // Destroy all deferred staging buffers (queued via deferStagingCleanup() while the batch was open)
+    for (auto& staging : batchStagingBuffers_) {
+        destroyBuffer(allocator, staging);
+    }
+    batchStagingBuffers_.clear(); // leave the list empty for the next batch
+}
+
+void VkContext::deferStagingCleanup(AllocatedBuffer staging) { // Queues a staging buffer to be destroyed by the outermost endUploadBatch()
+    batchStagingBuffers_.push_back(staging); // batch state is not checked here; callers in this change guard with isInUploadBatch()
+}
+
 } // namespace rendering
 } // namespace wowee
--- a/src/rendering/vk_texture.cpp
+++ b/src/rendering/vk_texture.cpp
@ -96,7 +96,11 @@ bool VkTexture::upload(VkContext& ctx, const uint8_t* pixels, uint32_t width, ui
        generateMipmaps(ctx, format, width, height);
    }

-    destroyBuffer(ctx.getAllocator(), staging);
+    if (ctx.isInUploadBatch()) {
+        ctx.deferStagingCleanup(staging);
+    } else {
+        destroyBuffer(ctx.getAllocator(), staging);
+    }
    return true;
 }

@ -162,7 +166,11 @@ bool VkTexture::uploadMips(VkContext& ctx, const uint8_t* const* mipData,
            VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
    });

-    destroyBuffer(ctx.getAllocator(), staging);
+    if (ctx.isInUploadBatch()) {
+        ctx.deferStagingCleanup(staging);
+    } else {
+        destroyBuffer(ctx.getAllocator(), staging);
+    }
    return true;
 }

--- a/src/rendering/vk_utils.cpp
+++ b/src/rendering/vk_utils.cpp
@ -198,8 +198,12 @@ AllocatedBuffer uploadBuffer(VkContext& ctx, const void* data, VkDeviceSize size
        vkCmdCopyBuffer(cmd, staging.buffer, gpuBuffer.buffer, 1, &copyRegion);
    });

-    // Destroy staging buffer
-    destroyBuffer(ctx.getAllocator(), staging);
+    // Destroy staging buffer (deferred if in batch mode)
+    if (ctx.isInUploadBatch()) {
+        ctx.deferStagingCleanup(staging);
+    } else {
+        destroyBuffer(ctx.getAllocator(), staging);
+    }

    return gpuBuffer;
 }
--- a/src/rendering/wmo_renderer.cpp
+++ b/src/rendering/wmo_renderer.cpp
@ -419,6 +419,10 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) {
    core::Logger::getInstance().debug("  WMO bounds: min=(", model.boundingBoxMin.x, ", ", model.boundingBoxMin.y, ", ", model.boundingBoxMin.z,
                                      ") max=(", model.boundingBoxMax.x, ", ", model.boundingBoxMax.y, ", ", model.boundingBoxMax.z, ")");

+    // Batch all GPU uploads (textures, VBs, IBs) into a single command buffer
+    // submission with one fence wait, instead of one per upload.
+    vkCtx_->beginUploadBatch();
+
    // Load textures for this model
    core::Logger::getInstance().debug("  WMO has ", model.textures.size(), " texture paths, ", model.materials.size(), " materials");
    if (assetManager && !model.textures.empty()) {
@ -720,6 +724,8 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) {
        groupRes.allUntextured = !anyTextured && !groupRes.mergedBatches.empty();
    }

+    vkCtx_->endUploadBatch();
+
    // Copy portal data for visibility culling
    modelData.portalVertices = model.portalVertices;
    for (const auto& portal : model.portals) {