From 16b43367003334747a44f1089a619b1e1d1cb7a3 Mon Sep 17 00:00:00 2001
From: Kelsi <kelsihates2fa@gmail.com>
Date: Sat, 7 Mar 2026 12:19:59 -0800
Subject: [PATCH] Batch GPU uploads to eliminate per-upload fence waits
 (stutter fix)

Every uploadBuffer and VkTexture::upload call went through immediateSubmit,
which performed its own vkQueueSubmit + vkWaitForFences. Loading a single
creature model with textures caused 4-8+ fence waits; terrain chunks caused
80+ per batch.

Added beginUploadBatch/endUploadBatch to VkContext: between the two calls,
all upload commands are recorded into a single command buffer, which is
submitted once with one fence wait. Staging buffers are kept alive until
the batch completes and are destroyed afterwards.

Wrapped in batch mode:
- CharacterRenderer::loadModel (creature VB/IB + textures)
- M2Renderer::loadModel (doodad VB/IB + textures)
- TerrainRenderer::loadTerrain/loadTerrainIncremental (chunk geometry + textures)
- TerrainRenderer::uploadPreloadedTextures
- WMORenderer::loadModel (group geometry + textures)
---
 include/rendering/vk_context.hpp     | 14 ++++++++++++
 src/rendering/character_renderer.cpp |  6 +++++
 src/rendering/m2_renderer.cpp        |  6 +++++
 src/rendering/terrain_renderer.cpp   | 15 ++++++++++++
 src/rendering/vk_context.cpp         | 34 ++++++++++++++++++++++++++++
 src/rendering/vk_texture.cpp         | 12 ++++++++--
 src/rendering/vk_utils.cpp           |  8 +++++--
 src/rendering/wmo_renderer.cpp       |  6 +++++
 8 files changed, 97 insertions(+), 4 deletions(-)
diff --git a/include/rendering/vk_context.hpp b/include/rendering/vk_context.hpp
index 3a242940..dab96d2a 100644
--- a/include/rendering/vk_context.hpp
+++ b/include/rendering/vk_context.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "rendering/vk_utils.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <VkBootstrap.h>
@@ -46,6 +47,13 @@ public:
     // Immediate submit for one-off GPU work (descriptor pool creation, etc.)
     void immediateSubmit(std::function<void(VkCommandBuffer cmd)>&& function);
 
+    // Batch upload mode: between beginUploadBatch() and the matching endUploadBatch(), immediateSubmit()
+    // only records into one shared command buffer; endUploadBatch() submits it with ONE fence wait.
+    void beginUploadBatch();  // nestable: only the outermost call starts recording
+    void endUploadBatch();    // outermost call submits, waits, then destroys deferred staging buffers
+    bool isInUploadBatch() const { return inUploadBatch_; }  // true while a batch is recording
+    void deferStagingCleanup(AllocatedBuffer staging);  // queue `staging` for destruction by endUploadBatch()
+
     // Accessors
     VkInstance getInstance() const { return instance; }
     VkPhysicalDevice getPhysicalDevice() const { return physicalDevice; }
@@ -143,6 +151,12 @@ private:
     VkCommandPool immCommandPool = VK_NULL_HANDLE;
     VkFence immFence = VK_NULL_HANDLE;
 
+    // Batch upload state (nesting-safe: only the outermost begin/end pair records and submits)
+    int uploadBatchDepth_ = 0;  // beginUploadBatch() calls not yet matched by endUploadBatch()
+    bool inUploadBatch_ = false;  // true from the outermost begin until the outermost end
+    VkCommandBuffer batchCmd_ = VK_NULL_HANDLE;  // shared command buffer; set only while inUploadBatch_
+    std::vector<AllocatedBuffer> batchStagingBuffers_;  // staging buffers destroyed after the batch is submitted
+
     // Depth buffer (shared across all framebuffers)
     VkImage depthImage = VK_NULL_HANDLE;
     VkImageView depthImageView = VK_NULL_HANDLE;
diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp
index 2126e5e5..9aa99c72 100644
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@@ -1247,6 +1247,10 @@ bool CharacterRenderer::loadModel(const pipeline::M2Model& model, uint32_t id) {
     M2ModelGPU gpuModel;
     gpuModel.data = model;
 
+    // Batch all GPU uploads (VB, IB, textures) into a single command buffer
+    // submission with one fence wait, instead of one fence wait per upload.
+    vkCtx_->beginUploadBatch();
+
     // Setup GPU buffers
     setupModelBuffers(gpuModel);
 
@@ -1259,6 +1263,8 @@ bool CharacterRenderer::loadModel(const pipeline::M2Model& model, uint32_t id) {
         gpuModel.textureIds.push_back(texPtr);
     }
 
+    vkCtx_->endUploadBatch();
+
     models[id] = std::move(gpuModel);
 
     core::Logger::getInstance().debug("Loaded M2 model ", id, " (", model.vertices.size(),
diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp
index d6df9dfe..c4e7a727 100644
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@@ -1185,6 +1185,10 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
         }
     }
 
+    // Batch all GPU uploads (VB, IB, textures) into a single command buffer
+    // submission with one fence wait, instead of one fence wait per upload.
+    vkCtx_->beginUploadBatch();
+
     if (hasGeometry) {
         // Create VBO with interleaved vertex data
         // Format: position (3), normal (3), texcoord0 (2), texcoord1 (2), boneWeights (4), boneIndices (4 as float)
@@ -1536,6 +1540,8 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
         }
     }
 
+    vkCtx_->endUploadBatch();
+
     // Allocate Vulkan descriptor sets and UBOs for each batch
     for (auto& bgpu : gpuModel.batches) {
         // Create combined UBO for M2Params (binding 1) + M2Material (binding 2)
diff --git a/src/rendering/terrain_renderer.cpp b/src/rendering/terrain_renderer.cpp
index 227178d5..fb20ce42 100644
--- a/src/rendering/terrain_renderer.cpp
+++ b/src/rendering/terrain_renderer.cpp
@@ -326,6 +326,8 @@ bool TerrainRenderer::loadTerrain(const pipeline::TerrainMesh& mesh,
     }
     LOG_DEBUG("Loading terrain mesh: ", mesh.validChunkCount, " chunks");
 
+    vkCtx->beginUploadBatch();
+
     for (int y = 0; y < 16; y++) {
         for (int x = 0; x < 16; x++) {
             const auto& chunk = mesh.getChunk(x, y);
@@ -405,6 +407,8 @@ bool TerrainRenderer::loadTerrain(const pipeline::TerrainMesh& mesh,
         }
     }
 
+    vkCtx->endUploadBatch();
+
     LOG_DEBUG("Loaded ", chunks.size(), " terrain chunks to GPU");
     return !chunks.empty();
 }
@@ -413,6 +417,10 @@ bool TerrainRenderer::loadTerrainIncremental(const pipeline::TerrainMesh& mesh,
                                               const std::vector<std::string>& texturePaths,
                                               int tileX, int tileY,
                                               int& chunkIndex, int maxChunksPerCall) {
+    // Batch all GPU uploads (VBs, IBs, textures) into a single command buffer
+    // submission with one fence wait, instead of one per buffer/texture.
+    vkCtx->beginUploadBatch();
+
     int uploaded = 0;
     while (chunkIndex < 256 && uploaded < maxChunksPerCall) {
         int cy = chunkIndex / 16;
@@ -490,6 +498,8 @@ bool TerrainRenderer::loadTerrainIncremental(const pipeline::TerrainMesh& mesh,
         uploaded++;
     }
 
+    vkCtx->endUploadBatch();
+
     return chunkIndex >= 256;
 }
 
@@ -580,6 +590,9 @@ void TerrainRenderer::uploadPreloadedTextures(
                        [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
         return key;
     };
+    // Batch all texture uploads into a single command buffer submission
+    vkCtx->beginUploadBatch();
+
     for (const auto& [path, blp] : textures) {
         std::string key = normalizeKey(path);
         if (textureCache.find(key) != textureCache.end()) continue;
@@ -599,6 +612,8 @@ void TerrainRenderer::uploadPreloadedTextures(
         textureCacheBytes_ += e.approxBytes;
         textureCache[key] = std::move(e);
     }
+
+    vkCtx->endUploadBatch();
 }
 
 VkTexture* TerrainRenderer::createAlphaTexture(const std::vector<uint8_t>& alphaData) {
diff --git a/src/rendering/vk_context.cpp b/src/rendering/vk_context.cpp
index e1a76cee..dc73c685 100644
--- a/src/rendering/vk_context.cpp
+++ b/src/rendering/vk_context.cpp
@@ -1423,10 +1423,44 @@ void VkContext::endSingleTimeCommands(VkCommandBuffer cmd) {
 }
 
 void VkContext::immediateSubmit(std::function<void(VkCommandBuffer cmd)>&& function) {
+    if (inUploadBatch_) {  // batch open: execution is deferred to endUploadBatch()
+        // Record into the batch command buffer — no submit, no fence wait, so the GPU has NOT run this work on return
+        function(batchCmd_);
+        return;  // anything the recorded commands reference must stay alive until endUploadBatch() finishes
+    }
     VkCommandBuffer cmd = beginSingleTimeCommands();
     function(cmd);
     endSingleTimeCommands(cmd);
 }
 
+void VkContext::beginUploadBatch() {  // opens (or nests into) a batch; an unmatched call leaves immediateSubmit() recording without ever submitting
+    uploadBatchDepth_++;  // counted even when nested so endUploadBatch() can tell the outermost end from inner ones
+    if (inUploadBatch_) return; // already in a batch (nested call): keep recording into the existing batchCmd_
+    inUploadBatch_ = true;  // from here on immediateSubmit() records instead of submitting
+    batchCmd_ = beginSingleTimeCommands();
+}
+
+void VkContext::endUploadBatch() {  // closes one nesting level; only the outermost close submits and cleans up
+    if (uploadBatchDepth_ <= 0) return;  // end without a matching begin: silently ignored
+    uploadBatchDepth_--;
+    if (uploadBatchDepth_ > 0) return; // still inside an outer batch
+
+    inUploadBatch_ = false;  // later immediateSubmit() calls go back to submitting on their own
+
+    // Submit all recorded commands with a single fence wait
+    endSingleTimeCommands(batchCmd_);
+    batchCmd_ = VK_NULL_HANDLE;
+
+    // Destroy all deferred staging buffers (only after the submit above: the recorded copies read from them)
+    for (auto& staging : batchStagingBuffers_) {
+        destroyBuffer(allocator, staging);
+    }
+    batchStagingBuffers_.clear();
+}
+
+void VkContext::deferStagingCleanup(AllocatedBuffer staging) {  // takes over destruction of `staging`
+    if (!inUploadBatch_) { destroyBuffer(allocator, staging); return; }  // no batch open: free now, as the non-batch upload paths do, instead of parking it until some later batch
+    batchStagingBuffers_.push_back(staging);  // destroyed by the outermost endUploadBatch() after its fence wait
+}
 } // namespace rendering
 } // namespace wowee
diff --git a/src/rendering/vk_texture.cpp b/src/rendering/vk_texture.cpp
index fba6d72b..415e3d56 100644
--- a/src/rendering/vk_texture.cpp
+++ b/src/rendering/vk_texture.cpp
@@ -96,7 +96,11 @@ bool VkTexture::upload(VkContext& ctx, const uint8_t* pixels, uint32_t width, ui
         generateMipmaps(ctx, format, width, height);
     }
 
-    destroyBuffer(ctx.getAllocator(), staging);
+    if (ctx.isInUploadBatch()) {
+        ctx.deferStagingCleanup(staging);
+    } else {
+        destroyBuffer(ctx.getAllocator(), staging);
+    }
     return true;
 }
 
@@ -162,7 +166,11 @@ bool VkTexture::uploadMips(VkContext& ctx, const uint8_t* const* mipData,
             VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
     });
 
-    destroyBuffer(ctx.getAllocator(), staging);
+    if (ctx.isInUploadBatch()) {
+        ctx.deferStagingCleanup(staging);
+    } else {
+        destroyBuffer(ctx.getAllocator(), staging);
+    }
     return true;
 }
 
diff --git a/src/rendering/vk_utils.cpp b/src/rendering/vk_utils.cpp
index d105c986..3a2f51d1 100644
--- a/src/rendering/vk_utils.cpp
+++ b/src/rendering/vk_utils.cpp
@@ -198,8 +198,12 @@ AllocatedBuffer uploadBuffer(VkContext& ctx, const void* data, VkDeviceSize size
         vkCmdCopyBuffer(cmd, staging.buffer, gpuBuffer.buffer, 1, &copyRegion);
     });
 
-    // Destroy staging buffer
-    destroyBuffer(ctx.getAllocator(), staging);
+    // Destroy staging buffer (deferred if in batch mode)
+    if (ctx.isInUploadBatch()) {
+        ctx.deferStagingCleanup(staging);
+    } else {
+        destroyBuffer(ctx.getAllocator(), staging);
+    }
 
     return gpuBuffer;
 }
diff --git a/src/rendering/wmo_renderer.cpp b/src/rendering/wmo_renderer.cpp
index ff6b0035..691abaa1 100644
--- a/src/rendering/wmo_renderer.cpp
+++ b/src/rendering/wmo_renderer.cpp
@@ -419,6 +419,10 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) {
     core::Logger::getInstance().debug("  WMO bounds: min=(", model.boundingBoxMin.x, ", ", model.boundingBoxMin.y, ", ", model.boundingBoxMin.z,
                                       ") max=(", model.boundingBoxMax.x, ", ", model.boundingBoxMax.y, ", ", model.boundingBoxMax.z, ")");
 
+    // Batch all GPU uploads (textures, VBs, IBs) into a single command buffer
+    // submission with one fence wait, instead of one per upload.
+    vkCtx_->beginUploadBatch();
+
     // Load textures for this model
     core::Logger::getInstance().debug("  WMO has ", model.textures.size(), " texture paths, ", model.materials.size(), " materials");
     if (assetManager && !model.textures.empty()) {
@@ -720,6 +724,8 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) {
         groupRes.allUntextured = !anyTextured && !groupRes.mergedBatches.empty();
     }
 
+    vkCtx_->endUploadBatch();
+
     // Copy portal data for visibility culling
     modelData.portalVertices = model.portalVertices;
     for (const auto& portal : model.portals) {