From 7ac990cff43028e26f6674b90950286f7817131b Mon Sep 17 00:00:00 2001
From: Kelsi <kelsihates2fa@gmail.com>
Date: Sat, 7 Mar 2026 15:46:56 -0800
Subject: [PATCH] Background BLP texture pre-decoding + deferred WMO normal
 maps (12x streaming perf)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move CPU-heavy BLP texture decoding from main thread to background worker
threads for all hot paths: terrain M2 models, WMO doodad M2s, WMO textures,
creature models, and gameobject WMOs. Each renderer (M2, WMO, Character) now
accepts a pre-decoded BLP cache that loadTexture() checks before falling back
to synchronous decode.

Defer WMO normal/height map generation (3 per-pixel passes: luminance, box
blur, Sobel) during terrain streaming finalization — this was the dominant
remaining bottleneck after BLP pre-decoding.

Terrain streaming stalls: 1576ms → 124ms worst case.
---
 include/core/application.hpp             |  22 ++-
 include/rendering/character_renderer.hpp |   6 +
 include/rendering/m2_renderer.hpp        |   8 +
 include/rendering/terrain_manager.hpp    |   6 +
 include/rendering/vk_context.hpp         |  13 +-
 include/rendering/wmo_renderer.hpp       |   9 +
 src/core/application.cpp                 | 231 +++++++++++++++++++++--
 src/rendering/character_renderer.cpp     |  52 ++---
 src/rendering/m2_renderer.cpp            |  93 +++++----
 src/rendering/renderer.cpp               |  10 +
 src/rendering/terrain_manager.cpp        | 115 ++++++++---
 src/rendering/vk_context.cpp             |  89 ++++++++-
 src/rendering/wmo_renderer.cpp           |  28 ++-
 13 files changed, 573 insertions(+), 109 deletions(-)
diff --git a/include/core/application.hpp b/include/core/application.hpp
index a23e6bd8..c97bfaf6 100644
--- a/include/core/application.hpp
+++ b/include/core/application.hpp
@@ -3,6 +3,7 @@
 #include "core/window.hpp"
 #include "core/input.hpp"
 #include "game/character.hpp"
+#include "pipeline/blp_loader.hpp"
 #include <memory>
 #include <string>
 #include <vector>
@@ -23,7 +24,7 @@ namespace rendering { class Renderer; }
 namespace ui { class UIManager; }
 namespace auth { class AuthHandler; }
 namespace game { class GameHandler; class World; class ExpansionRegistry; }
-namespace pipeline { class AssetManager; class DBCLayout; struct M2Model; }
+namespace pipeline { class AssetManager; class DBCLayout; struct M2Model; struct WMOModel; }
 namespace audio { enum class VoiceType; }
 
 namespace core {
@@ -206,6 +207,7 @@ private:
         uint32_t modelId;
         float x, y, z, orientation;
         std::shared_ptr<pipeline::M2Model> model; // parsed on background thread
+        std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures; // decoded on bg thread
         bool valid = false;
         bool permanent_failure = false;
     };
@@ -337,6 +339,24 @@ private:
     };
     std::vector<PendingGameObjectSpawn> pendingGameObjectSpawns_;
     void processGameObjectSpawnQueue();
+
+    // Async WMO loading for game objects (file I/O + parse on background thread)
+    struct PreparedGameObjectWMO {
+        uint64_t guid;
+        uint32_t entry;
+        uint32_t displayId;
+        float x, y, z, orientation;
+        std::shared_ptr<pipeline::WMOModel> wmoModel;
+        std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures; // decoded on bg thread
+        bool valid = false;
+        bool isWmo = false;
+        std::string modelPath;
+    };
+    struct AsyncGameObjectLoad {
+        std::future<PreparedGameObjectWMO> future;
+    };
+    std::vector<AsyncGameObjectLoad> asyncGameObjectLoads_;
+    void processAsyncGameObjectResults();
     struct PendingTransportDoodadBatch {
         uint64_t guid = 0;
         uint32_t modelId = 0;
diff --git a/include/rendering/character_renderer.hpp b/include/rendering/character_renderer.hpp
index 52813cf4..c7cae0d7 100644
--- a/include/rendering/character_renderer.hpp
+++ b/include/rendering/character_renderer.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "pipeline/m2_loader.hpp"
+#include "pipeline/blp_loader.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <glm/glm.hpp>
@@ -114,7 +115,11 @@ public:
     void setShadowMap(VkTexture*, const glm::mat4&) {}
     void clearShadowMap() {}
 
+    // Pre-decoded BLP cache: set before calling loadModel() to skip main-thread BLP decode
+    void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }
+
 private:
+    std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
     // GPU representation of M2 model
     struct M2ModelGPU {
         VkBuffer vertexBuffer = VK_NULL_HANDLE;
@@ -180,6 +185,7 @@ private:
 
         // Bone update throttling (skip frames for distant characters)
         uint32_t boneUpdateCounter = 0;
+        const M2ModelGPU* cachedModel = nullptr;  // Avoid per-frame hash lookups
 
         // Per-instance bone SSBO (double-buffered per frame)
         VkBuffer boneBuffer[2] = {};
diff --git a/include/rendering/m2_renderer.hpp b/include/rendering/m2_renderer.hpp
index 91616a28..1c35e34b 100644
--- a/include/rendering/m2_renderer.hpp
+++ b/include/rendering/m2_renderer.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "pipeline/m2_loader.hpp"
+#include "pipeline/blp_loader.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <glm/glm.hpp>
@@ -188,6 +189,7 @@ struct M2Instance {
     bool skipCollision = false;    // WMO interior doodads — skip player wall collision
     float cachedBoundRadius = 0.0f;
     float portalSpinAngle = 0.0f;  // Accumulated spin angle for portal rotation
+    const M2ModelGPU* cachedModel = nullptr;  // Avoid per-frame hash lookups
 
     // Frame-skip optimization (update distant animations less frequently)
     uint8_t frameSkipCounter = 0;
@@ -328,6 +330,10 @@ public:
 
     std::vector<glm::vec3> getWaterVegetationPositions(const glm::vec3& camPos, float maxDist) const;
 
+    // Pre-decoded BLP cache: set by terrain manager before calling loadModel()
+    // so loadTexture() can skip the expensive assetManager->loadTexture() call.
+    void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }
+
 private:
     bool initialized_ = false;
     bool insideInterior = false;
@@ -414,6 +420,8 @@ private:
     uint32_t modelLimitRejectWarnings_ = 0;
 
     VkTexture* loadTexture(const std::string& path, uint32_t texFlags = 0);
+    std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
+
     struct TextureCacheEntry {
         std::unique_ptr<VkTexture> texture;
         size_t approxBytes = 0;
diff --git a/include/rendering/terrain_manager.hpp b/include/rendering/terrain_manager.hpp
index 1b2af320..6f732721 100644
--- a/include/rendering/terrain_manager.hpp
+++ b/include/rendering/terrain_manager.hpp
@@ -121,6 +121,12 @@ struct PendingTile {
     // Pre-loaded terrain texture BLP data (loaded on background thread to avoid
     // blocking file I/O on the main thread during finalizeTile)
     std::unordered_map<std::string, pipeline::BLPImage> preloadedTextures;
+
+    // Pre-decoded M2 model textures (decoded on background thread)
+    std::unordered_map<std::string, pipeline::BLPImage> preloadedM2Textures;
+
+    // Pre-decoded WMO textures (decoded on background thread)
+    std::unordered_map<std::string, pipeline::BLPImage> preloadedWMOTextures;
 };
 
 /**
diff --git a/include/rendering/vk_context.hpp b/include/rendering/vk_context.hpp
index dab96d2a..907e21bf 100644
--- a/include/rendering/vk_context.hpp
+++ b/include/rendering/vk_context.hpp
@@ -50,9 +50,12 @@ public:
     // Batch upload mode: records multiple upload commands into a single
     // command buffer, then submits with ONE fence wait instead of one per upload.
     void beginUploadBatch();
-    void endUploadBatch();
+    void endUploadBatch();       // Async: submits but does NOT wait for fence
+    void endUploadBatchSync();   // Sync: submits and waits (for load screens)
     bool isInUploadBatch() const { return inUploadBatch_; }
     void deferStagingCleanup(AllocatedBuffer staging);
+    void pollUploadBatches();    // Check completed async uploads, free staging buffers
+    void waitAllUploads();       // Block until all in-flight uploads complete
 
     // Accessors
     VkInstance getInstance() const { return instance; }
@@ -157,6 +160,14 @@ private:
     VkCommandBuffer batchCmd_ = VK_NULL_HANDLE;
     std::vector<AllocatedBuffer> batchStagingBuffers_;
 
+    // Async upload: in-flight batches awaiting GPU completion
+    struct InFlightBatch {
+        VkFence fence = VK_NULL_HANDLE;
+        VkCommandBuffer cmd = VK_NULL_HANDLE;
+        std::vector<AllocatedBuffer> stagingBuffers;
+    };
+    std::vector<InFlightBatch> inFlightBatches_;
+
     // Depth buffer (shared across all framebuffers)
     VkImage depthImage = VK_NULL_HANDLE;
     VkImageView depthImageView = VK_NULL_HANDLE;
diff --git a/include/rendering/wmo_renderer.hpp b/include/rendering/wmo_renderer.hpp
index 095a354d..f0d3b36f 100644
--- a/include/rendering/wmo_renderer.hpp
+++ b/include/rendering/wmo_renderer.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "pipeline/blp_loader.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <glm/glm.hpp>
@@ -325,6 +326,12 @@ public:
     // Pre-compute floor cache for all loaded WMO instances
     void precomputeFloorCache();
 
+    // Pre-decoded BLP cache: set before calling loadModel() to skip main-thread BLP decode
+    void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }
+
+    // Defer normal/height map generation during streaming to avoid CPU stalls
+    void setDeferNormalMaps(bool defer) { deferNormalMaps_ = defer; }
+
 private:
     // WMO material UBO — matches WMOMaterial in wmo.frag.glsl
     struct WMOMaterialUBO {
@@ -558,6 +565,7 @@ private:
      * Load a texture from path
      */
     VkTexture* loadTexture(const std::string& path);
+    std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
 
     /**
      * Generate normal+height map from diffuse RGBA8 pixels
@@ -670,6 +678,7 @@ private:
 
     // Normal mapping / POM settings
     bool normalMappingEnabled_ = true;   // on by default
+    bool deferNormalMaps_ = false;       // skip normal map gen during streaming
     float normalMapStrength_ = 0.8f;     // 0.0 = flat, 1.0 = full, 2.0 = exaggerated
     bool pomEnabled_ = true;             // on by default
     int pomQuality_ = 1;                 // 0=Low(16), 1=Medium(32), 2=High(64)
diff --git a/src/core/application.cpp b/src/core/application.cpp
index f0c22a2c..f4712613 100644
--- a/src/core/application.cpp
+++ b/src/core/application.cpp
@@ -6883,7 +6883,7 @@ void Application::spawnOnlineGameObject(uint64_t guid, uint32_t entry, uint32_t
 void Application::processAsyncCreatureResults() {
     // Check completed async model loads and finalize on main thread (GPU upload + instance creation).
     // Limit GPU model uploads per frame to avoid spikes, but always drain cheap bookkeeping.
-    static constexpr int kMaxModelUploadsPerFrame = 3;
+    static constexpr int kMaxModelUploadsPerFrame = 1;
     int modelUploads = 0;
 
     for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) {
@@ -6925,13 +6925,17 @@ void Application::processAsyncCreatureResults() {
         }
 
         // Upload model to GPU (must happen on main thread)
+        // Use pre-decoded BLP cache to skip main-thread texture decode
+        charRenderer->setPredecodedBLPCache(&result.predecodedTextures);
         if (!charRenderer->loadModel(*result.model, result.modelId)) {
+            charRenderer->setPredecodedBLPCache(nullptr);
             nonRenderableCreatureDisplayIds_.insert(result.displayId);
             creaturePermanentFailureGuids_.insert(result.guid);
             pendingCreatureSpawnGuids_.erase(result.guid);
             creatureSpawnRetryCounts_.erase(result.guid);
             continue;
         }
+        charRenderer->setPredecodedBLPCache(nullptr);
         displayIdModelCache_[result.displayId] = result.modelId;
         modelUploads++;
 
@@ -6956,6 +6960,10 @@ void Application::processAsyncCreatureResults() {
 }
 
 void Application::processCreatureSpawnQueue() {
+    auto startTime = std::chrono::steady_clock::now();
+    // Budget: max 2ms per frame for creature spawning to prevent stutter.
+    static constexpr float kSpawnBudgetMs = 2.0f;
+
     // First, finalize any async model loads that completed on background threads.
     processAsyncCreatureResults();
 
@@ -6965,18 +6973,15 @@ void Application::processCreatureSpawnQueue() {
         if (!creatureLookupsBuilt_) return;
     }
 
-    auto startTime = std::chrono::steady_clock::now();
-    // Budget: max 4ms per frame for creature spawning to prevent stutter.
-    static constexpr float kSpawnBudgetMs = 4.0f;
-
     int processed = 0;
     int asyncLaunched = 0;
     size_t rotationsLeft = pendingCreatureSpawns_.size();
     while (!pendingCreatureSpawns_.empty() &&
            processed < MAX_SPAWNS_PER_FRAME &&
            rotationsLeft > 0) {
-        // Check time budget after each spawn (not for the first one, always process at least 1)
-        if (processed > 0) {
+        // Check time budget every iteration (including first — async results may
+        // have already consumed the budget via GPU model uploads).
+        {
             auto now = std::chrono::steady_clock::now();
             float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
             if (elapsedMs >= kSpawnBudgetMs) break;
@@ -7081,6 +7086,20 @@ void Application::processCreatureSpawnQueue() {
                         }
                     }
 
+                    // Pre-decode model textures on background thread
+                    for (const auto& tex : model->textures) {
+                        if (tex.filename.empty()) continue;
+                        std::string texKey = tex.filename;
+                        std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                        std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                        if (result.predecodedTextures.find(texKey) != result.predecodedTextures.end()) continue;
+                        auto blp = am->loadTexture(texKey);
+                        if (blp.isValid()) {
+                            result.predecodedTextures[texKey] = std::move(blp);
+                        }
+                    }
+
                     result.model = std::move(model);
                     result.valid = true;
                     return result;
@@ -7161,14 +7180,202 @@ void Application::processDeferredEquipmentQueue() {
     setOnlinePlayerEquipment(guid, equipData.first, equipData.second);
 }
 
+void Application::processAsyncGameObjectResults() {
+    for (auto it = asyncGameObjectLoads_.begin(); it != asyncGameObjectLoads_.end(); ) {
+        if (!it->future.valid() ||
+            it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
+            ++it;
+            continue;
+        }
+
+        auto result = it->future.get();
+        it = asyncGameObjectLoads_.erase(it);
+
+        if (!result.valid || !result.isWmo || !result.wmoModel) {
+            // Fallback: background load produced no usable WMO — retry through the sync spawn path
+            spawnOnlineGameObject(result.guid, result.entry, result.displayId,
+                                 result.x, result.y, result.z, result.orientation);
+            continue;
+        }
+
+        // WMO parsed on background thread — do GPU upload + instance creation on main thread
+        auto* wmoRenderer = renderer ? renderer->getWMORenderer() : nullptr;
+        if (!wmoRenderer) continue;
+
+        uint32_t modelId = 0;
+        auto itCache = gameObjectDisplayIdWmoCache_.find(result.displayId);
+        if (itCache != gameObjectDisplayIdWmoCache_.end()) {
+            modelId = itCache->second;
+        } else {
+            modelId = nextGameObjectWmoModelId_++;
+            wmoRenderer->setPredecodedBLPCache(&result.predecodedTextures);
+            if (!wmoRenderer->loadModel(*result.wmoModel, modelId)) {
+                wmoRenderer->setPredecodedBLPCache(nullptr);
+                LOG_WARNING("Failed to load async gameobject WMO: ", result.modelPath);
+                continue;
+            }
+            wmoRenderer->setPredecodedBLPCache(nullptr);
+            gameObjectDisplayIdWmoCache_[result.displayId] = modelId;
+        }
+
+        glm::vec3 renderPos = core::coords::canonicalToRender(
+            glm::vec3(result.x, result.y, result.z));
+        uint32_t instanceId = wmoRenderer->createInstance(
+            modelId, renderPos, glm::vec3(0.0f, 0.0f, result.orientation), 1.0f);
+        if (instanceId == 0) continue;
+
+        gameObjectInstances_[result.guid] = {modelId, instanceId, true};
+
+        // Queue transport doodad loading if applicable
+        std::string lowerPath = result.modelPath;
+        std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        if (lowerPath.find("transport") != std::string::npos) {
+            const auto* doodadTemplates = wmoRenderer->getDoodadTemplates(modelId);
+            if (doodadTemplates && !doodadTemplates->empty()) {
+                PendingTransportDoodadBatch batch;
+                batch.guid = result.guid;
+                batch.modelId = modelId;
+                batch.instanceId = instanceId;
+                batch.x = result.x;
+                batch.y = result.y;
+                batch.z = result.z;
+                batch.orientation = result.orientation;
+                batch.doodadBudget = doodadTemplates->size();
+                pendingTransportDoodadBatches_.push_back(batch);
+            }
+        }
+    }
+}
+
 void Application::processGameObjectSpawnQueue() {
+    // Finalize any completed async WMO loads first
+    processAsyncGameObjectResults();
+
     if (pendingGameObjectSpawns_.empty()) return;
 
-    // Only spawn 1 game object per frame — each can involve heavy synchronous
-    // WMO loading (root + groups from disk + GPU upload), easily 100ms+.
-    auto& s = pendingGameObjectSpawns_.front();
-    spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
-    pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
+    // Process spawns: uncached WMOs go async while fewer than kMaxAsyncLoads are in flight; everything else goes sync
+    auto startTime = std::chrono::steady_clock::now();
+    static constexpr float kBudgetMs = 2.0f;
+    static constexpr int kMaxAsyncLoads = 2;
+
+    while (!pendingGameObjectSpawns_.empty()) {
+        float elapsedMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - startTime).count();
+        if (elapsedMs >= kBudgetMs) break;
+
+        auto& s = pendingGameObjectSpawns_.front();
+
+        // Check if this is an uncached WMO that needs async loading
+        std::string modelPath;
+        if (gameObjectLookupsBuilt_) {
+            // Check transport overrides first
+            bool isTransport = gameHandler && gameHandler->isTransportGuid(s.guid);
+            if (isTransport) {
+                if (s.entry == 20808 || s.entry == 176231 || s.entry == 176310)
+                    modelPath = "World\\wmo\\transports\\transport_ship\\transportship.wmo";
+                else if (s.displayId == 807 || s.displayId == 808 || s.displayId == 175080 || s.displayId == 176495 || s.displayId == 164871)
+                    modelPath = "World\\wmo\\transports\\transport_zeppelin\\transport_zeppelin.wmo";
+                else if (s.displayId == 1587)
+                    modelPath = "World\\wmo\\transports\\transport_horde_zeppelin\\Transport_Horde_Zeppelin.wmo";
+                else if (s.displayId == 2454 || s.displayId == 181688 || s.displayId == 190536)
+                    modelPath = "World\\wmo\\transports\\icebreaker\\Transport_Icebreaker_ship.wmo";
+            }
+            if (modelPath.empty())
+                modelPath = getGameObjectModelPathForDisplayId(s.displayId);
+        }
+
+        std::string lowerPath = modelPath;
+        std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        bool isWmo = lowerPath.size() >= 4 && lowerPath.substr(lowerPath.size() - 4) == ".wmo";
+        bool isCached = isWmo && gameObjectDisplayIdWmoCache_.count(s.displayId);
+
+        if (isWmo && !isCached && !modelPath.empty() &&
+            static_cast<int>(asyncGameObjectLoads_.size()) < kMaxAsyncLoads) {
+            // Launch async WMO load — file I/O + parse on background thread
+            auto* am = assetManager.get();
+            PendingGameObjectSpawn capture = s;
+            std::string capturePath = modelPath;
+            AsyncGameObjectLoad load;
+            load.future = std::async(std::launch::async,
+                [am, capture, capturePath]() -> PreparedGameObjectWMO {
+                    PreparedGameObjectWMO result;
+                    result.guid = capture.guid;
+                    result.entry = capture.entry;
+                    result.displayId = capture.displayId;
+                    result.x = capture.x;
+                    result.y = capture.y;
+                    result.z = capture.z;
+                    result.orientation = capture.orientation;
+                    result.modelPath = capturePath;
+                    result.isWmo = true;
+
+                    auto wmoData = am->readFile(capturePath);
+                    if (wmoData.empty()) return result;
+
+                    auto wmo = std::make_shared<pipeline::WMOModel>(
+                        pipeline::WMOLoader::load(wmoData));
+
+                    // Load groups
+                    if (wmo->nGroups > 0) {
+                        std::string basePath = capturePath;
+                        std::string ext;
+                        if (basePath.size() > 4) {
+                            ext = basePath.substr(basePath.size() - 4);
+                            basePath = basePath.substr(0, basePath.size() - 4);
+                        }
+                        for (uint32_t gi = 0; gi < wmo->nGroups; gi++) {
+                            char suffix[16];
+                            snprintf(suffix, sizeof(suffix), "_%03u%s", gi, ext.c_str());
+                            auto groupData = am->readFile(basePath + suffix);
+                            if (groupData.empty()) {
+                                snprintf(suffix, sizeof(suffix), "_%03u.wmo", gi);
+                                groupData = am->readFile(basePath + suffix);
+                            }
+                            if (!groupData.empty()) {
+                                pipeline::WMOLoader::loadGroup(groupData, *wmo, gi);
+                            }
+                        }
+                    }
+
+                    // Pre-decode WMO textures on background thread
+                    for (const auto& texPath : wmo->textures) {
+                        if (texPath.empty()) continue;
+                        std::string texKey = texPath;
+                        size_t nul = texKey.find('\0');
+                        if (nul != std::string::npos) texKey.resize(nul);
+                        std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                        std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                        if (texKey.empty()) continue;
+                        // Convert to .blp extension
+                        if (texKey.size() >= 4) {
+                            std::string ext = texKey.substr(texKey.size() - 4);
+                            if (ext == ".tga" || ext == ".dds") {
+                                texKey = texKey.substr(0, texKey.size() - 4) + ".blp";
+                            }
+                        }
+                        if (result.predecodedTextures.find(texKey) != result.predecodedTextures.end()) continue;
+                        auto blp = am->loadTexture(texKey);
+                        if (blp.isValid()) {
+                            result.predecodedTextures[texKey] = std::move(blp);
+                        }
+                    }
+
+                    result.wmoModel = wmo;
+                    result.valid = true;
+                    return result;
+                });
+            asyncGameObjectLoads_.push_back(std::move(load));
+            pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
+            continue;
+        }
+
+        // Sync spawn: cached WMO or M2 (cheap). NOTE(review): uncached WMOs also land here when async slots are full
+        spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
+        pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
+    }
 }
 
 void Application::processPendingTransportDoodads() {
diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp
index f735dd7d..040a301d 100644
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@@ -625,7 +625,18 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
         return whiteTexture_.get();
     }
 
-    auto blpImage = assetManager->loadTexture(key);
+    // Check pre-decoded BLP cache first (populated by background threads)
+    pipeline::BLPImage blpImage;
+    if (predecodedBLPCache_) {
+        auto pit = predecodedBLPCache_->find(key);
+        if (pit != predecodedBLPCache_->end()) {
+            blpImage = std::move(pit->second);
+            predecodedBLPCache_->erase(pit);
+        }
+    }
+    if (!blpImage.isValid()) {
+        blpImage = assetManager->loadTexture(key);
+    }
     if (!blpImage.isValid()) {
         // Return white fallback but don't cache the failure — allow retry
         // on next character load in case the asset becomes available.
@@ -1412,8 +1423,9 @@ uint32_t CharacterRenderer::createInstance(uint32_t modelId, const glm::vec3& po
     instance.scale = scale;
 
     // Initialize bone matrices to identity
-    auto& model = models[modelId].data;
-    instance.boneMatrices.resize(std::max(static_cast<size_t>(1), model.bones.size()), glm::mat4(1.0f));
+    auto& gpuRef = models[modelId];
+    instance.boneMatrices.resize(std::max(static_cast<size_t>(1), gpuRef.data.bones.size()), glm::mat4(1.0f));
+    instance.cachedModel = &gpuRef;
 
     uint32_t id = instance.id;
     instances[id] = std::move(instance);
@@ -1511,13 +1523,12 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
         if (distSq >= animUpdateRadiusSq) continue;
 
         // Always advance animation time (cheap)
-        auto modelIt = models.find(inst.modelId);
-        if (modelIt != models.end() && !modelIt->second.data.sequences.empty()) {
+        if (inst.cachedModel && !inst.cachedModel->data.sequences.empty()) {
             if (inst.currentSequenceIndex < 0) {
                 inst.currentSequenceIndex = 0;
-                inst.currentAnimationId = modelIt->second.data.sequences[0].id;
+                inst.currentAnimationId = inst.cachedModel->data.sequences[0].id;
             }
-            const auto& seq = modelIt->second.data.sequences[inst.currentSequenceIndex];
+            const auto& seq = inst.cachedModel->data.sequences[inst.currentSequenceIndex];
             inst.animationTime += deltaTime * 1000.0f;
             if (seq.duration > 0 && inst.animationTime >= static_cast<float>(seq.duration)) {
                 if (inst.animationLoop) {
@@ -1528,10 +1539,11 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
             }
         }
 
-        // Distance-tiered bone throttling: near=every frame, mid=every 3rd, far=every 6th
+        // Distance-tiered bone throttling: every frame up to 10, every 2nd beyond 10, every 4th beyond 20, every 8th beyond 40
         uint32_t boneInterval = 1;
-        if (distSq > 60.0f * 60.0f) boneInterval = 6;
-        else if (distSq > 30.0f * 30.0f) boneInterval = 3;
+        if (distSq > 40.0f * 40.0f) boneInterval = 8;
+        else if (distSq > 20.0f * 20.0f) boneInterval = 4;
+        else if (distSq > 10.0f * 10.0f) boneInterval = 2;
 
         inst.boneUpdateCounter++;
         bool needsBones = (inst.boneUpdateCounter >= boneInterval) || inst.boneMatrices.empty();
@@ -1615,11 +1627,8 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
 }
 
 void CharacterRenderer::updateAnimation(CharacterInstance& instance, float deltaTime) {
-    auto modelIt = models.find(instance.modelId);
-    if (modelIt == models.end()) {
-        return;
-    }
-    const auto& model = modelIt->second.data;
+    if (!instance.cachedModel) return;
+    const auto& model = instance.cachedModel->data;
 
     if (model.sequences.empty()) {
         return;
@@ -1732,7 +1741,8 @@ glm::quat CharacterRenderer::interpolateQuat(const pipeline::M2AnimationTrack& t
 // --- Bone transform calculation ---
 
 void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
-    auto& model = models[instance.modelId].data;
+    if (!instance.cachedModel) return;
+    auto& model = instance.cachedModel->data;
 
     if (model.bones.empty()) {
         return;
@@ -1833,9 +1843,8 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
             }
         }
 
-        auto modelIt = models.find(instance.modelId);
-        if (modelIt == models.end()) continue;
-        const auto& gpuModel = modelIt->second;
+        if (!instance.cachedModel) continue;
+        const auto& gpuModel = *instance.cachedModel;
 
         // Skip models without GPU buffers
         if (!gpuModel.vertexBuffer) continue;
@@ -2487,9 +2496,8 @@ void CharacterRenderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& light
         glm::vec3 diff = inst.position - shadowCenter;
         if (glm::dot(diff, diff) > shadowRadiusSq) continue;
 
-        auto modelIt = models.find(inst.modelId);
-        if (modelIt == models.end()) continue;
-        const M2ModelGPU& gpuModel = modelIt->second;
+        if (!inst.cachedModel) continue;
+        const M2ModelGPU& gpuModel = *inst.cachedModel;
         if (!gpuModel.vertexBuffer) continue;
 
         glm::mat4 modelMat = inst.hasOverrideModelMatrix
diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp
index c4e7a727..d455e494 100644
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@@ -1657,6 +1657,7 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
     instance.cachedIsInvisibleTrap = mdlRef.isInvisibleTrap;
     instance.cachedIsInstancePortal = mdlRef.isInstancePortal;
     instance.cachedIsValid = mdlRef.isValid();
+    instance.cachedModel = &mdlRef;
 
     // Initialize animation: play first sequence (usually Stand/Idle)
     const auto& mdl = mdlRef;
@@ -1748,6 +1749,7 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
     instance.cachedIsGroundDetail = mdl2.isGroundDetail;
     instance.cachedIsInvisibleTrap = mdl2.isInvisibleTrap;
     instance.cachedIsValid = mdl2.isValid();
+    instance.cachedModel = &mdl2;
 
     // Initialize animation
     if (mdl2.hasAnimation && !mdl2.disableAnimation && !mdl2.sequences.empty()) {
@@ -2026,9 +2028,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
         instance.animTime += dtMs * (instance.animSpeed - 1.0f);
 
         // For animation looping/variation, we need the actual model data.
-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
-        const M2ModelGPU& model = it->second;
+        if (!instance.cachedModel) continue;
+        const M2ModelGPU& model = *instance.cachedModel;
 
         // Validate sequence index
         if (instance.currentSequenceIndex < 0 ||
@@ -2084,6 +2085,14 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
         float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f);
         if (cullRadius > 0.0f && !updateFrustum.intersectsSphere(instance.position, paddedRadius)) continue;
 
+        // Distance-based frame skipping: update distant bones less frequently
+        uint32_t boneInterval = 1;
+        if (distSq > 200.0f * 200.0f) boneInterval = 8;
+        else if (distSq > 100.0f * 100.0f) boneInterval = 4;
+        else if (distSq > 50.0f * 50.0f) boneInterval = 2;
+        instance.frameSkipCounter++;
+        if ((instance.frameSkipCounter % boneInterval) != 0) continue;
+
         boneWorkIndices_.push_back(idx);
     }
 
@@ -2097,9 +2106,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
             for (size_t i : boneWorkIndices_) {
                 if (i >= instances.size()) continue;
                 auto& inst = instances[i];
-                auto mdlIt = models.find(inst.modelId);
-                if (mdlIt == models.end()) continue;
-                computeBoneMatrices(mdlIt->second, inst);
+                if (!inst.cachedModel) continue;
+                computeBoneMatrices(*inst.cachedModel, inst);
             }
         } else {
             // Parallel — dispatch across worker threads
@@ -2112,9 +2120,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
                 for (size_t i : boneWorkIndices_) {
                     if (i >= instances.size()) continue;
                     auto& inst = instances[i];
-                    auto mdlIt = models.find(inst.modelId);
-                    if (mdlIt == models.end()) continue;
-                    computeBoneMatrices(mdlIt->second, inst);
+                    if (!inst.cachedModel) continue;
+                    computeBoneMatrices(*inst.cachedModel, inst);
                 }
             } else {
                 const size_t chunkSize = animCount / numThreads;
@@ -2135,9 +2142,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
                                 size_t idx = boneWorkIndices_[j];
                                 if (idx >= instances.size()) continue;
                                 auto& inst = instances[idx];
-                                auto mdlIt = models.find(inst.modelId);
-                                if (mdlIt == models.end()) continue;
-                                computeBoneMatrices(mdlIt->second, inst);
+                                if (!inst.cachedModel) continue;
+                                computeBoneMatrices(*inst.cachedModel, inst);
                             }
                         }));
                     start = end;
@@ -2159,9 +2165,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
         glm::vec3 toCam = instance.position - cachedCamPos_;
         float distSq = glm::dot(toCam, toCam);
         if (distSq > cachedMaxRenderDistSq_) continue;
-        auto mdlIt = models.find(instance.modelId);
-        if (mdlIt == models.end()) continue;
-        emitParticles(instance, mdlIt->second, deltaTime);
+        if (!instance.cachedModel) continue;
+        emitParticles(instance, *instance.cachedModel, deltaTime);
         updateParticles(instance, deltaTime);
     }
 
@@ -2865,9 +2870,8 @@ void M2Renderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& lightSpaceMa
             glm::vec3 diff = instance.position - shadowCenter;
             if (glm::dot(diff, diff) > shadowRadiusSq) continue;
 
-            auto modelIt = models.find(instance.modelId);
-            if (modelIt == models.end()) continue;
-            const M2ModelGPU& model = modelIt->second;
+            if (!instance.cachedModel) continue;
+            const M2ModelGPU& model = *instance.cachedModel;
 
             // Filter: only draw foliage models in foliage pass, non-foliage in non-foliage pass
             if (model.shadowWindFoliage != foliagePass) continue;
@@ -2973,8 +2977,7 @@ std::vector<glm::vec3> M2Renderer::getWaterVegetationPositions(const glm::vec3&
     std::vector<glm::vec3> result;
     float maxDistSq = maxDist * maxDist;
     for (const auto& inst : instances) {
-        auto it = models.find(inst.modelId);
-        if (it == models.end() || !it->second.isWaterVegetation) continue;
+        if (!inst.cachedModel || !inst.cachedModel->isWaterVegetation) continue;
         glm::vec3 diff = inst.position - camPos;
         if (glm::dot(diff, diff) <= maxDistSq) {
             result.push_back(inst.position);
@@ -3085,9 +3088,8 @@ void M2Renderer::emitParticles(M2Instance& inst, const M2ModelGPU& gpu, float dt
 }
 
 void M2Renderer::updateParticles(M2Instance& inst, float dt) {
-    auto it = models.find(inst.modelId);
-    if (it == models.end()) return;
-    const auto& gpu = it->second;
+    if (!inst.cachedModel) return;
+    const auto& gpu = *inst.cachedModel;
 
     for (size_t i = 0; i < inst.particles.size(); ) {
         auto& p = inst.particles[i];
@@ -3162,9 +3164,8 @@ void M2Renderer::renderM2Particles(VkCommandBuffer cmd, VkDescriptorSet perFrame
 
     for (auto& inst : instances) {
         if (inst.particles.empty()) continue;
-        auto it = models.find(inst.modelId);
-        if (it == models.end()) continue;
-        const auto& gpu = it->second;
+        if (!inst.cachedModel) continue;
+        const auto& gpu = *inst.cachedModel;
 
         for (const auto& p : inst.particles) {
             if (p.emitterIndex < 0 || p.emitterIndex >= static_cast<int>(gpu.particleEmitters.size())) continue;
@@ -3549,9 +3550,13 @@ void M2Renderer::rebuildSpatialIndex() {
     particleInstanceIndices_.clear();
 
     for (size_t i = 0; i < instances.size(); i++) {
-        const auto& inst = instances[i];
+        auto& inst = instances[i];
         instanceIndexById[inst.id] = i;
 
+        // Re-cache model pointer (may have changed after model map modifications)
+        auto mdlIt = models.find(inst.modelId);
+        inst.cachedModel = (mdlIt != models.end()) ? &mdlIt->second : nullptr;
+
         // Rebuild dedup map (skip ground detail)
         if (!inst.cachedIsGroundDetail) {
             DedupKey dk{inst.modelId,
@@ -3684,8 +3689,18 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) {
         containsToken(key, "campfire") ||
         containsToken(key, "bonfire");
 
-    // Load BLP texture
-    pipeline::BLPImage blp = assetManager->loadTexture(key);
+    // Check pre-decoded BLP cache first (populated by background worker threads)
+    pipeline::BLPImage blp;
+    if (predecodedBLPCache_) {
+        auto pit = predecodedBLPCache_->find(key);
+        if (pit != predecodedBLPCache_->end()) {
+            blp = std::move(pit->second);
+            predecodedBLPCache_->erase(pit);
+        }
+    }
+    if (!blp.isValid()) {
+        blp = assetManager->loadTexture(key);
+    }
     if (!blp.isValid()) {
         // Return white fallback but don't cache the failure — MPQ reads can
         // fail transiently during streaming; allow retry on next model load.
@@ -3751,9 +3766,8 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) {
 uint32_t M2Renderer::getTotalTriangleCount() const {
     uint32_t total = 0;
     for (const auto& instance : instances) {
-        auto it = models.find(instance.modelId);
-        if (it != models.end()) {
-            total += it->second.indexCount / 3;
+        if (instance.cachedModel) {
+            total += instance.cachedModel->indexCount / 3;
         }
     }
     return total;
@@ -3775,11 +3789,10 @@ std::optional<float> M2Renderer::getFloorHeight(float glX, float glY, float glZ,
             continue;
         }
 
-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
+        if (!instance.cachedModel) continue;
         if (instance.scale <= 0.001f) continue;
 
-        const M2ModelGPU& model = it->second;
+        const M2ModelGPU& model = *instance.cachedModel;
         if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
         if (instance.skipCollision) continue;
 
@@ -3931,10 +3944,9 @@ bool M2Renderer::checkCollision(const glm::vec3& from, const glm::vec3& to,
         if (from.z > instance.worldBoundsMax.z + 2.5f && adjustedPos.z > instance.worldBoundsMax.z + 2.5f) continue;
         if (from.z + 2.5f < instance.worldBoundsMin.z && adjustedPos.z + 2.5f < instance.worldBoundsMin.z) continue;
 
-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
+        if (!instance.cachedModel) continue;
 
-        const M2ModelGPU& model = it->second;
+        const M2ModelGPU& model = *instance.cachedModel;
         if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
         if (instance.skipCollision) continue;
         if (instance.scale <= 0.001f) continue;
@@ -4172,10 +4184,9 @@ float M2Renderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3&
             continue;
         }
 
-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
+        if (!instance.cachedModel) continue;
 
-        const M2ModelGPU& model = it->second;
+        const M2ModelGPU& model = *instance.cachedModel;
         if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
         glm::vec3 localMin, localMax;
         getTightCollisionBounds(model, localMin, localMax);
diff --git a/src/rendering/renderer.cpp b/src/rendering/renderer.cpp
index 69bfecdb..55ba1370 100644
--- a/src/rendering/renderer.cpp
+++ b/src/rendering/renderer.cpp
@@ -2434,6 +2434,9 @@ void Renderer::update(float deltaTime) {
         cameraController->update(deltaTime);
         auto cameraEnd = std::chrono::steady_clock::now();
         lastCameraUpdateMs = std::chrono::duration<double, std::milli>(cameraEnd - cameraStart).count();
+        if (lastCameraUpdateMs > 3.0) {
+            LOG_WARNING("SLOW cameraController->update: ", lastCameraUpdateMs, "ms");
+        }
 
         // Update 3D audio listener position/orientation to match camera
         if (camera) {
@@ -2779,8 +2782,15 @@ void Renderer::update(float deltaTime) {
 
     // Update M2 doodad animations (pass camera for frustum-culling bone computation)
     if (m2Renderer && camera) {
+        auto m2Start = std::chrono::steady_clock::now();
         m2Renderer->update(deltaTime, camera->getPosition(),
                            camera->getProjectionMatrix() * camera->getViewMatrix());
+        float m2Ms = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - m2Start).count();
+        if (m2Ms > 3.0f) {
+            LOG_WARNING("SLOW m2Renderer->update: ", m2Ms, "ms (",
+                        m2Renderer->getInstanceCount(), " instances)");
+        }
     }
 
     // Helper: play zone music, dispatching local files (file: prefix) vs MPQ paths
diff --git a/src/rendering/terrain_manager.cpp b/src/rendering/terrain_manager.cpp
index 20a2e9a1..97527c8c 100644
--- a/src/rendering/terrain_manager.cpp
+++ b/src/rendering/terrain_manager.cpp
@@ -231,9 +231,14 @@ bool TerrainManager::loadTile(int x, int y) {
         return false;
     }
 
+    VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
+    if (vkCtx) vkCtx->beginUploadBatch();
+
     FinalizingTile ft;
     ft.pending = std::move(pending);
     while (!advanceFinalization(ft)) {}
+
+    if (vkCtx) vkCtx->endUploadBatchSync();  // Sync — caller expects tile ready
     return true;
 }
 
@@ -407,6 +412,20 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
             return false;
         }
 
+        // Pre-decode M2 model textures on background thread
+        for (const auto& tex : m2Model.textures) {
+            if (tex.filename.empty()) continue;
+            std::string texKey = tex.filename;
+            std::replace(texKey.begin(), texKey.end(), '/', '\\');
+            std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+            if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue;
+            auto blp = assetManager->loadTexture(texKey);
+            if (blp.isValid()) {
+                pending->preloadedM2Textures[texKey] = std::move(blp);
+            }
+        }
+
         PendingTile::M2Ready ready;
         ready.modelId = modelId;
         ready.model = std::move(m2Model);
@@ -584,6 +603,20 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
                                 pipeline::M2Loader::loadSkin(skinData, m2Model);
                             }
                             if (!m2Model.isValid()) continue;
+
+                            // Pre-decode doodad M2 textures on background thread
+                            for (const auto& tex : m2Model.textures) {
+                                if (tex.filename.empty()) continue;
+                                std::string texKey = tex.filename;
+                                std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                                std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                               [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                                if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue;
+                                auto blp = assetManager->loadTexture(texKey);
+                                if (blp.isValid()) {
+                                    pending->preloadedM2Textures[texKey] = std::move(blp);
+                                }
+                            }
                         }
 
                         // Build doodad's local transform (WoW coordinates)
@@ -654,6 +687,32 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
                     }
                 }
 
+                // Pre-decode WMO textures on background thread
+                for (const auto& texPath : wmoModel.textures) {
+                    if (texPath.empty()) continue;
+                    std::string texKey = texPath;
+                    // Truncate at NUL (WMO paths can have stray bytes)
+                    size_t nul = texKey.find('\0');
+                    if (nul != std::string::npos) texKey.resize(nul);
+                    std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                    std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                    if (texKey.empty()) continue;
+                    // Resolve the .blp variant first: the cache is keyed by blpKey, so dedup must use it too
+                    std::string blpKey = texKey;
+                    if (blpKey.size() >= 4) {
+                        std::string ext = blpKey.substr(blpKey.size() - 4);
+                        if (ext == ".tga" || ext == ".dds") {
+                            blpKey = blpKey.substr(0, blpKey.size() - 4) + ".blp";
+                        }
+                    }
+                    if (pending->preloadedWMOTextures.find(blpKey) != pending->preloadedWMOTextures.end()) continue;
+                    auto blp = assetManager->loadTexture(blpKey);
+                    if (blp.isValid()) {
+                        pending->preloadedWMOTextures[blpKey] = std::move(blp);
+                    }
+                }
+
                 PendingTile::WMOReady ready;
                 // Cache WMO model uploads by path; placement dedup uses uniqueId separately.
                 ready.modelId = static_cast<uint32_t>(std::hash<std::string>{}(wmoPath));
@@ -741,7 +800,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
             }
             bool allDone = terrainRenderer->loadTerrainIncremental(
                 pending->mesh, pending->terrain.textures, x, y,
-                ft.terrainChunkNext, 64);
+                ft.terrainChunkNext, 32);
             if (!allDone) {
                 return false; // More chunks remain — yield to time budget
             }
@@ -773,7 +832,9 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
     case FinalizationPhase::M2_MODELS: {
         // Upload multiple M2 models per call (batched GPU uploads)
         if (m2Renderer && ft.m2ModelIndex < pending->m2Models.size()) {
-            constexpr size_t kModelsPerStep = 8;
+            // Set pre-decoded BLP cache so loadTexture() skips main-thread BLP decode
+            m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
+            constexpr size_t kModelsPerStep = 4;
             size_t uploaded = 0;
             while (ft.m2ModelIndex < pending->m2Models.size() && uploaded < kModelsPerStep) {
                 auto& m2Ready = pending->m2Models[ft.m2ModelIndex];
@@ -786,6 +847,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
                 ft.m2ModelIndex++;
                 uploaded++;
             }
+            m2Renderer->setPredecodedBLPCache(nullptr);
             // Stay in this phase until all models uploaded
             if (ft.m2ModelIndex < pending->m2Models.size()) {
                 return false;
@@ -830,8 +892,11 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
         // Upload multiple WMO models per call (batched GPU uploads)
         if (wmoRenderer && assetManager) {
             wmoRenderer->initialize(nullptr, VK_NULL_HANDLE, assetManager);
+            // Set pre-decoded BLP cache and defer normal maps during streaming
+            wmoRenderer->setPredecodedBLPCache(&pending->preloadedWMOTextures);
+            wmoRenderer->setDeferNormalMaps(true);
 
-            constexpr size_t kWmosPerStep = 4;
+            constexpr size_t kWmosPerStep = 1;
             size_t uploaded = 0;
             while (ft.wmoModelIndex < pending->wmoModels.size() && uploaded < kWmosPerStep) {
                 auto& wmoReady = pending->wmoModels[ft.wmoModelIndex];
@@ -843,6 +908,8 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
                     uploaded++;
                 }
             }
+            wmoRenderer->setDeferNormalMaps(false);
+            wmoRenderer->setPredecodedBLPCache(nullptr);
             if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
         }
         ft.phase = FinalizationPhase::WMO_INSTANCES;
@@ -906,7 +973,9 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
     case FinalizationPhase::WMO_DOODADS: {
         // Upload multiple WMO doodad M2s per call (batched GPU uploads)
         if (m2Renderer && ft.wmoDoodadIndex < pending->wmoDoodads.size()) {
-            constexpr size_t kDoodadsPerStep = 16;
+            // Set pre-decoded BLP cache for doodad M2 textures
+            m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
+            constexpr size_t kDoodadsPerStep = 4;
             size_t uploaded = 0;
             while (ft.wmoDoodadIndex < pending->wmoDoodads.size() && uploaded < kDoodadsPerStep) {
                 auto& doodad = pending->wmoDoodads[ft.wmoDoodadIndex];
@@ -923,6 +992,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
                 ft.wmoDoodadIndex++;
                 uploaded++;
             }
+            m2Renderer->setPredecodedBLPCache(nullptr);
             if (ft.wmoDoodadIndex < pending->wmoDoodads.size()) return false;
         }
         ft.phase = FinalizationPhase::WATER;
@@ -1080,11 +1150,6 @@ void TerrainManager::workerLoop() {
 }
 
 void TerrainManager::processReadyTiles() {
-    // Process tiles with time budget to avoid frame spikes
-    // Taxi mode gets a slightly larger budget to avoid visible late-pop terrain/models.
-    const float timeBudgetMs = taxiStreamingMode_ ? 8.0f : 3.0f;
-    auto startTime = std::chrono::high_resolution_clock::now();
-
     // Move newly ready tiles into the finalizing deque.
     // Keep them in pendingTiles so streamTiles() won't re-enqueue them.
     {
@@ -1100,28 +1165,32 @@ void TerrainManager::processReadyTiles() {
         }
     }
 
-    // Outer upload batch: all GPU uploads across all advanceFinalization calls
-    // this frame share a single command buffer submission + fence wait.
     VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
+
+    // Reclaim completed async uploads from previous frames (non-blocking)
+    if (vkCtx) vkCtx->pollUploadBatches();
+
+    // Nothing to finalize — done.
+    if (finalizingTiles_.empty()) return;
+
+    // Async upload batch: record GPU copies into a command buffer, submit with
+    // a fence, but DON'T wait.  The fence is polled on subsequent frames.
+    // This eliminates the main-thread stall from vkWaitForFences entirely.
+    const int maxSteps = taxiStreamingMode_ ? 8 : 2;
+    int steps = 0;
+
     if (vkCtx) vkCtx->beginUploadBatch();
 
-    // Drive incremental finalization within time budget
-    while (!finalizingTiles_.empty()) {
+    while (!finalizingTiles_.empty() && steps < maxSteps) {
         auto& ft = finalizingTiles_.front();
         bool done = advanceFinalization(ft);
-
         if (done) {
             finalizingTiles_.pop_front();
         }
-
-        auto now = std::chrono::high_resolution_clock::now();
-        float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
-        if (elapsedMs >= timeBudgetMs) {
-            break;
-        }
+        steps++;
     }
 
-    if (vkCtx) vkCtx->endUploadBatch();
+    if (vkCtx) vkCtx->endUploadBatch();  // Async — submits but doesn't wait
 }
 
 void TerrainManager::processAllReadyTiles() {
@@ -1151,7 +1220,7 @@ void TerrainManager::processAllReadyTiles() {
         finalizingTiles_.pop_front();
     }
 
-    if (vkCtx) vkCtx->endUploadBatch();
+    if (vkCtx) vkCtx->endUploadBatchSync();  // Sync — load screen needs data ready
 }
 
 void TerrainManager::processOneReadyTile() {
@@ -1177,7 +1246,7 @@ void TerrainManager::processOneReadyTile() {
         while (!advanceFinalization(ft)) {}
         finalizingTiles_.pop_front();
 
-        if (vkCtx) vkCtx->endUploadBatch();
+        if (vkCtx) vkCtx->endUploadBatchSync();  // Sync — load screen needs data ready
     }
 }
 
diff --git a/src/rendering/vk_context.cpp b/src/rendering/vk_context.cpp
index dc73c685..79e7eac3 100644
--- a/src/rendering/vk_context.cpp
+++ b/src/rendering/vk_context.cpp
@@ -67,6 +67,14 @@ void VkContext::shutdown() {
         frame = {};
     }
 
+    // Clean up any in-flight async upload batches (device already idle)
+    for (auto& batch : inFlightBatches_) {
+        for (auto& staging : batch.stagingBuffers) destroyBuffer(allocator, staging);  // free before allocator/device teardown
+        vkDestroyFence(device, batch.fence, nullptr);
+        // Command buffer freed when pool is destroyed below
+    }
+    inFlightBatches_.clear();
+
     if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; }
     if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; }
 
@@ -1447,17 +1455,94 @@ void VkContext::endUploadBatch() {
 
     inUploadBatch_ = false;
 
-    // Submit all recorded commands with a single fence wait
+    if (batchStagingBuffers_.empty()) {
+        // No GPU copies were recorded — skip the submit entirely.
+        vkEndCommandBuffer(batchCmd_);
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
+        batchCmd_ = VK_NULL_HANDLE;
+        return;
+    }
+
+    // Submit commands with a NEW fence — don't wait, let GPU work in parallel.
+    vkEndCommandBuffer(batchCmd_);
+
+    VkFenceCreateInfo fenceInfo{};
+    fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+    VkFence fence = VK_NULL_HANDLE;
+    vkCreateFence(device, &fenceInfo, nullptr, &fence);
+
+    VkSubmitInfo submitInfo{};
+    submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    submitInfo.commandBufferCount = 1;
+    submitInfo.pCommandBuffers = &batchCmd_;
+    vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence);
+
+    // Stash everything for later cleanup when fence signals
+    InFlightBatch batch;
+    batch.fence = fence;
+    batch.cmd = batchCmd_;
+    batch.stagingBuffers = std::move(batchStagingBuffers_);
+    inFlightBatches_.push_back(std::move(batch));
+
+    batchCmd_ = VK_NULL_HANDLE;
+    batchStagingBuffers_.clear();
+}
+
+void VkContext::endUploadBatchSync() {
+    if (uploadBatchDepth_ <= 0) return;
+    uploadBatchDepth_--;
+    if (uploadBatchDepth_ > 0) return;
+
+    inUploadBatch_ = false;
+
+    if (batchStagingBuffers_.empty()) {
+        vkEndCommandBuffer(batchCmd_);
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
+        batchCmd_ = VK_NULL_HANDLE;
+        return;
+    }
+
+    // Synchronous path for load screens — submit and wait
     endSingleTimeCommands(batchCmd_);
     batchCmd_ = VK_NULL_HANDLE;
 
-    // Destroy all deferred staging buffers
     for (auto& staging : batchStagingBuffers_) {
         destroyBuffer(allocator, staging);
     }
     batchStagingBuffers_.clear();
 }
 
+// Non-blocking reclaim: frees the resources of every async batch whose fence has signaled.
+void VkContext::pollUploadBatches() {
+    auto batchIt = inFlightBatches_.begin();
+    while (batchIt != inFlightBatches_.end()) {
+        // Any status other than VK_SUCCESS leaves the batch queued for a later poll.
+        if (vkGetFenceStatus(device, batchIt->fence) != VK_SUCCESS) {
+            ++batchIt;
+            continue;
+        }
+        // GPU finished: release staging buffers, then the command buffer and fence.
+        for (auto& stagingBuf : batchIt->stagingBuffers) {
+            destroyBuffer(allocator, stagingBuf);
+        }
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchIt->cmd);
+        vkDestroyFence(device, batchIt->fence, nullptr);
+        batchIt = inFlightBatches_.erase(batchIt);
+    }
+}
+
+// Blocks until every in-flight async upload batch has completed, then frees its resources.
+void VkContext::waitAllUploads() {
+    for (auto& pendingBatch : inFlightBatches_) {
+        // Wait without timeout so the staging buffers are no longer in use when destroyed.
+        vkWaitForFences(device, 1, &pendingBatch.fence, VK_TRUE, UINT64_MAX);
+        for (auto& stagingBuf : pendingBatch.stagingBuffers) destroyBuffer(allocator, stagingBuf);
+        vkFreeCommandBuffers(device, immCommandPool, 1, &pendingBatch.cmd);
+        vkDestroyFence(device, pendingBatch.fence, nullptr);
+    }
+    inFlightBatches_.clear();
+}
+
 void VkContext::deferStagingCleanup(AllocatedBuffer staging) {
     batchStagingBuffers_.push_back(staging);
 }
diff --git a/src/rendering/wmo_renderer.cpp b/src/rendering/wmo_renderer.cpp
index 691abaa1..5dec0e3e 100644
--- a/src/rendering/wmo_renderer.cpp
+++ b/src/rendering/wmo_renderer.cpp
@@ -2325,13 +2325,27 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
     const auto& attemptedCandidates = uniqueCandidates;
 
     // Try loading all candidates until one succeeds
+    // Check pre-decoded BLP cache first (populated by background worker threads)
     pipeline::BLPImage blp;
     std::string resolvedKey;
-    for (const auto& c : attemptedCandidates) {
-        blp = assetManager->loadTexture(c);
-        if (blp.isValid()) {
-            resolvedKey = c;
-            break;
+    if (predecodedBLPCache_) {
+        for (const auto& c : uniqueCandidates) {
+            auto pit = predecodedBLPCache_->find(c);
+            if (pit != predecodedBLPCache_->end()) {
+                blp = std::move(pit->second);
+                predecodedBLPCache_->erase(pit);
+                resolvedKey = c;
+                break;
+            }
+        }
+    }
+    if (!blp.isValid()) {
+        for (const auto& c : attemptedCandidates) {
+            blp = assetManager->loadTexture(c);
+            if (blp.isValid()) {
+                resolvedKey = c;
+                break;
+            }
         }
     }
     if (!blp.isValid()) {
@@ -2369,10 +2383,10 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
     texture->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
                             VK_SAMPLER_ADDRESS_MODE_REPEAT);
 
-    // Generate normal+height map from diffuse pixels
+    // Generate normal+height map from diffuse pixels (skip during streaming to avoid CPU stalls)
     float nhVariance = 0.0f;
     std::unique_ptr<VkTexture> nhMap;
-    if (normalMappingEnabled_ || pomEnabled_) {
+    if ((normalMappingEnabled_ || pomEnabled_) && !deferNormalMaps_) {
         nhMap = generateNormalHeightMap(blp.data.data(), blp.width, blp.height, nhVariance);
         if (nhMap) {
             approxBytes *= 2;  // account for normal map in budget