Background BLP texture pre-decoding + deferred WMO normal maps (12x streaming perf)

Move CPU-heavy BLP texture decoding from main thread to background worker threads for all hot paths: terrain M2 models, WMO doodad M2s, WMO textures, creature models, and gameobject WMOs. Each renderer (M2, WMO, Character) now accepts a pre-decoded BLP cache that loadTexture() checks before falling back to synchronous decode. Defer WMO normal/height map generation (3 per-pixel passes: luminance, box blur, Sobel) during terrain streaming finalization — this was the dominant remaining bottleneck after BLP pre-decoding. Terrain streaming stalls: 1576ms → 124ms worst case.
2026-05-02 15:53:51 +00:00 · 2026-03-07 15:46:56 -08:00 · 2026-03-07 15:46:56 -08:00 · 7ac990cff4
commit 7ac990cff4
parent 0313bd8692
13 changed files with 573 additions and 109 deletions
--- a/include/core/application.hpp
+++ b/include/core/application.hpp
@ -3,6 +3,7 @@
 #include "core/window.hpp"
 #include "core/input.hpp"
 #include "game/character.hpp"
+#include "pipeline/blp_loader.hpp"
 #include <memory>
 #include <string>
 #include <vector>
@ -23,7 +24,7 @@ namespace rendering { class Renderer; }
 namespace ui { class UIManager; }
 namespace auth { class AuthHandler; }
 namespace game { class GameHandler; class World; class ExpansionRegistry; }
-namespace pipeline { class AssetManager; class DBCLayout; struct M2Model; }
+namespace pipeline { class AssetManager; class DBCLayout; struct M2Model; struct WMOModel; }
 namespace audio { enum class VoiceType; }

 namespace core {
@ -206,6 +207,7 @@ private:
        uint32_t modelId;
        float x, y, z, orientation;
        std::shared_ptr<pipeline::M2Model> model; // parsed on background thread
+        std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures; // decoded on bg thread
        bool valid = false;
        bool permanent_failure = false;
    };
@ -337,6 +339,24 @@ private:
    };
    std::vector<PendingGameObjectSpawn> pendingGameObjectSpawns_;
    void processGameObjectSpawnQueue();
+
+    // Async WMO loading for game objects (file I/O + parse on background thread)
+    struct PreparedGameObjectWMO {  // Result of one background WMO load; consumed by processAsyncGameObjectResults()
+        uint64_t guid;              // copied from the pending spawn request
+        uint32_t entry;             // copied from the pending spawn request
+        uint32_t displayId;         // also the key into the display-ID -> WMO model-ID cache
+        float x, y, z, orientation; // spawn placement, copied from the pending spawn request
+        std::shared_ptr<pipeline::WMOModel> wmoModel;  // root + groups parsed on the worker; stays null if the root file could not be read
+        std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures; // decoded on bg thread; keys are lowercased, backslash-separated paths with .tga/.dds rewritten to .blp
+        bool valid = false;         // set to true only after the root file was read and the loader ran to completion
+        bool isWmo = false;         // set to true by the async loader before any file I/O
+        std::string modelPath;      // path the load was started with (original case)
+    };
+    struct AsyncGameObjectLoad {  // One in-flight background WMO load
+        std::future<PreparedGameObjectWMO> future;  // created by std::async; polled with a zero timeout in processAsyncGameObjectResults()
+    };
+    std::vector<AsyncGameObjectLoad> asyncGameObjectLoads_;
+    void processAsyncGameObjectResults();
    struct PendingTransportDoodadBatch {
        uint64_t guid = 0;
        uint32_t modelId = 0;
--- a/include/rendering/character_renderer.hpp
+++ b/include/rendering/character_renderer.hpp
@ -1,6 +1,7 @@
 #pragma once

 #include "pipeline/m2_loader.hpp"
+#include "pipeline/blp_loader.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <glm/glm.hpp>
@ -114,7 +115,11 @@ public:
    void setShadowMap(VkTexture*, const glm::mat4&) {}
    void clearShadowMap() {}

+    // Pre-decoded BLP cache: set before calling loadModel() to skip main-thread BLP decode; pass nullptr afterwards to detach it.
+    void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }  // non-owning pointer; loadTexture() moves a matching entry out of the map and erases it
+
 private:
+    std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
    // GPU representation of M2 model
    struct M2ModelGPU {
        VkBuffer vertexBuffer = VK_NULL_HANDLE;
@ -180,6 +185,7 @@ private:

        // Bone update throttling (skip frames for distant characters)
        uint32_t boneUpdateCounter = 0;
+        const M2ModelGPU* cachedModel = nullptr;  // Avoid per-frame hash lookups

        // Per-instance bone SSBO (double-buffered per frame)
        VkBuffer boneBuffer[2] = {};
--- a/include/rendering/m2_renderer.hpp
+++ b/include/rendering/m2_renderer.hpp
@ -1,6 +1,7 @@
 #pragma once

 #include "pipeline/m2_loader.hpp"
+#include "pipeline/blp_loader.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <glm/glm.hpp>
@ -188,6 +189,7 @@ struct M2Instance {
    bool skipCollision = false;    // WMO interior doodads — skip player wall collision
    float cachedBoundRadius = 0.0f;
    float portalSpinAngle = 0.0f;  // Accumulated spin angle for portal rotation
+    const M2ModelGPU* cachedModel = nullptr;  // Avoid per-frame hash lookups

    // Frame-skip optimization (update distant animations less frequently)
    uint8_t frameSkipCounter = 0;
@ -328,6 +330,10 @@ public:

    std::vector<glm::vec3> getWaterVegetationPositions(const glm::vec3& camPos, float maxDist) const;

+    // Pre-decoded BLP cache: set by terrain manager before calling loadModel()
+    // so loadTexture() can skip the expensive assetManager->loadTexture() call. Pass nullptr to detach.
+    void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }  // non-owning pointer; NOTE(review): presumably the map must stay alive until it is detached — confirm in loadTexture()
+
 private:
    bool initialized_ = false;
    bool insideInterior = false;
@ -414,6 +420,8 @@ private:
    uint32_t modelLimitRejectWarnings_ = 0;

    VkTexture* loadTexture(const std::string& path, uint32_t texFlags = 0);
+    std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
+
    struct TextureCacheEntry {
        std::unique_ptr<VkTexture> texture;
        size_t approxBytes = 0;
--- a/include/rendering/terrain_manager.hpp
+++ b/include/rendering/terrain_manager.hpp
@ -121,6 +121,12 @@ struct PendingTile {
    // Pre-loaded terrain texture BLP data (loaded on background thread to avoid
    // blocking file I/O on the main thread during finalizeTile)
    std::unordered_map<std::string, pipeline::BLPImage> preloadedTextures;
+
+    // Pre-decoded M2 model textures (decoded on background thread)
+    std::unordered_map<std::string, pipeline::BLPImage> preloadedM2Textures;
+
+    // Pre-decoded WMO textures (decoded on background thread)
+    std::unordered_map<std::string, pipeline::BLPImage> preloadedWMOTextures;
 };

 /**
--- a/include/rendering/vk_context.hpp
+++ b/include/rendering/vk_context.hpp
@ -50,9 +50,12 @@ public:
    // Batch upload mode: records multiple upload commands into a single
    // command buffer, then submits with ONE fence wait instead of one per upload.
    void beginUploadBatch();
-    void endUploadBatch();
+    void endUploadBatch();       // Async: submits but does NOT wait for fence
+    void endUploadBatchSync();   // Sync: submits and waits (for load screens)
    bool isInUploadBatch() const { return inUploadBatch_; }
    void deferStagingCleanup(AllocatedBuffer staging);
+    void pollUploadBatches();    // Check completed async uploads, free staging buffers
+    void waitAllUploads();       // Block until all in-flight uploads complete

    // Accessors
    VkInstance getInstance() const { return instance; }
@ -157,6 +160,14 @@ private:
    VkCommandBuffer batchCmd_ = VK_NULL_HANDLE;
    std::vector<AllocatedBuffer> batchStagingBuffers_;

+    // Async upload: in-flight batches awaiting GPU completion
+    struct InFlightBatch {  // One submitted upload batch that has not yet been reclaimed
+        VkFence fence = VK_NULL_HANDLE;               // presumably signaled when this batch's submission finishes — TODO confirm in vk_context.cpp
+        VkCommandBuffer cmd = VK_NULL_HANDLE;         // presumably the command buffer the batch was recorded into — TODO confirm
+        std::vector<AllocatedBuffer> stagingBuffers;  // staging buffers held for this batch; pollUploadBatches() is documented as freeing them once uploads complete
+    };
+    std::vector<InFlightBatch> inFlightBatches_;
+
    // Depth buffer (shared across all framebuffers)
    VkImage depthImage = VK_NULL_HANDLE;
    VkImageView depthImageView = VK_NULL_HANDLE;
--- a/include/rendering/wmo_renderer.hpp
+++ b/include/rendering/wmo_renderer.hpp
@ -1,5 +1,6 @@
 #pragma once

+#include "pipeline/blp_loader.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <glm/glm.hpp>
@ -325,6 +326,12 @@ public:
    // Pre-compute floor cache for all loaded WMO instances
    void precomputeFloorCache();

+    // Pre-decoded BLP cache: set before calling loadModel() to skip main-thread BLP decode; pass nullptr afterwards to detach it.
+    void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }  // non-owning pointer; callers in application.cpp reset it to nullptr right after loadModel()
+
+    // Defer normal/height map generation during streaming to avoid CPU stalls (flag defaults to false).
+    void setDeferNormalMaps(bool defer) { deferNormalMaps_ = defer; }  // only stores the flag; NOTE(review): where deferred maps are generated later is not visible here
+
 private:
    // WMO material UBO — matches WMOMaterial in wmo.frag.glsl
    struct WMOMaterialUBO {
@ -558,6 +565,7 @@ private:
     * Load a texture from path
     */
    VkTexture* loadTexture(const std::string& path);
+    std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;

    /**
     * Generate normal+height map from diffuse RGBA8 pixels
@ -670,6 +678,7 @@ private:

    // Normal mapping / POM settings
    bool normalMappingEnabled_ = true;   // on by default
+    bool deferNormalMaps_ = false;       // skip normal map gen during streaming
    float normalMapStrength_ = 0.8f;     // 0.0 = flat, 1.0 = full, 2.0 = exaggerated
    bool pomEnabled_ = true;             // on by default
    int pomQuality_ = 1;                 // 0=Low(16), 1=Medium(32), 2=High(64)
--- a/src/core/application.cpp
+++ b/src/core/application.cpp
@ -6883,7 +6883,7 @@ void Application::spawnOnlineGameObject(uint64_t guid, uint32_t entry, uint32_t
 void Application::processAsyncCreatureResults() {
    // Check completed async model loads and finalize on main thread (GPU upload + instance creation).
    // Limit GPU model uploads per frame to avoid spikes, but always drain cheap bookkeeping.
-    static constexpr int kMaxModelUploadsPerFrame = 3;
+    static constexpr int kMaxModelUploadsPerFrame = 1;
    int modelUploads = 0;

    for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) {
@ -6925,13 +6925,17 @@ void Application::processAsyncCreatureResults() {
        }

        // Upload model to GPU (must happen on main thread)
+        // Use pre-decoded BLP cache to skip main-thread texture decode
+        charRenderer->setPredecodedBLPCache(&result.predecodedTextures);
        if (!charRenderer->loadModel(*result.model, result.modelId)) {
+            charRenderer->setPredecodedBLPCache(nullptr);
            nonRenderableCreatureDisplayIds_.insert(result.displayId);
            creaturePermanentFailureGuids_.insert(result.guid);
            pendingCreatureSpawnGuids_.erase(result.guid);
            creatureSpawnRetryCounts_.erase(result.guid);
            continue;
        }
+        charRenderer->setPredecodedBLPCache(nullptr);
        displayIdModelCache_[result.displayId] = result.modelId;
        modelUploads++;

@ -6956,6 +6960,10 @@ void Application::processAsyncCreatureResults() {
 }

 void Application::processCreatureSpawnQueue() {
+    auto startTime = std::chrono::steady_clock::now();
+    // Budget: max 2ms per frame for creature spawning to prevent stutter.
+    static constexpr float kSpawnBudgetMs = 2.0f;
+
    // First, finalize any async model loads that completed on background threads.
    processAsyncCreatureResults();

@ -6965,18 +6973,15 @@ void Application::processCreatureSpawnQueue() {
        if (!creatureLookupsBuilt_) return;
    }

-    auto startTime = std::chrono::steady_clock::now();
-    // Budget: max 4ms per frame for creature spawning to prevent stutter.
-    static constexpr float kSpawnBudgetMs = 4.0f;
-
    int processed = 0;
    int asyncLaunched = 0;
    size_t rotationsLeft = pendingCreatureSpawns_.size();
    while (!pendingCreatureSpawns_.empty() &&
           processed < MAX_SPAWNS_PER_FRAME &&
           rotationsLeft > 0) {
-        // Check time budget after each spawn (not for the first one, always process at least 1)
-        if (processed > 0) {
+        // Check time budget every iteration (including first — async results may
+        // have already consumed the budget via GPU model uploads).
+        {
            auto now = std::chrono::steady_clock::now();
            float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
            if (elapsedMs >= kSpawnBudgetMs) break;
@ -7081,6 +7086,20 @@ void Application::processCreatureSpawnQueue() {
                        }
                    }

+                    // Pre-decode model textures on background thread
+                    for (const auto& tex : model->textures) {
+                        if (tex.filename.empty()) continue;
+                        std::string texKey = tex.filename;
+                        std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                        std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                        if (result.predecodedTextures.find(texKey) != result.predecodedTextures.end()) continue;
+                        auto blp = am->loadTexture(texKey);
+                        if (blp.isValid()) {
+                            result.predecodedTextures[texKey] = std::move(blp);
+                        }
+                    }
+
                    result.model = std::move(model);
                    result.valid = true;
                    return result;
@ -7161,14 +7180,202 @@ void Application::processDeferredEquipmentQueue() {
    setOnlinePlayerEquipment(guid, equipData.first, equipData.second);
 }

+void Application::processAsyncGameObjectResults() {  // Main-thread finalization of background WMO loads: GPU upload, instance creation, transport doodad queueing
+    for (auto it = asyncGameObjectLoads_.begin(); it != asyncGameObjectLoads_.end(); ) {
+        if (!it->future.valid() ||
+            it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {  // zero-timeout poll, never blocks
+            ++it;  // NOTE(review): an entry whose future is not valid() is skipped here and never erased
+            continue;
+        }
+
+        auto result = it->future.get();
+        it = asyncGameObjectLoads_.erase(it);  // entry is removed before any outcome handling, so every path below is final for this load
+
+        if (!result.valid || !result.isWmo || !result.wmoModel) {
+            // Fallback: spawn via sync path (likely an M2 or failed WMO)
+            spawnOnlineGameObject(result.guid, result.entry, result.displayId,
+                                 result.x, result.y, result.z, result.orientation);
+            continue;
+        }
+
+        // WMO parsed on background thread — do GPU upload + instance creation on main thread
+        auto* wmoRenderer = renderer ? renderer->getWMORenderer() : nullptr;
+        if (!wmoRenderer) continue;  // result is dropped; the spawn is not re-queued
+
+        uint32_t modelId = 0;
+        auto itCache = gameObjectDisplayIdWmoCache_.find(result.displayId);
+        if (itCache != gameObjectDisplayIdWmoCache_.end()) {
+            modelId = itCache->second;  // another load of the same display ID finished first; reuse its model
+        } else {
+            modelId = nextGameObjectWmoModelId_++;  // NOTE(review): the ID is consumed even if loadModel() fails below
+            wmoRenderer->setPredecodedBLPCache(&result.predecodedTextures);
+            if (!wmoRenderer->loadModel(*result.wmoModel, modelId)) {
+                wmoRenderer->setPredecodedBLPCache(nullptr);  // detach before `result` goes out of scope
+                LOG_WARNING("Failed to load async gameobject WMO: ", result.modelPath);
+                continue;  // spawn is dropped; no retry is queued here
+            }
+            wmoRenderer->setPredecodedBLPCache(nullptr);
+            gameObjectDisplayIdWmoCache_[result.displayId] = modelId;
+        }
+
+        glm::vec3 renderPos = core::coords::canonicalToRender(
+            glm::vec3(result.x, result.y, result.z));
+        uint32_t instanceId = wmoRenderer->createInstance(
+            modelId, renderPos, glm::vec3(0.0f, 0.0f, result.orientation), 1.0f);
+        if (instanceId == 0) continue;  // 0 is treated as createInstance() failure
+
+        gameObjectInstances_[result.guid] = {modelId, instanceId, true};  // NOTE(review): third field presumably marks the instance as a WMO — confirm
+
+        // Queue transport doodad loading if applicable (detected by "transport" appearing in the lowercased model path)
+        std::string lowerPath = result.modelPath;
+        std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        if (lowerPath.find("transport") != std::string::npos) {
+            const auto* doodadTemplates = wmoRenderer->getDoodadTemplates(modelId);
+            if (doodadTemplates && !doodadTemplates->empty()) {
+                PendingTransportDoodadBatch batch;
+                batch.guid = result.guid;
+                batch.modelId = modelId;
+                batch.instanceId = instanceId;
+                batch.x = result.x;
+                batch.y = result.y;
+                batch.z = result.z;
+                batch.orientation = result.orientation;
+                batch.doodadBudget = doodadTemplates->size();  // budget starts at the full template count
+                pendingTransportDoodadBatches_.push_back(batch);
+            }
+        }
+    }
+}
+
 void Application::processGameObjectSpawnQueue() {
+    // Finalize any completed async WMO loads first (runs even when the pending queue is empty)
+    processAsyncGameObjectResults();
+
    if (pendingGameObjectSpawns_.empty()) return;

-    // Only spawn 1 game object per frame — each can involve heavy synchronous
-    // WMO loading (root + groups from disk + GPU upload), easily 100ms+.
-    auto& s = pendingGameObjectSpawns_.front();
-    spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
-    pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
+    // Process spawns: cached WMOs and M2s go sync (cheap); uncached WMOs go async while fewer than kMaxAsyncLoads loads are in flight
+    auto startTime = std::chrono::steady_clock::now();
+    static constexpr float kBudgetMs = 2.0f;  // time budget for this call; checked before each queue entry
+    static constexpr int kMaxAsyncLoads = 2;  // cap on simultaneously outstanding background WMO loads
+
+    while (!pendingGameObjectSpawns_.empty()) {
+        float elapsedMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - startTime).count();
+        if (elapsedMs >= kBudgetMs) break;  // remaining entries stay queued for a later call
+
+        auto& s = pendingGameObjectSpawns_.front();
+
+        // Check if this is an uncached WMO that needs async loading
+        std::string modelPath;  // stays empty when lookups are not built yet; such entries take the sync path below
+        if (gameObjectLookupsBuilt_) {
+            // Check transport overrides first
+            bool isTransport = gameHandler && gameHandler->isTransportGuid(s.guid);
+            if (isTransport) {
+                if (s.entry == 20808 || s.entry == 176231 || s.entry == 176310)
+                    modelPath = "World\\wmo\\transports\\transport_ship\\transportship.wmo";
+                else if (s.displayId == 807 || s.displayId == 808 || s.displayId == 175080 || s.displayId == 176495 || s.displayId == 164871)
+                    modelPath = "World\\wmo\\transports\\transport_zeppelin\\transport_zeppelin.wmo";
+                else if (s.displayId == 1587)
+                    modelPath = "World\\wmo\\transports\\transport_horde_zeppelin\\Transport_Horde_Zeppelin.wmo";
+                else if (s.displayId == 2454 || s.displayId == 181688 || s.displayId == 190536)
+                    modelPath = "World\\wmo\\transports\\icebreaker\\Transport_Icebreaker_ship.wmo";
+            }
+            if (modelPath.empty())
+                modelPath = getGameObjectModelPathForDisplayId(s.displayId);
+        }
+
+        std::string lowerPath = modelPath;
+        std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        bool isWmo = lowerPath.size() >= 4 && lowerPath.substr(lowerPath.size() - 4) == ".wmo";  // case-insensitive extension test
+        bool isCached = isWmo && gameObjectDisplayIdWmoCache_.count(s.displayId);
+
+        if (isWmo && !isCached && !modelPath.empty() &&
+            static_cast<int>(asyncGameObjectLoads_.size()) < kMaxAsyncLoads) {
+            // Launch async WMO load — file I/O + parse on background thread
+            auto* am = assetManager.get();  // NOTE(review): raw pointer is captured by the task; the asset manager must outlive it
+            PendingGameObjectSpawn capture = s;  // copied by value: `s` refers to the queue front, which is erased below
+            std::string capturePath = modelPath;
+            AsyncGameObjectLoad load;
+            load.future = std::async(std::launch::async,
+                [am, capture, capturePath]() -> PreparedGameObjectWMO {
+                    PreparedGameObjectWMO result;
+                    result.guid = capture.guid;
+                    result.entry = capture.entry;
+                    result.displayId = capture.displayId;
+                    result.x = capture.x;
+                    result.y = capture.y;
+                    result.z = capture.z;
+                    result.orientation = capture.orientation;
+                    result.modelPath = capturePath;
+                    result.isWmo = true;
+
+                    auto wmoData = am->readFile(capturePath);
+                    if (wmoData.empty()) return result;  // valid stays false, so the finalizer falls back to the sync spawn path
+
+                    auto wmo = std::make_shared<pipeline::WMOModel>(
+                        pipeline::WMOLoader::load(wmoData));
+
+                    // Load groups (files named <base>_NNN<ext>, with ".wmo" tried as a fallback extension)
+                    if (wmo->nGroups > 0) {
+                        std::string basePath = capturePath;
+                        std::string ext;
+                        if (basePath.size() > 4) {
+                            ext = basePath.substr(basePath.size() - 4);
+                            basePath = basePath.substr(0, basePath.size() - 4);
+                        }
+                        for (uint32_t gi = 0; gi < wmo->nGroups; gi++) {
+                            char suffix[16];  // snprintf bounds the write to this buffer
+                            snprintf(suffix, sizeof(suffix), "_%03u%s", gi, ext.c_str());
+                            auto groupData = am->readFile(basePath + suffix);
+                            if (groupData.empty()) {
+                                snprintf(suffix, sizeof(suffix), "_%03u.wmo", gi);
+                                groupData = am->readFile(basePath + suffix);
+                            }
+                            if (!groupData.empty()) {  // a missing group file is skipped silently
+                                pipeline::WMOLoader::loadGroup(groupData, *wmo, gi);
+                            }
+                        }
+                    }
+
+                    // Pre-decode WMO textures on background thread
+                    for (const auto& texPath : wmo->textures) {
+                        if (texPath.empty()) continue;
+                        std::string texKey = texPath;
+                        size_t nul = texKey.find('\0');
+                        if (nul != std::string::npos) texKey.resize(nul);  // cut at the first embedded NUL
+                        std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                        std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                        if (texKey.empty()) continue;
+                        // Convert to .blp extension
+                        if (texKey.size() >= 4) {
+                            std::string ext = texKey.substr(texKey.size() - 4);
+                            if (ext == ".tga" || ext == ".dds") {
+                                texKey = texKey.substr(0, texKey.size() - 4) + ".blp";
+                            }
+                        }
+                        if (result.predecodedTextures.find(texKey) != result.predecodedTextures.end()) continue;  // already decoded for this model
+                        auto blp = am->loadTexture(texKey);
+                        if (blp.isValid()) {  // invalid decodes are not stored
+                            result.predecodedTextures[texKey] = std::move(blp);
+                        }
+                    }
+
+                    result.wmoModel = wmo;
+                    result.valid = true;
+                    return result;
+                });
+            asyncGameObjectLoads_.push_back(std::move(load));
+            pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
+            continue;
+        }
+
+        // Everything else spawns synchronously: cached WMOs, non-WMO/unknown paths, and uncached WMOs once kMaxAsyncLoads loads are in flight. NOTE(review): that last case is the heavy synchronous WMO load.
+        spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
+        pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
+    }
 }

 void Application::processPendingTransportDoodads() {
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@ -625,7 +625,18 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
        return whiteTexture_.get();
    }

-    auto blpImage = assetManager->loadTexture(key);
+    // Check pre-decoded BLP cache first (populated by background threads)
+    pipeline::BLPImage blpImage;
+    if (predecodedBLPCache_) {
+        auto pit = predecodedBLPCache_->find(key);
+        if (pit != predecodedBLPCache_->end()) {
+            blpImage = std::move(pit->second);
+            predecodedBLPCache_->erase(pit);
+        }
+    }
+    if (!blpImage.isValid()) {
+        blpImage = assetManager->loadTexture(key);
+    }
    if (!blpImage.isValid()) {
        // Return white fallback but don't cache the failure — allow retry
        // on next character load in case the asset becomes available.
@ -1412,8 +1423,9 @@ uint32_t CharacterRenderer::createInstance(uint32_t modelId, const glm::vec3& po
    instance.scale = scale;

    // Initialize bone matrices to identity
-    auto& model = models[modelId].data;
-    instance.boneMatrices.resize(std::max(static_cast<size_t>(1), model.bones.size()), glm::mat4(1.0f));
+    auto& gpuRef = models[modelId];
+    instance.boneMatrices.resize(std::max(static_cast<size_t>(1), gpuRef.data.bones.size()), glm::mat4(1.0f));
+    instance.cachedModel = &gpuRef;

    uint32_t id = instance.id;
    instances[id] = std::move(instance);
@ -1511,13 +1523,12 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
        if (distSq >= animUpdateRadiusSq) continue;

        // Always advance animation time (cheap)
-        auto modelIt = models.find(inst.modelId);
-        if (modelIt != models.end() && !modelIt->second.data.sequences.empty()) {
+        if (inst.cachedModel && !inst.cachedModel->data.sequences.empty()) {
            if (inst.currentSequenceIndex < 0) {
                inst.currentSequenceIndex = 0;
-                inst.currentAnimationId = modelIt->second.data.sequences[0].id;
+                inst.currentAnimationId = inst.cachedModel->data.sequences[0].id;
            }
-            const auto& seq = modelIt->second.data.sequences[inst.currentSequenceIndex];
+            const auto& seq = inst.cachedModel->data.sequences[inst.currentSequenceIndex];
            inst.animationTime += deltaTime * 1000.0f;
            if (seq.duration > 0 && inst.animationTime >= static_cast<float>(seq.duration)) {
                if (inst.animationLoop) {
@ -1528,10 +1539,11 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
            }
        }

-        // Distance-tiered bone throttling: near=every frame, mid=every 3rd, far=every 6th
+        // Distance-tiered bone throttling by camera distance: <=10 every frame, >10 every 2nd, >20 every 4th, >40 every 8th
        uint32_t boneInterval = 1;
-        if (distSq > 60.0f * 60.0f) boneInterval = 6;
-        else if (distSq > 30.0f * 30.0f) boneInterval = 3;
+        if (distSq > 40.0f * 40.0f) boneInterval = 8;
+        else if (distSq > 20.0f * 20.0f) boneInterval = 4;
+        else if (distSq > 10.0f * 10.0f) boneInterval = 2;

        inst.boneUpdateCounter++;
        bool needsBones = (inst.boneUpdateCounter >= boneInterval) || inst.boneMatrices.empty();
@ -1615,11 +1627,8 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
 }

 void CharacterRenderer::updateAnimation(CharacterInstance& instance, float deltaTime) {
-    auto modelIt = models.find(instance.modelId);
-    if (modelIt == models.end()) {
-        return;
-    }
-    const auto& model = modelIt->second.data;
+    if (!instance.cachedModel) return;
+    const auto& model = instance.cachedModel->data;

    if (model.sequences.empty()) {
        return;
@ -1732,7 +1741,8 @@ glm::quat CharacterRenderer::interpolateQuat(const pipeline::M2AnimationTrack& t
 // --- Bone transform calculation ---

 void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
-    auto& model = models[instance.modelId].data;
+    if (!instance.cachedModel) return;
+    auto& model = instance.cachedModel->data;

    if (model.bones.empty()) {
        return;
@ -1833,9 +1843,8 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
            }
        }

-        auto modelIt = models.find(instance.modelId);
-        if (modelIt == models.end()) continue;
-        const auto& gpuModel = modelIt->second;
+        if (!instance.cachedModel) continue;
+        const auto& gpuModel = *instance.cachedModel;

        // Skip models without GPU buffers
        if (!gpuModel.vertexBuffer) continue;
@ -2487,9 +2496,8 @@ void CharacterRenderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& light
        glm::vec3 diff = inst.position - shadowCenter;
        if (glm::dot(diff, diff) > shadowRadiusSq) continue;

-        auto modelIt = models.find(inst.modelId);
-        if (modelIt == models.end()) continue;
-        const M2ModelGPU& gpuModel = modelIt->second;
+        if (!inst.cachedModel) continue;
+        const M2ModelGPU& gpuModel = *inst.cachedModel;
        if (!gpuModel.vertexBuffer) continue;

        glm::mat4 modelMat = inst.hasOverrideModelMatrix
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@ -1657,6 +1657,7 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
    instance.cachedIsInvisibleTrap = mdlRef.isInvisibleTrap;
    instance.cachedIsInstancePortal = mdlRef.isInstancePortal;
    instance.cachedIsValid = mdlRef.isValid();
+    instance.cachedModel = &mdlRef;

    // Initialize animation: play first sequence (usually Stand/Idle)
    const auto& mdl = mdlRef;
@ -1748,6 +1749,7 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
    instance.cachedIsGroundDetail = mdl2.isGroundDetail;
    instance.cachedIsInvisibleTrap = mdl2.isInvisibleTrap;
    instance.cachedIsValid = mdl2.isValid();
+    instance.cachedModel = &mdl2;

    // Initialize animation
    if (mdl2.hasAnimation && !mdl2.disableAnimation && !mdl2.sequences.empty()) {
@ -2026,9 +2028,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
        instance.animTime += dtMs * (instance.animSpeed - 1.0f);

        // For animation looping/variation, we need the actual model data.
-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
-        const M2ModelGPU& model = it->second;
+        if (!instance.cachedModel) continue;
+        const M2ModelGPU& model = *instance.cachedModel;

        // Validate sequence index
        if (instance.currentSequenceIndex < 0 ||
@ -2084,6 +2085,14 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
        float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f);
        if (cullRadius > 0.0f && !updateFrustum.intersectsSphere(instance.position, paddedRadius)) continue;

+        // Distance-based frame skipping: update distant bones less frequently
+        uint32_t boneInterval = 1;
+        if (distSq > 200.0f * 200.0f) boneInterval = 8;
+        else if (distSq > 100.0f * 100.0f) boneInterval = 4;
+        else if (distSq > 50.0f * 50.0f) boneInterval = 2;
+        instance.frameSkipCounter++;
+        if ((instance.frameSkipCounter % boneInterval) != 0) continue;
+
        boneWorkIndices_.push_back(idx);
    }

@ -2097,9 +2106,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
            for (size_t i : boneWorkIndices_) {
                if (i >= instances.size()) continue;
                auto& inst = instances[i];
-                auto mdlIt = models.find(inst.modelId);
-                if (mdlIt == models.end()) continue;
-                computeBoneMatrices(mdlIt->second, inst);
+                if (!inst.cachedModel) continue;
+                computeBoneMatrices(*inst.cachedModel, inst);
            }
        } else {
            // Parallel — dispatch across worker threads
@ -2112,9 +2120,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
                for (size_t i : boneWorkIndices_) {
                    if (i >= instances.size()) continue;
                    auto& inst = instances[i];
-                    auto mdlIt = models.find(inst.modelId);
-                    if (mdlIt == models.end()) continue;
-                    computeBoneMatrices(mdlIt->second, inst);
+                    if (!inst.cachedModel) continue;
+                    computeBoneMatrices(*inst.cachedModel, inst);
                }
            } else {
                const size_t chunkSize = animCount / numThreads;
@ -2135,9 +2142,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
                                size_t idx = boneWorkIndices_[j];
                                if (idx >= instances.size()) continue;
                                auto& inst = instances[idx];
-                                auto mdlIt = models.find(inst.modelId);
-                                if (mdlIt == models.end()) continue;
-                                computeBoneMatrices(mdlIt->second, inst);
+                                if (!inst.cachedModel) continue;
+                                computeBoneMatrices(*inst.cachedModel, inst);
                            }
                        }));
                    start = end;
@ -2159,9 +2165,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
        glm::vec3 toCam = instance.position - cachedCamPos_;
        float distSq = glm::dot(toCam, toCam);
        if (distSq > cachedMaxRenderDistSq_) continue;
-        auto mdlIt = models.find(instance.modelId);
-        if (mdlIt == models.end()) continue;
-        emitParticles(instance, mdlIt->second, deltaTime);
+        if (!instance.cachedModel) continue;
+        emitParticles(instance, *instance.cachedModel, deltaTime);
        updateParticles(instance, deltaTime);
    }

@ -2865,9 +2870,8 @@ void M2Renderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& lightSpaceMa
            glm::vec3 diff = instance.position - shadowCenter;
            if (glm::dot(diff, diff) > shadowRadiusSq) continue;

-            auto modelIt = models.find(instance.modelId);
-            if (modelIt == models.end()) continue;
-            const M2ModelGPU& model = modelIt->second;
+            if (!instance.cachedModel) continue;
+            const M2ModelGPU& model = *instance.cachedModel;

            // Filter: only draw foliage models in foliage pass, non-foliage in non-foliage pass
            if (model.shadowWindFoliage != foliagePass) continue;
@ -2973,8 +2977,7 @@ std::vector<glm::vec3> M2Renderer::getWaterVegetationPositions(const glm::vec3&
    std::vector<glm::vec3> result;
    float maxDistSq = maxDist * maxDist;
    for (const auto& inst : instances) {
-        auto it = models.find(inst.modelId);
-        if (it == models.end() || !it->second.isWaterVegetation) continue;
+        if (!inst.cachedModel || !inst.cachedModel->isWaterVegetation) continue;
        glm::vec3 diff = inst.position - camPos;
        if (glm::dot(diff, diff) <= maxDistSq) {
            result.push_back(inst.position);
@ -3085,9 +3088,8 @@ void M2Renderer::emitParticles(M2Instance& inst, const M2ModelGPU& gpu, float dt
 }

 void M2Renderer::updateParticles(M2Instance& inst, float dt) {
-    auto it = models.find(inst.modelId);
-    if (it == models.end()) return;
-    const auto& gpu = it->second;
+    if (!inst.cachedModel) return;
+    const auto& gpu = *inst.cachedModel;

    for (size_t i = 0; i < inst.particles.size(); ) {
        auto& p = inst.particles[i];
@ -3162,9 +3164,8 @@ void M2Renderer::renderM2Particles(VkCommandBuffer cmd, VkDescriptorSet perFrame

    for (auto& inst : instances) {
        if (inst.particles.empty()) continue;
-        auto it = models.find(inst.modelId);
-        if (it == models.end()) continue;
-        const auto& gpu = it->second;
+        if (!inst.cachedModel) continue;
+        const auto& gpu = *inst.cachedModel;

        for (const auto& p : inst.particles) {
            if (p.emitterIndex < 0 || p.emitterIndex >= static_cast<int>(gpu.particleEmitters.size())) continue;
@ -3549,9 +3550,13 @@ void M2Renderer::rebuildSpatialIndex() {
    particleInstanceIndices_.clear();

    for (size_t i = 0; i < instances.size(); i++) {
-        const auto& inst = instances[i];
+        auto& inst = instances[i];
        instanceIndexById[inst.id] = i;

+        // Re-cache model pointer (may have changed after model map modifications)
+        auto mdlIt = models.find(inst.modelId);
+        inst.cachedModel = (mdlIt != models.end()) ? &mdlIt->second : nullptr;
+
        // Rebuild dedup map (skip ground detail)
        if (!inst.cachedIsGroundDetail) {
            DedupKey dk{inst.modelId,
@ -3684,8 +3689,18 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) {
        containsToken(key, "campfire") ||
        containsToken(key, "bonfire");

-    // Load BLP texture
-    pipeline::BLPImage blp = assetManager->loadTexture(key);
+    // Check pre-decoded BLP cache first (populated by background worker threads)
+    pipeline::BLPImage blp;
+    if (predecodedBLPCache_) {
+        auto pit = predecodedBLPCache_->find(key);
+        if (pit != predecodedBLPCache_->end()) {
+            blp = std::move(pit->second);
+            predecodedBLPCache_->erase(pit);
+        }
+    }
+    if (!blp.isValid()) {
+        blp = assetManager->loadTexture(key);
+    }
    if (!blp.isValid()) {
        // Return white fallback but don't cache the failure — MPQ reads can
        // fail transiently during streaming; allow retry on next model load.
@ -3751,9 +3766,8 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) {
 uint32_t M2Renderer::getTotalTriangleCount() const {
    uint32_t total = 0;
    for (const auto& instance : instances) {
-        auto it = models.find(instance.modelId);
-        if (it != models.end()) {
-            total += it->second.indexCount / 3;
+        if (instance.cachedModel) {
+            total += instance.cachedModel->indexCount / 3;
        }
    }
    return total;
@ -3775,11 +3789,10 @@ std::optional<float> M2Renderer::getFloorHeight(float glX, float glY, float glZ,
            continue;
        }

-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
+        if (!instance.cachedModel) continue;
        if (instance.scale <= 0.001f) continue;

-        const M2ModelGPU& model = it->second;
+        const M2ModelGPU& model = *instance.cachedModel;
        if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
        if (instance.skipCollision) continue;

@ -3931,10 +3944,9 @@ bool M2Renderer::checkCollision(const glm::vec3& from, const glm::vec3& to,
        if (from.z > instance.worldBoundsMax.z + 2.5f && adjustedPos.z > instance.worldBoundsMax.z + 2.5f) continue;
        if (from.z + 2.5f < instance.worldBoundsMin.z && adjustedPos.z + 2.5f < instance.worldBoundsMin.z) continue;

-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
+        if (!instance.cachedModel) continue;

-        const M2ModelGPU& model = it->second;
+        const M2ModelGPU& model = *instance.cachedModel;
        if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
        if (instance.skipCollision) continue;
        if (instance.scale <= 0.001f) continue;
@ -4172,10 +4184,9 @@ float M2Renderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3&
            continue;
        }

-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
+        if (!instance.cachedModel) continue;

-        const M2ModelGPU& model = it->second;
+        const M2ModelGPU& model = *instance.cachedModel;
        if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
        glm::vec3 localMin, localMax;
        getTightCollisionBounds(model, localMin, localMax);
--- a/src/rendering/renderer.cpp
+++ b/src/rendering/renderer.cpp
@ -2434,6 +2434,9 @@ void Renderer::update(float deltaTime) {
        cameraController->update(deltaTime);
        auto cameraEnd = std::chrono::steady_clock::now();
        lastCameraUpdateMs = std::chrono::duration<double, std::milli>(cameraEnd - cameraStart).count();
+        if (lastCameraUpdateMs > 3.0) {
+            LOG_WARNING("SLOW cameraController->update: ", lastCameraUpdateMs, "ms");
+        }

        // Update 3D audio listener position/orientation to match camera
        if (camera) {
@ -2779,8 +2782,15 @@ void Renderer::update(float deltaTime) {

    // Update M2 doodad animations (pass camera for frustum-culling bone computation)
    if (m2Renderer && camera) {
+        auto m2Start = std::chrono::steady_clock::now();
        m2Renderer->update(deltaTime, camera->getPosition(),
                           camera->getProjectionMatrix() * camera->getViewMatrix());
+        float m2Ms = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - m2Start).count();
+        if (m2Ms > 3.0f) {
+            LOG_WARNING("SLOW m2Renderer->update: ", m2Ms, "ms (",
+                        m2Renderer->getInstanceCount(), " instances)");
+        }
    }

    // Helper: play zone music, dispatching local files (file: prefix) vs MPQ paths
--- a/src/rendering/terrain_manager.cpp
+++ b/src/rendering/terrain_manager.cpp
@ -231,9 +231,14 @@ bool TerrainManager::loadTile(int x, int y) {
        return false;
    }

+    VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
+    if (vkCtx) vkCtx->beginUploadBatch();
+
    FinalizingTile ft;
    ft.pending = std::move(pending);
    while (!advanceFinalization(ft)) {}
+
+    if (vkCtx) vkCtx->endUploadBatchSync();  // Sync — caller expects tile ready
    return true;
 }

@ -407,6 +412,20 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
            return false;
        }

+        // Pre-decode M2 model textures on background thread
+        for (const auto& tex : m2Model.textures) {
+            if (tex.filename.empty()) continue;
+            std::string texKey = tex.filename;
+            std::replace(texKey.begin(), texKey.end(), '/', '\\');
+            std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+            if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue;
+            auto blp = assetManager->loadTexture(texKey);
+            if (blp.isValid()) {
+                pending->preloadedM2Textures[texKey] = std::move(blp);
+            }
+        }
+
        PendingTile::M2Ready ready;
        ready.modelId = modelId;
        ready.model = std::move(m2Model);
@ -584,6 +603,20 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
                                pipeline::M2Loader::loadSkin(skinData, m2Model);
                            }
                            if (!m2Model.isValid()) continue;
+
+                            // Pre-decode doodad M2 textures on background thread
+                            for (const auto& tex : m2Model.textures) {
+                                if (tex.filename.empty()) continue;
+                                std::string texKey = tex.filename;
+                                std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                                std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                               [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                                if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue;
+                                auto blp = assetManager->loadTexture(texKey);
+                                if (blp.isValid()) {
+                                    pending->preloadedM2Textures[texKey] = std::move(blp);
+                                }
+                            }
                        }

                        // Build doodad's local transform (WoW coordinates)
@ -654,6 +687,32 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
                    }
                }

+                // Pre-decode WMO textures on the background thread and store them in
+                // pending->preloadedWMOTextures, keyed by the path that was actually loaded.
+                for (const auto& texPath : wmoModel.textures) {
+                    if (texPath.empty()) continue;
+                    std::string texKey = texPath;
+                    // Truncate at NUL (WMO paths can have stray bytes)
+                    size_t nul = texKey.find('\0');
+                    if (nul != std::string::npos) texKey.resize(nul);
+                    // Normalize separators to '\\' and lower-case the whole path
+                    std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                    std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                    if (texKey.empty()) continue;
+                    // Resolve the .blp variant BEFORE the dedup check. Entries are stored
+                    // under blpKey, so looking up texKey never matched .tga/.dds references
+                    // and the same texture was decoded again for every WMO that used it.
+                    std::string blpKey = texKey;
+                    if (blpKey.size() >= 4) {
+                        std::string ext = blpKey.substr(blpKey.size() - 4);
+                        if (ext == ".tga" || ext == ".dds") {
+                            blpKey = blpKey.substr(0, blpKey.size() - 4) + ".blp";
+                        }
+                    }
+                    // Already decoded for this tile — skip the redundant decode
+                    if (pending->preloadedWMOTextures.find(blpKey) != pending->preloadedWMOTextures.end()) continue;
+                    auto blp = assetManager->loadTexture(blpKey);
+                    if (blp.isValid()) {
+                        pending->preloadedWMOTextures[blpKey] = std::move(blp);
+                    }
+                }
+
                PendingTile::WMOReady ready;
                // Cache WMO model uploads by path; placement dedup uses uniqueId separately.
                ready.modelId = static_cast<uint32_t>(std::hash<std::string>{}(wmoPath));
@ -741,7 +800,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
            }
            bool allDone = terrainRenderer->loadTerrainIncremental(
                pending->mesh, pending->terrain.textures, x, y,
-                ft.terrainChunkNext, 64);
+                ft.terrainChunkNext, 32);
            if (!allDone) {
                return false; // More chunks remain — yield to time budget
            }
@ -773,7 +832,9 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
    case FinalizationPhase::M2_MODELS: {
        // Upload multiple M2 models per call (batched GPU uploads)
        if (m2Renderer && ft.m2ModelIndex < pending->m2Models.size()) {
-            constexpr size_t kModelsPerStep = 8;
+            // Set pre-decoded BLP cache so loadTexture() skips main-thread BLP decode
+            m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
+            constexpr size_t kModelsPerStep = 4;
            size_t uploaded = 0;
            while (ft.m2ModelIndex < pending->m2Models.size() && uploaded < kModelsPerStep) {
                auto& m2Ready = pending->m2Models[ft.m2ModelIndex];
@ -786,6 +847,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
                ft.m2ModelIndex++;
                uploaded++;
            }
+            m2Renderer->setPredecodedBLPCache(nullptr);
            // Stay in this phase until all models uploaded
            if (ft.m2ModelIndex < pending->m2Models.size()) {
                return false;
@ -830,8 +892,11 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
        // Upload multiple WMO models per call (batched GPU uploads)
        if (wmoRenderer && assetManager) {
            wmoRenderer->initialize(nullptr, VK_NULL_HANDLE, assetManager);
+            // Set pre-decoded BLP cache and defer normal maps during streaming
+            wmoRenderer->setPredecodedBLPCache(&pending->preloadedWMOTextures);
+            wmoRenderer->setDeferNormalMaps(true);

-            constexpr size_t kWmosPerStep = 4;
+            constexpr size_t kWmosPerStep = 1;
            size_t uploaded = 0;
            while (ft.wmoModelIndex < pending->wmoModels.size() && uploaded < kWmosPerStep) {
                auto& wmoReady = pending->wmoModels[ft.wmoModelIndex];
@ -843,6 +908,8 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
                    uploaded++;
                }
            }
+            wmoRenderer->setDeferNormalMaps(false);
+            wmoRenderer->setPredecodedBLPCache(nullptr);
            if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
        }
        ft.phase = FinalizationPhase::WMO_INSTANCES;
@ -906,7 +973,9 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
    case FinalizationPhase::WMO_DOODADS: {
        // Upload multiple WMO doodad M2s per call (batched GPU uploads)
        if (m2Renderer && ft.wmoDoodadIndex < pending->wmoDoodads.size()) {
-            constexpr size_t kDoodadsPerStep = 16;
+            // Set pre-decoded BLP cache for doodad M2 textures
+            m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
+            constexpr size_t kDoodadsPerStep = 4;
            size_t uploaded = 0;
            while (ft.wmoDoodadIndex < pending->wmoDoodads.size() && uploaded < kDoodadsPerStep) {
                auto& doodad = pending->wmoDoodads[ft.wmoDoodadIndex];
@ -923,6 +992,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
                ft.wmoDoodadIndex++;
                uploaded++;
            }
+            m2Renderer->setPredecodedBLPCache(nullptr);
            if (ft.wmoDoodadIndex < pending->wmoDoodads.size()) return false;
        }
        ft.phase = FinalizationPhase::WATER;
@ -1080,11 +1150,6 @@ void TerrainManager::workerLoop() {
 }

 void TerrainManager::processReadyTiles() {
-    // Process tiles with time budget to avoid frame spikes
-    // Taxi mode gets a slightly larger budget to avoid visible late-pop terrain/models.
-    const float timeBudgetMs = taxiStreamingMode_ ? 8.0f : 3.0f;
-    auto startTime = std::chrono::high_resolution_clock::now();
-
    // Move newly ready tiles into the finalizing deque.
    // Keep them in pendingTiles so streamTiles() won't re-enqueue them.
    {
@ -1100,28 +1165,32 @@ void TerrainManager::processReadyTiles() {
        }
    }

-    // Outer upload batch: all GPU uploads across all advanceFinalization calls
-    // this frame share a single command buffer submission + fence wait.
    VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
+
+    // Reclaim completed async uploads from previous frames (non-blocking)
+    if (vkCtx) vkCtx->pollUploadBatches();
+
+    // Nothing to finalize — done.
+    if (finalizingTiles_.empty()) return;
+
+    // Async upload batch: record GPU copies into a command buffer, submit with
+    // a fence, but DON'T wait.  The fence is polled on subsequent frames.
+    // This eliminates the main-thread stall from vkWaitForFences entirely.
+    const int maxSteps = taxiStreamingMode_ ? 8 : 2;
+    int steps = 0;
+
    if (vkCtx) vkCtx->beginUploadBatch();

-    // Drive incremental finalization within time budget
-    while (!finalizingTiles_.empty()) {
+    while (!finalizingTiles_.empty() && steps < maxSteps) {
        auto& ft = finalizingTiles_.front();
        bool done = advanceFinalization(ft);
-
        if (done) {
            finalizingTiles_.pop_front();
        }
-
-        auto now = std::chrono::high_resolution_clock::now();
-        float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
-        if (elapsedMs >= timeBudgetMs) {
-            break;
-        }
+        steps++;
    }

-    if (vkCtx) vkCtx->endUploadBatch();
+    if (vkCtx) vkCtx->endUploadBatch();  // Async — submits but doesn't wait
 }

 void TerrainManager::processAllReadyTiles() {
@ -1151,7 +1220,7 @@ void TerrainManager::processAllReadyTiles() {
        finalizingTiles_.pop_front();
    }

-    if (vkCtx) vkCtx->endUploadBatch();
+    if (vkCtx) vkCtx->endUploadBatchSync();  // Sync — load screen needs data ready
 }

 void TerrainManager::processOneReadyTile() {
@ -1177,7 +1246,7 @@ void TerrainManager::processOneReadyTile() {
        while (!advanceFinalization(ft)) {}
        finalizingTiles_.pop_front();

-        if (vkCtx) vkCtx->endUploadBatch();
+        if (vkCtx) vkCtx->endUploadBatchSync();  // Sync — load screen needs data ready
    }
 }

--- a/src/rendering/vk_context.cpp
+++ b/src/rendering/vk_context.cpp
@ -67,6 +67,14 @@ void VkContext::shutdown() {
        frame = {};
    }

+    // Clean up any in-flight async upload batches (device already idle)
+    for (auto& batch : inFlightBatches_) {
+        // Staging buffers: skip destroy — allocator is about to be torn down
+        vkDestroyFence(device, batch.fence, nullptr);
+        // Command buffer freed when pool is destroyed below
+    }
+    inFlightBatches_.clear();
+
    if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; }
    if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; }

@ -1447,17 +1455,94 @@ void VkContext::endUploadBatch() {

    inUploadBatch_ = false;

-    // Submit all recorded commands with a single fence wait
+    if (batchStagingBuffers_.empty()) {
+        // No GPU copies were recorded — skip the submit entirely.
+        vkEndCommandBuffer(batchCmd_);
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
+        batchCmd_ = VK_NULL_HANDLE;
+        return;
+    }
+
+    // Submit commands with a NEW fence — don't wait, let GPU work in parallel.
+    vkEndCommandBuffer(batchCmd_);
+
+    VkFenceCreateInfo fenceInfo{};
+    fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+    VkFence fence = VK_NULL_HANDLE;
+    vkCreateFence(device, &fenceInfo, nullptr, &fence);
+
+    VkSubmitInfo submitInfo{};
+    submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    submitInfo.commandBufferCount = 1;
+    submitInfo.pCommandBuffers = &batchCmd_;
+    vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence);
+
+    // Stash everything for later cleanup when fence signals
+    InFlightBatch batch;
+    batch.fence = fence;
+    batch.cmd = batchCmd_;
+    batch.stagingBuffers = std::move(batchStagingBuffers_);
+    inFlightBatches_.push_back(std::move(batch));
+
+    batchCmd_ = VK_NULL_HANDLE;
+    batchStagingBuffers_.clear();
+}
+
+// Closes one level of upload-batch nesting and, on the outermost level,
+// finishes the batch on the synchronous path.
+//
+// A call made while uploadBatchDepth_ is already <= 0 is ignored, and nested
+// calls only decrement the depth. Batches parked in inFlightBatches_ are not
+// touched by this function.
+void VkContext::endUploadBatchSync() {
+    if (uploadBatchDepth_ <= 0) return;
+    uploadBatchDepth_--;
+    if (uploadBatchDepth_ > 0) return;
+
+    inUploadBatch_ = false;
+
+    // No staging buffers were deferred: end and free the command buffer
+    // without submitting it.
+    // NOTE(review): this uses "no staging buffers" as a proxy for "nothing
+    // recorded" — confirm every command recorded into batchCmd_ also defers
+    // a staging buffer, otherwise that work is dropped here.
+    if (batchStagingBuffers_.empty()) {
+        vkEndCommandBuffer(batchCmd_);
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
+        batchCmd_ = VK_NULL_HANDLE;
+        return;
+    }
+
+    // Synchronous path for load screens — submit and wait
    endSingleTimeCommands(batchCmd_);
    batchCmd_ = VK_NULL_HANDLE;

-    // Destroy all deferred staging buffers
+    // Release the staging buffers deferred during this batch. Presumably safe
+    // because endSingleTimeCommands() has already waited for the GPU — confirm.
    for (auto& staging : batchStagingBuffers_) {
        destroyBuffer(allocator, staging);
    }
    batchStagingBuffers_.clear();
 }

+// Reclaims finished async upload batches: every entry of inFlightBatches_
+// whose fence reports VK_SUCCESS has its staging buffers, command buffer and
+// fence released and is removed from the list. All other entries stay queued.
+void VkContext::pollUploadBatches() {
+    auto batchIt = inFlightBatches_.begin();
+    while (batchIt != inFlightBatches_.end()) {
+        // Any status other than VK_SUCCESS: keep the batch for a later poll.
+        if (vkGetFenceStatus(device, batchIt->fence) != VK_SUCCESS) {
+            ++batchIt;
+            continue;
+        }
+
+        // GPU finished — free resources
+        for (auto& stagingBuf : batchIt->stagingBuffers) {
+            destroyBuffer(allocator, stagingBuf);
+        }
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchIt->cmd);
+        vkDestroyFence(device, batchIt->fence, nullptr);
+        batchIt = inFlightBatches_.erase(batchIt);
+    }
+}
+
+// Waits on the fence of every in-flight async upload batch in turn, releases
+// that batch's staging buffers, command buffer and fence, and finally empties
+// inFlightBatches_.
+void VkContext::waitAllUploads() {
+    for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ++it) {
+        auto& pendingBatch = *it;
+
+        // UINT64_MAX timeout: wait for the fence with no time limit.
+        vkWaitForFences(device, 1, &pendingBatch.fence, VK_TRUE, UINT64_MAX);
+
+        for (auto& stagingBuf : pendingBatch.stagingBuffers) {
+            destroyBuffer(allocator, stagingBuf);
+        }
+        vkFreeCommandBuffers(device, immCommandPool, 1, &pendingBatch.cmd);
+        vkDestroyFence(device, pendingBatch.fence, nullptr);
+    }
+    inFlightBatches_.clear();
+}
+
 // Appends a staging buffer to batchStagingBuffers_. That list is later moved
 // into an in-flight batch by endUploadBatch(), or destroyed entry by entry in
 // endUploadBatchSync().
 void VkContext::deferStagingCleanup(AllocatedBuffer staging) {
    batchStagingBuffers_.push_back(staging);
 }
--- a/src/rendering/wmo_renderer.cpp
+++ b/src/rendering/wmo_renderer.cpp
@ -2325,13 +2325,27 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
    const auto& attemptedCandidates = uniqueCandidates;

    // Try loading all candidates until one succeeds
+    // Check pre-decoded BLP cache first (populated by background worker threads)
    pipeline::BLPImage blp;
    std::string resolvedKey;
-    for (const auto& c : attemptedCandidates) {
-        blp = assetManager->loadTexture(c);
-        if (blp.isValid()) {
-            resolvedKey = c;
-            break;
+    if (predecodedBLPCache_) {
+        for (const auto& c : uniqueCandidates) {
+            auto pit = predecodedBLPCache_->find(c);
+            if (pit != predecodedBLPCache_->end()) {
+                blp = std::move(pit->second);
+                predecodedBLPCache_->erase(pit);
+                resolvedKey = c;
+                break;
+            }
+        }
+    }
+    if (!blp.isValid()) {
+        for (const auto& c : attemptedCandidates) {
+            blp = assetManager->loadTexture(c);
+            if (blp.isValid()) {
+                resolvedKey = c;
+                break;
+            }
        }
    }
    if (!blp.isValid()) {
@ -2369,10 +2383,10 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
    texture->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
                            VK_SAMPLER_ADDRESS_MODE_REPEAT);

-    // Generate normal+height map from diffuse pixels
+    // Generate normal+height map from diffuse pixels (skip during streaming to avoid CPU stalls)
    float nhVariance = 0.0f;
    std::unique_ptr<VkTexture> nhMap;
-    if (normalMappingEnabled_ || pomEnabled_) {
+    if ((normalMappingEnabled_ || pomEnabled_) && !deferNormalMaps_) {
        nhMap = generateNormalHeightMap(blp.data.data(), blp.width, blp.height, nhVariance);
        if (nhMap) {
            approxBytes *= 2;  // account for normal map in budget