From f9410cc4bd0d168e316593728880d4edc0e2729d Mon Sep 17 00:00:00 2001
From: Kelsi <kelsihates2fa@gmail.com>
Date: Sat, 7 Mar 2026 11:44:14 -0800
Subject: [PATCH 1/9] Fix city NPC stuttering: async model loading,
 CharSections cache, frame budgets

- Async creature model loading: M2 file I/O and parsing on background threads
  via std::async, GPU upload on main thread when ready (MAX_ASYNC_CREATURE_LOADS=4)
- CharSections.dbc lookup cache: O(1) hash lookup instead of O(N) full DBC scan
  per humanoid NPC spawn (was scanning thousands of records twice per spawn)
- Frame time budget: 4ms cap on creature spawn processing per frame
- Wolf/worg model name check cached per modelId (was doing tolower+find per
  hostile creature per frame)
- Weapon attach throttle: max 2 per 1s tick (was attempting all unweaponized NPCs)
- Separate texture application tracking (displayIdTexturesApplied_) so async-loaded
  models still get skin/equipment textures applied correctly
---
 include/core/application.hpp |  34 ++-
 src/core/application.cpp     | 429 ++++++++++++++++++++++++++---------
 2 files changed, 361 insertions(+), 102 deletions(-)
diff --git a/include/core/application.hpp b/include/core/application.hpp
index 92e96e8e..7415da18 100644
--- a/include/core/application.hpp
+++ b/include/core/application.hpp
@@ -10,6 +10,8 @@
 #include <unordered_set>
 #include <array>
 #include <optional>
+#include <future>
+#include <mutex>
 
 namespace wowee {
 
@@ -18,7 +20,7 @@ namespace rendering { class Renderer; }
 namespace ui { class UIManager; }
 namespace auth { class AuthHandler; }
 namespace game { class GameHandler; class World; class ExpansionRegistry; }
-namespace pipeline { class AssetManager; class DBCLayout; }
+namespace pipeline { class AssetManager; class DBCLayout; struct M2Model; }
 namespace audio { enum class VoiceType; }
 
 namespace core {
@@ -90,6 +92,7 @@ private:
     static const char* mapIdToName(uint32_t mapId);
     void loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float z);
     void buildFactionHostilityMap(uint8_t playerRace);
+    pipeline::M2Model loadCreatureM2Sync(const std::string& m2Path);
     void spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x, float y, float z, float orientation);
     void despawnOnlineCreature(uint64_t guid);
     bool tryAttachCreatureVirtualWeapons(uint64_t guid, uint32_t instanceId);
@@ -181,8 +184,37 @@ private:
     std::unordered_map<uint64_t, glm::vec3> creatureRenderPosCache_; // guid -> last synced render position
     std::unordered_set<uint64_t> creatureWeaponsAttached_;       // guid set when NPC virtual weapons attached
     std::unordered_map<uint64_t, uint8_t> creatureWeaponAttachAttempts_; // guid -> attach attempts
+    std::unordered_map<uint32_t, bool> modelIdIsWolfLike_;     // modelId → cached wolf/worg check
+    static constexpr int MAX_WEAPON_ATTACHES_PER_TICK = 2;     // limit weapon attach work per 1s tick
+
+    // CharSections.dbc lookup cache to avoid O(N) DBC scan per NPC spawn.
+    // Key: (race<<24)|(sex<<16)|(section<<12)|(variation<<8)|color → texture path
+    std::unordered_map<uint64_t, std::string> charSectionsCache_;
+    bool charSectionsCacheBuilt_ = false;
+    void buildCharSectionsCache();
+    std::string lookupCharSection(uint8_t race, uint8_t sex, uint8_t section,
+                                  uint8_t variation, uint8_t color, int texIndex = 0) const;
+
+    // Async creature model loading: file I/O + M2 parsing on background thread,
+    // GPU upload + instance creation on main thread.
+    struct PreparedCreatureModel {
+        uint64_t guid;
+        uint32_t displayId;
+        uint32_t modelId;
+        float x, y, z, orientation;
+        std::shared_ptr<pipeline::M2Model> model; // parsed on background thread
+        bool valid = false;
+        bool permanent_failure = false;
+    };
+    struct AsyncCreatureLoad {
+        std::future<PreparedCreatureModel> future;
+    };
+    std::vector<AsyncCreatureLoad> asyncCreatureLoads_;
+    void processAsyncCreatureResults();
+    static constexpr int MAX_ASYNC_CREATURE_LOADS = 4; // concurrent background loads
     std::unordered_set<uint64_t> deadCreatureGuids_;            // GUIDs that should spawn in corpse/death pose
     std::unordered_map<uint32_t, uint32_t> displayIdModelCache_; // displayId → modelId (model caching)
+    std::unordered_set<uint32_t> displayIdTexturesApplied_;    // displayIds with per-model textures applied
     mutable std::unordered_set<uint32_t> warnedMissingDisplayDataIds_; // displayIds already warned
     mutable std::unordered_set<uint32_t> warnedMissingModelPathIds_;   // modelIds/displayIds already warned
     uint32_t nextCreatureModelId_ = 5000;  // Model IDs for online creatures
diff --git a/src/core/application.cpp b/src/core/application.cpp
index 2a8ef041..2a06bd5c 100644
--- a/src/core/application.cpp
+++ b/src/core/application.cpp
@@ -734,6 +734,16 @@ void Application::logoutToLogin() {
     deadCreatureGuids_.clear();
     nonRenderableCreatureDisplayIds_.clear();
     creaturePermanentFailureGuids_.clear();
+    modelIdIsWolfLike_.clear();
+    displayIdTexturesApplied_.clear();
+    charSectionsCache_.clear();
+    charSectionsCacheBuilt_ = false;
+
+    // Wait for any in-flight async creature loads before clearing state
+    for (auto& load : asyncCreatureLoads_) {
+        if (load.future.valid()) load.future.wait();
+    }
+    asyncCreatureLoads_.clear();
 
     // --- Creature spawn queues ---
     pendingCreatureSpawns_.clear();
@@ -1285,6 +1295,7 @@ void Application::update(float deltaTime) {
                 npcWeaponRetryTimer += deltaTime;
                 const bool npcWeaponRetryTick = (npcWeaponRetryTimer >= 1.0f);
                 if (npcWeaponRetryTick) npcWeaponRetryTimer = 0.0f;
+                int weaponAttachesThisTick = 0;
                 glm::vec3 playerPos(0.0f);
                 glm::vec3 playerRenderPos(0.0f);
                 bool havePlayerPos = false;
@@ -1304,11 +1315,14 @@ void Application::update(float deltaTime) {
                     auto entity = gameHandler->getEntityManager().getEntity(guid);
                     if (!entity || entity->getType() != game::ObjectType::UNIT) continue;
 
-                    if (npcWeaponRetryTick && !creatureWeaponsAttached_.count(guid)) {
+                    if (npcWeaponRetryTick &&
+                        weaponAttachesThisTick < MAX_WEAPON_ATTACHES_PER_TICK &&
+                        !creatureWeaponsAttached_.count(guid)) {
                         uint8_t attempts = 0;
                         auto itAttempts = creatureWeaponAttachAttempts_.find(guid);
                         if (itAttempts != creatureWeaponAttachAttempts_.end()) attempts = itAttempts->second;
                         if (attempts < 30) {
+                            weaponAttachesThisTick++;
                             if (tryAttachCreatureVirtualWeapons(guid, instanceId)) {
                                 creatureWeaponsAttached_.insert(guid);
                                 creatureWeaponAttachAttempts_.erase(guid);
@@ -1355,14 +1369,21 @@ void Application::update(float deltaTime) {
                         // often put head/torso inside the player capsule).
                         auto mit = creatureModelIds_.find(guid);
                         if (mit != creatureModelIds_.end()) {
-                            if (const auto* md = charRenderer->getModelData(mit->second)) {
-                                std::string modelName = md->name;
-                                std::transform(modelName.begin(), modelName.end(), modelName.begin(),
-                                               [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
-                                if (modelName.find("wolf") != std::string::npos ||
-                                    modelName.find("worg") != std::string::npos) {
-                                    minSep = std::max(minSep, 2.45f);
+                            uint32_t mid = mit->second;
+                            auto wolfIt = modelIdIsWolfLike_.find(mid);
+                            if (wolfIt == modelIdIsWolfLike_.end()) {
+                                bool isWolf = false;
+                                if (const auto* md = charRenderer->getModelData(mid)) {
+                                    std::string modelName = md->name;
+                                    std::transform(modelName.begin(), modelName.end(), modelName.begin(),
+                                                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                                    isWolf = (modelName.find("wolf") != std::string::npos ||
+                                              modelName.find("worg") != std::string::npos);
                                 }
+                                wolfIt = modelIdIsWolfLike_.emplace(mid, isWolf).first;
+                            }
+                            if (wolfIt->second) {
+                                minSep = std::max(minSep, 2.45f);
                             }
                         }
 
@@ -3465,6 +3486,14 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
         deadCreatureGuids_.clear();
         nonRenderableCreatureDisplayIds_.clear();
         creaturePermanentFailureGuids_.clear();
+        modelIdIsWolfLike_.clear();
+        displayIdTexturesApplied_.clear();
+        charSectionsCache_.clear();
+        charSectionsCacheBuilt_ = false;
+        for (auto& load : asyncCreatureLoads_) {
+            if (load.future.valid()) load.future.wait();
+        }
+        asyncCreatureLoads_.clear();
 
         playerInstances_.clear();
         onlinePlayerAppearance_.clear();
@@ -4140,6 +4169,55 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
     setState(AppState::IN_GAME);
 }
 
+void Application::buildCharSectionsCache() {
+    if (charSectionsCacheBuilt_ || !assetManager || !assetManager->isInitialized()) return;
+    auto dbc = assetManager->loadDBC("CharSections.dbc");
+    if (!dbc) return;
+    const auto* csL = pipeline::getActiveDBCLayout()
+        ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
+    uint32_t raceF = csL ? (*csL)["RaceID"] : 1;
+    uint32_t sexF = csL ? (*csL)["SexID"] : 2;
+    uint32_t secF = csL ? (*csL)["BaseSection"] : 3;
+    uint32_t varF = csL ? (*csL)["VariationIndex"] : 4;
+    uint32_t colF = csL ? (*csL)["ColorIndex"] : 5;
+    uint32_t tex1F = csL ? (*csL)["Texture1"] : 6;
+    for (uint32_t r = 0; r < dbc->getRecordCount(); r++) {
+        uint32_t race = dbc->getUInt32(r, raceF);
+        uint32_t sex = dbc->getUInt32(r, sexF);
+        uint32_t section = dbc->getUInt32(r, secF);
+        uint32_t variation = dbc->getUInt32(r, varF);
+        uint32_t color = dbc->getUInt32(r, colF);
+        // We only cache sections 0 (skin), 1 (face), 3 (hair), 4 (underwear)
+        if (section != 0 && section != 1 && section != 3 && section != 4) continue;
+        for (int ti = 0; ti < 3; ti++) {
+            std::string tex = dbc->getString(r, tex1F + ti);
+            if (tex.empty()) continue;
+            // Key: race(8)|sex(4)|section(4)|variation(8)|color(8)|texIndex(2) packed into 64 bits
+            uint64_t key = (static_cast<uint64_t>(race) << 26) |
+                           (static_cast<uint64_t>(sex & 0xF) << 22) |
+                           (static_cast<uint64_t>(section & 0xF) << 18) |
+                           (static_cast<uint64_t>(variation & 0xFF) << 10) |
+                           (static_cast<uint64_t>(color & 0xFF) << 2) |
+                           static_cast<uint64_t>(ti);
+            charSectionsCache_.emplace(key, tex);
+        }
+    }
+    charSectionsCacheBuilt_ = true;
+    LOG_INFO("CharSections cache built: ", charSectionsCache_.size(), " entries");
+}
+
+std::string Application::lookupCharSection(uint8_t race, uint8_t sex, uint8_t section,
+                                           uint8_t variation, uint8_t color, int texIndex) const {
+    uint64_t key = (static_cast<uint64_t>(race) << 26) |
+                   (static_cast<uint64_t>(sex & 0xF) << 22) |
+                   (static_cast<uint64_t>(section & 0xF) << 18) |
+                   (static_cast<uint64_t>(variation & 0xFF) << 10) |
+                   (static_cast<uint64_t>(color & 0xFF) << 2) |
+                   static_cast<uint64_t>(texIndex);
+    auto it = charSectionsCache_.find(key);
+    return (it != charSectionsCache_.end()) ? it->second : std::string();
+}
+
 void Application::buildCreatureDisplayLookups() {
     if (creatureLookupsBuilt_ || !assetManager || !assetManager->isInitialized()) return;
 
@@ -4479,6 +4557,47 @@ bool Application::getRenderFootZForGuid(uint64_t guid, float& outFootZ) const {
     return renderer->getCharacterRenderer()->getInstanceFootZ(instanceId, outFootZ);
 }
 
+pipeline::M2Model Application::loadCreatureM2Sync(const std::string& m2Path) {
+    auto m2Data = assetManager->readFile(m2Path);
+    if (m2Data.empty()) {
+        LOG_WARNING("Failed to read creature M2: ", m2Path);
+        return {};
+    }
+
+    pipeline::M2Model model = pipeline::M2Loader::load(m2Data);
+    if (model.vertices.empty()) {
+        LOG_WARNING("Failed to parse creature M2: ", m2Path);
+        return {};
+    }
+
+    // Load skin file (only for WotLK M2s - vanilla has embedded skin)
+    if (model.version >= 264) {
+        std::string skinPath = m2Path.substr(0, m2Path.size() - 3) + "00.skin";
+        auto skinData = assetManager->readFile(skinPath);
+        if (!skinData.empty()) {
+            pipeline::M2Loader::loadSkin(skinData, model);
+        } else {
+            LOG_WARNING("Missing skin file for WotLK creature M2: ", skinPath);
+        }
+    }
+
+    // Load external .anim files for sequences without flag 0x20
+    std::string basePath = m2Path.substr(0, m2Path.size() - 3);
+    for (uint32_t si = 0; si < model.sequences.size(); si++) {
+        if (!(model.sequences[si].flags & 0x20)) {
+            char animFileName[256];
+            snprintf(animFileName, sizeof(animFileName), "%s%04u-%02u.anim",
+                basePath.c_str(), model.sequences[si].id, model.sequences[si].variationIndex);
+            auto animData = assetManager->readFileOptional(animFileName);
+            if (!animData.empty()) {
+                pipeline::M2Loader::loadAnimFile(m2Data, animData, si, model);
+            }
+        }
+    }
+
+    return model;
+}
+
 void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x, float y, float z, float orientation) {
     if (!renderer || !renderer->getCharacterRenderer() || !assetManager) return;
 
@@ -4525,47 +4644,13 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
         // Load model from disk (only once per displayId)
         modelId = nextCreatureModelId_++;
 
-        auto m2Data = assetManager->readFile(m2Path);
-        if (m2Data.empty()) {
-            LOG_WARNING("Failed to read creature M2: ", m2Path);
+        pipeline::M2Model model = loadCreatureM2Sync(m2Path);
+        if (!model.isValid()) {
             nonRenderableCreatureDisplayIds_.insert(displayId);
             creaturePermanentFailureGuids_.insert(guid);
             return;
         }
 
-        pipeline::M2Model model = pipeline::M2Loader::load(m2Data);
-        if (model.vertices.empty()) {
-            LOG_WARNING("Failed to parse creature M2: ", m2Path);
-            nonRenderableCreatureDisplayIds_.insert(displayId);
-            creaturePermanentFailureGuids_.insert(guid);
-            return;
-        }
-
-        // Load skin file (only for WotLK M2s - vanilla has embedded skin)
-        if (model.version >= 264) {
-            std::string skinPath = m2Path.substr(0, m2Path.size() - 3) + "00.skin";
-            auto skinData = assetManager->readFile(skinPath);
-            if (!skinData.empty()) {
-                pipeline::M2Loader::loadSkin(skinData, model);
-            } else {
-                LOG_WARNING("Missing skin file for WotLK creature M2: ", skinPath);
-            }
-        }
-
-        // Load external .anim files for sequences without flag 0x20
-        std::string basePath = m2Path.substr(0, m2Path.size() - 3);
-        for (uint32_t si = 0; si < model.sequences.size(); si++) {
-            if (!(model.sequences[si].flags & 0x20)) {
-                char animFileName[256];
-                snprintf(animFileName, sizeof(animFileName), "%s%04u-%02u.anim",
-                    basePath.c_str(), model.sequences[si].id, model.sequences[si].variationIndex);
-                auto animData = assetManager->readFileOptional(animFileName);
-                if (!animData.empty()) {
-                    pipeline::M2Loader::loadAnimFile(m2Data, animData, si, model);
-                }
-            }
-        }
-
         if (!charRenderer->loadModel(model, modelId)) {
             LOG_WARNING("Failed to load creature model: ", m2Path);
             nonRenderableCreatureDisplayIds_.insert(displayId);
@@ -4576,9 +4661,13 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
         displayIdModelCache_[displayId] = modelId;
     }
 
-    // Apply skin textures from CreatureDisplayInfo.dbc (only for newly loaded models)
+    // Apply skin textures from CreatureDisplayInfo.dbc (only once per displayId model).
+    // Track separately from model cache because async loading may upload the model
+    // before textures are applied.
     auto itDisplayData = displayDataMap_.find(displayId);
-    if (!modelCached && itDisplayData != displayDataMap_.end()) {
+    bool needsTextures = (displayIdTexturesApplied_.find(displayId) == displayIdTexturesApplied_.end());
+    if (needsTextures && itDisplayData != displayDataMap_.end()) {
+        displayIdTexturesApplied_.insert(displayId);
         const auto& dispData = itDisplayData->second;
 
         // Get model directory for texture path construction
@@ -5058,7 +5147,9 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
 
     // Per-instance hair/skin texture overrides — runs for ALL NPCs (including cached models)
     // so that each NPC gets its own hair/skin color regardless of model sharing.
+    // Uses pre-built CharSections cache (O(1) lookup instead of O(N) DBC scan).
     {
+        if (!charSectionsCacheBuilt_) buildCharSectionsCache();
         auto itDD = displayDataMap_.find(displayId);
         if (itDD != displayDataMap_.end() && itDD->second.extraDisplayId != 0) {
             auto itExtra2 = humanoidExtraMap_.find(itDD->second.extraDisplayId);
@@ -5066,37 +5157,19 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
                 const auto& extra = itExtra2->second;
                 const auto* md = charRenderer->getModelData(modelId);
                 if (md) {
-                    auto charSectionsDbc2 = assetManager->loadDBC("CharSections.dbc");
-                    if (charSectionsDbc2) {
-                        const auto* csL = pipeline::getActiveDBCLayout()
-                            ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
-                        uint32_t tgtRace = static_cast<uint32_t>(extra.raceId);
-                        uint32_t tgtSex = static_cast<uint32_t>(extra.sexId);
-
-                        // Look up hair texture (section 3)
+                        // Look up hair texture (section 3) via cache
                         rendering::VkTexture* whiteTex = charRenderer->loadTexture("");
-                        for (uint32_t r = 0; r < charSectionsDbc2->getRecordCount(); r++) {
-                            uint32_t rId = charSectionsDbc2->getUInt32(r, csL ? (*csL)["RaceID"] : 1);
-                            uint32_t sId = charSectionsDbc2->getUInt32(r, csL ? (*csL)["SexID"] : 2);
-                            if (rId != tgtRace || sId != tgtSex) continue;
-                            uint32_t sec = charSectionsDbc2->getUInt32(r, csL ? (*csL)["BaseSection"] : 3);
-                            if (sec != 3) continue;
-                            uint32_t var = charSectionsDbc2->getUInt32(r, csL ? (*csL)["VariationIndex"] : 4);
-                            uint32_t col = charSectionsDbc2->getUInt32(r, csL ? (*csL)["ColorIndex"] : 5);
-                            if (var != static_cast<uint32_t>(extra.hairStyleId)) continue;
-                            if (col != static_cast<uint32_t>(extra.hairColorId)) continue;
-                            std::string hairPath = charSectionsDbc2->getString(r, csL ? (*csL)["Texture1"] : 6);
-                            if (!hairPath.empty()) {
-                                rendering::VkTexture* hairTex = charRenderer->loadTexture(hairPath);
-                                if (hairTex && hairTex != whiteTex) {
-                                    for (size_t ti = 0; ti < md->textures.size(); ti++) {
-                                        if (md->textures[ti].type == 6) {
-                                            charRenderer->setTextureSlotOverride(instanceId, static_cast<uint16_t>(ti), hairTex);
-                                        }
+                        std::string hairPath = lookupCharSection(
+                            extra.raceId, extra.sexId, 3, extra.hairStyleId, extra.hairColorId, 0);
+                        if (!hairPath.empty()) {
+                            rendering::VkTexture* hairTex = charRenderer->loadTexture(hairPath);
+                            if (hairTex && hairTex != whiteTex) {
+                                for (size_t ti = 0; ti < md->textures.size(); ti++) {
+                                    if (md->textures[ti].type == 6) {
+                                        charRenderer->setTextureSlotOverride(instanceId, static_cast<uint16_t>(ti), hairTex);
                                     }
                                 }
                             }
-                            break;
                         }
 
                         // Look up skin texture (section 0) for per-instance skin color.
@@ -5108,30 +5181,20 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
                                 if (extra.equipDisplayId[s] != 0) hasEquipOrBake = true;
                         }
                         if (!hasEquipOrBake) {
-                            for (uint32_t r = 0; r < charSectionsDbc2->getRecordCount(); r++) {
-                                uint32_t rId = charSectionsDbc2->getUInt32(r, csL ? (*csL)["RaceID"] : 1);
-                                uint32_t sId = charSectionsDbc2->getUInt32(r, csL ? (*csL)["SexID"] : 2);
-                                if (rId != tgtRace || sId != tgtSex) continue;
-                                uint32_t sec = charSectionsDbc2->getUInt32(r, csL ? (*csL)["BaseSection"] : 3);
-                                if (sec != 0) continue;
-                                uint32_t col = charSectionsDbc2->getUInt32(r, csL ? (*csL)["ColorIndex"] : 5);
-                                if (col != static_cast<uint32_t>(extra.skinId)) continue;
-                                std::string skinPath = charSectionsDbc2->getString(r, csL ? (*csL)["Texture1"] : 6);
-                                if (!skinPath.empty()) {
-                                    rendering::VkTexture* skinTex = charRenderer->loadTexture(skinPath);
-                                    if (skinTex) {
-                                        for (size_t ti = 0; ti < md->textures.size(); ti++) {
-                                            uint32_t tt = md->textures[ti].type;
-                                            if (tt == 1 || tt == 11) {
-                                                charRenderer->setTextureSlotOverride(instanceId, static_cast<uint16_t>(ti), skinTex);
-                                            }
+                            std::string skinPath = lookupCharSection(
+                                extra.raceId, extra.sexId, 0, 0, extra.skinId, 0);
+                            if (!skinPath.empty()) {
+                                rendering::VkTexture* skinTex = charRenderer->loadTexture(skinPath);
+                                if (skinTex) {
+                                    for (size_t ti = 0; ti < md->textures.size(); ti++) {
+                                        uint32_t tt = md->textures[ti].type;
+                                        if (tt == 1 || tt == 11) {
+                                            charRenderer->setTextureSlotOverride(instanceId, static_cast<uint16_t>(ti), skinTex);
                                         }
                                     }
                                 }
-                                break;
                             }
                         }
-                    }
                 }
             }
         }
@@ -6692,19 +6755,94 @@ void Application::spawnOnlineGameObject(uint64_t guid, uint32_t entry, uint32_t
              " displayId=", displayId, " at (", x, ", ", y, ", ", z, ")");
 }
 
+void Application::processAsyncCreatureResults() {
+    // Check completed async model loads and finalize on main thread (GPU upload + instance creation).
+    for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) {
+        if (!it->future.valid() ||
+            it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
+            ++it;
+            continue;
+        }
+        auto result = it->future.get();
+        it = asyncCreatureLoads_.erase(it);
+
+        if (result.permanent_failure) {
+            nonRenderableCreatureDisplayIds_.insert(result.displayId);
+            creaturePermanentFailureGuids_.insert(result.guid);
+            pendingCreatureSpawnGuids_.erase(result.guid);
+            creatureSpawnRetryCounts_.erase(result.guid);
+            continue;
+        }
+        if (!result.valid || !result.model) {
+            pendingCreatureSpawnGuids_.erase(result.guid);
+            creatureSpawnRetryCounts_.erase(result.guid);
+            continue;
+        }
+
+        // Model parsed on background thread — upload to GPU on main thread.
+        auto* charRenderer = renderer ? renderer->getCharacterRenderer() : nullptr;
+        if (!charRenderer) {
+            pendingCreatureSpawnGuids_.erase(result.guid);
+            continue;
+        }
+
+        // Upload model to GPU (must happen on main thread)
+        if (!charRenderer->loadModel(*result.model, result.modelId)) {
+            nonRenderableCreatureDisplayIds_.insert(result.displayId);
+            creaturePermanentFailureGuids_.insert(result.guid);
+            pendingCreatureSpawnGuids_.erase(result.guid);
+            creatureSpawnRetryCounts_.erase(result.guid);
+            continue;
+        }
+        displayIdModelCache_[result.displayId] = result.modelId;
+
+        pendingCreatureSpawnGuids_.erase(result.guid);
+        creatureSpawnRetryCounts_.erase(result.guid);
+
+        // Re-queue as a normal pending spawn — model is now cached, so sync spawn is fast
+        // (only creates instance + applies textures, no file I/O).
+        if (!creatureInstances_.count(result.guid) &&
+            !creaturePermanentFailureGuids_.count(result.guid)) {
+            PendingCreatureSpawn s{};
+            s.guid = result.guid;
+            s.displayId = result.displayId;
+            s.x = result.x;
+            s.y = result.y;
+            s.z = result.z;
+            s.orientation = result.orientation;
+            pendingCreatureSpawns_.push_back(s);
+            pendingCreatureSpawnGuids_.insert(result.guid);
+        }
+    }
+}
+
 void Application::processCreatureSpawnQueue() {
+    // First, finalize any async model loads that completed on background threads.
+    processAsyncCreatureResults();
+
     if (pendingCreatureSpawns_.empty()) return;
     if (!creatureLookupsBuilt_) {
         buildCreatureDisplayLookups();
         if (!creatureLookupsBuilt_) return;
     }
 
+    auto startTime = std::chrono::steady_clock::now();
+    // Budget: max 4ms per frame for creature spawning to prevent stutter.
+    static constexpr float kSpawnBudgetMs = 4.0f;
+
     int processed = 0;
-    int newModelLoads = 0;
+    int asyncLaunched = 0;
     size_t rotationsLeft = pendingCreatureSpawns_.size();
     while (!pendingCreatureSpawns_.empty() &&
            processed < MAX_SPAWNS_PER_FRAME &&
            rotationsLeft > 0) {
+        // Check time budget after each spawn (not for the first one, always process at least 1)
+        if (processed > 0) {
+            auto now = std::chrono::steady_clock::now();
+            float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
+            if (elapsedMs >= kSpawnBudgetMs) break;
+        }
+
         PendingCreatureSpawn s = pendingCreatureSpawns_.front();
         pendingCreatureSpawns_.erase(pendingCreatureSpawns_.begin());
 
@@ -6717,14 +6855,106 @@ void Application::processCreatureSpawnQueue() {
         }
 
         const bool needsNewModel = (displayIdModelCache_.find(s.displayId) == displayIdModelCache_.end());
-        if (needsNewModel && newModelLoads >= MAX_NEW_CREATURE_MODELS_PER_FRAME) {
-            // Defer additional first-time model/texture loads to later frames so
-            // movement stays responsive in dense areas.
-            pendingCreatureSpawns_.push_back(s);
-            rotationsLeft--;
+
+        // For new models: launch async load on background thread instead of blocking.
+        if (needsNewModel) {
+            if (static_cast<int>(asyncCreatureLoads_.size()) + asyncLaunched >= MAX_ASYNC_CREATURE_LOADS) {
+                // Too many in-flight — defer to next frame
+                pendingCreatureSpawns_.push_back(s);
+                rotationsLeft--;
+                continue;
+            }
+
+            std::string m2Path = getModelPathForDisplayId(s.displayId);
+            if (m2Path.empty()) {
+                nonRenderableCreatureDisplayIds_.insert(s.displayId);
+                creaturePermanentFailureGuids_.insert(s.guid);
+                pendingCreatureSpawnGuids_.erase(s.guid);
+                creatureSpawnRetryCounts_.erase(s.guid);
+                processed++;
+                rotationsLeft = pendingCreatureSpawns_.size();
+                continue;
+            }
+
+            // Check for invisible stalkers
+            {
+                std::string lowerPath = m2Path;
+                std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(),
+                               [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                if (lowerPath.find("invisiblestalker") != std::string::npos ||
+                    lowerPath.find("invisible_stalker") != std::string::npos) {
+                    nonRenderableCreatureDisplayIds_.insert(s.displayId);
+                    creaturePermanentFailureGuids_.insert(s.guid);
+                    pendingCreatureSpawnGuids_.erase(s.guid);
+                    processed++;
+                    rotationsLeft = pendingCreatureSpawns_.size();
+                    continue;
+                }
+            }
+
+            // Launch async M2 load — file I/O and parsing happen off the main thread.
+            uint32_t modelId = nextCreatureModelId_++;
+            auto* am = assetManager.get();
+            AsyncCreatureLoad load;
+            load.future = std::async(std::launch::async,
+                [am, m2Path, modelId, s]() -> PreparedCreatureModel {
+                    PreparedCreatureModel result;
+                    result.guid = s.guid;
+                    result.displayId = s.displayId;
+                    result.modelId = modelId;
+                    result.x = s.x;
+                    result.y = s.y;
+                    result.z = s.z;
+                    result.orientation = s.orientation;
+
+                    auto m2Data = am->readFile(m2Path);
+                    if (m2Data.empty()) {
+                        result.permanent_failure = true;
+                        return result;
+                    }
+
+                    auto model = std::make_shared<pipeline::M2Model>(pipeline::M2Loader::load(m2Data));
+                    if (model->vertices.empty()) {
+                        result.permanent_failure = true;
+                        return result;
+                    }
+
+                    // Load skin file
+                    if (model->version >= 264) {
+                        std::string skinPath = m2Path.substr(0, m2Path.size() - 3) + "00.skin";
+                        auto skinData = am->readFile(skinPath);
+                        if (!skinData.empty()) {
+                            pipeline::M2Loader::loadSkin(skinData, *model);
+                        }
+                    }
+
+                    // Load external .anim files
+                    std::string basePath = m2Path.substr(0, m2Path.size() - 3);
+                    for (uint32_t si = 0; si < model->sequences.size(); si++) {
+                        if (!(model->sequences[si].flags & 0x20)) {
+                            char animFileName[256];
+                            snprintf(animFileName, sizeof(animFileName), "%s%04u-%02u.anim",
+                                basePath.c_str(), model->sequences[si].id, model->sequences[si].variationIndex);
+                            auto animData = am->readFileOptional(animFileName);
+                            if (!animData.empty()) {
+                                pipeline::M2Loader::loadAnimFile(m2Data, animData, si, *model);
+                            }
+                        }
+                    }
+
+                    result.model = std::move(model);
+                    result.valid = true;
+                    return result;
+                });
+            asyncCreatureLoads_.push_back(std::move(load));
+            asyncLaunched++;
+            // Don't erase from pendingCreatureSpawnGuids_ — the async result handler will do it
+            rotationsLeft = pendingCreatureSpawns_.size();
+            processed++;
             continue;
         }
 
+        // Cached model — spawn is fast (no file I/O, just instance creation + texture setup)
         spawnOnlineCreature(s.guid, s.displayId, s.x, s.y, s.z, s.orientation);
         pendingCreatureSpawnGuids_.erase(s.guid);
 
@@ -6752,9 +6982,6 @@ void Application::processCreatureSpawnQueue() {
         } else {
             creatureSpawnRetryCounts_.erase(s.guid);
         }
-        if (needsNewModel) {
-            newModelLoads++;
-        }
         rotationsLeft = pendingCreatureSpawns_.size();
         processed++;
     }

From 884b72bc1c2cb8e05c7ccf71316f175e81ab3962 Mon Sep 17 00:00:00 2001
From: Kelsi <kelsihates2fa@gmail.com>
Date: Sat, 7 Mar 2026 11:59:19 -0800
Subject: [PATCH 2/9] Incremental terrain upload + M2 instance dedup hash for
 city stutter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Terrain finalization was uploading all 256 chunks (GPU fence waits) in one
atomic advanceFinalization call that couldn't be interrupted by the 5ms time
budget. Now split into incremental batches of 16 chunks per call, allowing
the time budget to yield between batches.

M2 instance creation had O(N) dedup scans iterating ALL instances to check
for duplicates. In cities with 5000+ doodads, this caused O(N²) total work
during tile loading. Replaced with hash-based DedupKey map for O(1) lookups.

Changes:
- TerrainRenderer::loadTerrainIncremental: uploads N chunks per call
- FinalizingTile tracks terrainChunkNext for cross-frame progress
- TERRAIN phase yields after preload and after each chunk batch
- M2Renderer::DedupKey hash map replaces linear scan in createInstance
  and createInstanceWithMatrix
- Dedup map maintained through rebuildSpatialIndex and clear paths
---
 include/rendering/m2_renderer.hpp      | 19 ++++++
 include/rendering/terrain_manager.hpp  |  5 ++
 include/rendering/terrain_renderer.hpp |  7 +++
 src/rendering/m2_renderer.cpp          | 65 ++++++++++++++------
 src/rendering/terrain_manager.cpp      | 48 +++++++++------
 src/rendering/terrain_renderer.cpp     | 84 ++++++++++++++++++++++++++
 6 files changed, 193 insertions(+), 35 deletions(-)

diff --git a/include/rendering/m2_renderer.hpp b/include/rendering/m2_renderer.hpp
index f53fb4bf..91616a28 100644
--- a/include/rendering/m2_renderer.hpp
+++ b/include/rendering/m2_renderer.hpp
@@ -389,6 +389,25 @@ private:
     std::unordered_map<uint32_t, M2ModelGPU> models;
     std::vector<M2Instance> instances;
 
+    // O(1) dedup: key = (modelId, quantized x, quantized y, quantized z) → instanceId
+    struct DedupKey {
+        uint32_t modelId;
+        int32_t qx, qy, qz; // position quantized to 0.1 units
+        bool operator==(const DedupKey& o) const {
+            return modelId == o.modelId && qx == o.qx && qy == o.qy && qz == o.qz;
+        }
+    };
+    struct DedupHash {
+        size_t operator()(const DedupKey& k) const {
+            size_t h = std::hash<uint32_t>()(k.modelId);
+            h ^= std::hash<int32_t>()(k.qx) * 2654435761u;
+            h ^= std::hash<int32_t>()(k.qy) * 40503u;
+            h ^= std::hash<int32_t>()(k.qz) * 12289u;
+            return h;
+        }
+    };
+    std::unordered_map<DedupKey, uint32_t, DedupHash> instanceDedupMap_;
+
     uint32_t nextInstanceId = 1;
     uint32_t lastDrawCallCount = 0;
     size_t modelCacheLimit_ = 6000;
diff --git a/include/rendering/terrain_manager.hpp b/include/rendering/terrain_manager.hpp
index 0090edc4..efede0c9 100644
--- a/include/rendering/terrain_manager.hpp
+++ b/include/rendering/terrain_manager.hpp
@@ -150,6 +150,11 @@ struct FinalizingTile {
     size_t wmoModelIndex = 0;      // Next WMO model to upload
     size_t wmoDoodadIndex = 0;     // Next WMO doodad to upload
 
+    // Incremental terrain upload state (splits TERRAIN phase across frames)
+    bool terrainPreloaded = false;  // True after preloaded textures uploaded
+    int terrainChunkNext = 0;       // Next chunk index to upload (0-255, row-major)
+    bool terrainMeshDone = false;   // True when all chunks uploaded
+
     // Accumulated results (built up across phases)
     std::vector<uint32_t> m2InstanceIds;
     std::vector<uint32_t> wmoInstanceIds;
diff --git a/include/rendering/terrain_renderer.hpp b/include/rendering/terrain_renderer.hpp
index 91279e9c..a1d433d1 100644
--- a/include/rendering/terrain_renderer.hpp
+++ b/include/rendering/terrain_renderer.hpp
@@ -86,6 +86,13 @@ public:
                      const std::vector<std::string>& texturePaths,
                      int tileX = -1, int tileY = -1);
 
+    /// Upload a batch of terrain chunks incrementally. Returns true when all chunks done.
+    /// chunkIndex is updated to the next chunk to process (0-255 row-major).
+    bool loadTerrainIncremental(const pipeline::TerrainMesh& mesh,
+                                const std::vector<std::string>& texturePaths,
+                                int tileX, int tileY,
+                                int& chunkIndex, int maxChunksPerCall = 16);
+
     void removeTile(int tileX, int tileY);
 
     void uploadPreloadedTextures(const std::unordered_map<std::string, pipeline::BLPImage>& textures);
diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp
index d76843a0..d6df9dfe 100644
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@@ -678,6 +678,7 @@ void M2Renderer::shutdown() {
     instances.clear();
     spatialGrid.clear();
     instanceIndexById.clear();
+    instanceDedupMap_.clear();
 
     // Delete cached textures
     textureCache.clear();
@@ -1613,17 +1614,16 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
     }
     const auto& mdlRef = modelIt->second;
 
-    // Ground clutter is procedurally scattered and high-count; avoid O(N) dedup
-    // scans that can hitch when new tiles stream in.
+    // Deduplicate: skip if same model already at nearly the same position.
+    // Uses hash map for O(1) lookup instead of O(N) scan.
     if (!mdlRef.isGroundDetail) {
-        // Deduplicate: skip if same model already at nearly the same position
-        for (const auto& existing : instances) {
-            if (existing.modelId == modelId) {
-                glm::vec3 d = existing.position - position;
-                if (glm::dot(d, d) < 0.01f) {
-                    return existing.id;
-                }
-            }
+        DedupKey dk{modelId,
+                    static_cast<int32_t>(std::round(position.x * 10.0f)),
+                    static_cast<int32_t>(std::round(position.y * 10.0f)),
+                    static_cast<int32_t>(std::round(position.z * 10.0f))};
+        auto dit = instanceDedupMap_.find(dk);
+        if (dit != instanceDedupMap_.end()) {
+            return dit->second;
         }
     }
 
@@ -1662,6 +1662,15 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
         instance.variationTimer = 3000.0f + static_cast<float>(rand() % 8000);
     }
 
+    // Register in dedup map before pushing (uses original position, not ground-adjusted)
+    if (!mdlRef.isGroundDetail) {
+        DedupKey dk{modelId,
+                    static_cast<int32_t>(std::round(position.x * 10.0f)),
+                    static_cast<int32_t>(std::round(position.y * 10.0f)),
+                    static_cast<int32_t>(std::round(position.z * 10.0f))};
+        instanceDedupMap_[dk] = instance.id;
+    }
+
     instances.push_back(instance);
     size_t idx = instances.size() - 1;
     // Track special instances for fast-path iteration
@@ -1700,13 +1709,15 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
         return 0;
     }
 
-    // Deduplicate: skip if same model already at nearly the same position
-    for (const auto& existing : instances) {
-        if (existing.modelId == modelId) {
-            glm::vec3 d = existing.position - position;
-            if (glm::dot(d, d) < 0.01f) {
-                return existing.id;
-            }
+    // Deduplicate: O(1) hash lookup
+    {
+        DedupKey dk{modelId,
+                    static_cast<int32_t>(std::round(position.x * 10.0f)),
+                    static_cast<int32_t>(std::round(position.y * 10.0f)),
+                    static_cast<int32_t>(std::round(position.z * 10.0f))};
+        auto dit = instanceDedupMap_.find(dk);
+        if (dit != instanceDedupMap_.end()) {
+            return dit->second;
         }
     }
 
@@ -1743,6 +1754,15 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
         instance.animTime = static_cast<float>(rand()) / RAND_MAX * 10000.0f;
     }
 
+    // Register in dedup map
+    {
+        DedupKey dk{modelId,
+                    static_cast<int32_t>(std::round(position.x * 10.0f)),
+                    static_cast<int32_t>(std::round(position.y * 10.0f)),
+                    static_cast<int32_t>(std::round(position.z * 10.0f))};
+        instanceDedupMap_[dk] = instance.id;
+    }
+
     instances.push_back(instance);
     size_t idx = instances.size() - 1;
     if (mdl2.isSmoke) {
@@ -3477,6 +3497,7 @@ void M2Renderer::clear() {
     instances.clear();
     spatialGrid.clear();
     instanceIndexById.clear();
+    instanceDedupMap_.clear();
     smokeParticles.clear();
     smokeInstanceIndices_.clear();
     portalInstanceIndices_.clear();
@@ -3513,6 +3534,7 @@ M2Renderer::GridCell M2Renderer::toCell(const glm::vec3& p) const {
 void M2Renderer::rebuildSpatialIndex() {
     spatialGrid.clear();
     instanceIndexById.clear();
+    instanceDedupMap_.clear();
     instanceIndexById.reserve(instances.size());
     smokeInstanceIndices_.clear();
     portalInstanceIndices_.clear();
@@ -3524,6 +3546,15 @@ void M2Renderer::rebuildSpatialIndex() {
         const auto& inst = instances[i];
         instanceIndexById[inst.id] = i;
 
+        // Rebuild dedup map (skip ground detail)
+        if (!inst.cachedIsGroundDetail) {
+            DedupKey dk{inst.modelId,
+                        static_cast<int32_t>(std::round(inst.position.x * 10.0f)),
+                        static_cast<int32_t>(std::round(inst.position.y * 10.0f)),
+                        static_cast<int32_t>(std::round(inst.position.z * 10.0f))};
+            instanceDedupMap_[dk] = inst.id;
+        }
+
         if (inst.cachedIsSmoke) {
             smokeInstanceIndices_.push_back(i);
         }
diff --git a/src/rendering/terrain_manager.cpp b/src/rendering/terrain_manager.cpp
index b164d969..11204ca2 100644
--- a/src/rendering/terrain_manager.cpp
+++ b/src/rendering/terrain_manager.cpp
@@ -695,27 +695,39 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
             return true;
         }
 
-        LOG_DEBUG("Finalizing tile [", x, ",", y, "] (incremental)");
-
-        // Upload pre-loaded textures
-        if (!pending->preloadedTextures.empty()) {
-            terrainRenderer->uploadPreloadedTextures(pending->preloadedTextures);
-        }
-
-        // Upload terrain mesh to GPU
-        if (!terrainRenderer->loadTerrain(pending->mesh, pending->terrain.textures, x, y)) {
-            LOG_ERROR("Failed to upload terrain to GPU for tile [", x, ",", y, "]");
-            failedTiles[coord] = true;
-            {
-                std::lock_guard<std::mutex> lock(queueMutex);
-                pendingTiles.erase(coord);
+        // Upload pre-loaded textures (once)
+        if (!ft.terrainPreloaded) {
+            LOG_DEBUG("Finalizing tile [", x, ",", y, "] (incremental)");
+            if (!pending->preloadedTextures.empty()) {
+                terrainRenderer->uploadPreloadedTextures(pending->preloadedTextures);
             }
-            ft.phase = FinalizationPhase::DONE;
-            return true;
+            ft.terrainPreloaded = true;
+            // Yield after preload to give time budget a chance to interrupt
+            return false;
         }
 
-        // Load water immediately after terrain (same frame) — water is now
-        // deduplicated to ~1-2 merged surfaces per tile, so this is fast.
+        // Upload terrain chunks incrementally (16 per call to spread across frames)
+        if (!ft.terrainMeshDone) {
+            if (pending->mesh.validChunkCount == 0) {
+                LOG_ERROR("Failed to upload terrain to GPU for tile [", x, ",", y, "]");
+                failedTiles[coord] = true;
+                {
+                    std::lock_guard<std::mutex> lock(queueMutex);
+                    pendingTiles.erase(coord);
+                }
+                ft.phase = FinalizationPhase::DONE;
+                return true;
+            }
+            bool allDone = terrainRenderer->loadTerrainIncremental(
+                pending->mesh, pending->terrain.textures, x, y,
+                ft.terrainChunkNext, 16);
+            if (!allDone) {
+                return false; // More chunks remain — yield to time budget
+            }
+            ft.terrainMeshDone = true;
+        }
+
+        // Load water after all terrain chunks are uploaded
         if (waterRenderer) {
             size_t beforeSurfaces = waterRenderer->getSurfaceCount();
             waterRenderer->loadFromTerrain(pending->terrain, true, x, y);
diff --git a/src/rendering/terrain_renderer.cpp b/src/rendering/terrain_renderer.cpp
index 6e312233..227178d5 100644
--- a/src/rendering/terrain_renderer.cpp
+++ b/src/rendering/terrain_renderer.cpp
@@ -409,6 +409,90 @@ bool TerrainRenderer::loadTerrain(const pipeline::TerrainMesh& mesh,
     return !chunks.empty();
 }
 
+bool TerrainRenderer::loadTerrainIncremental(const pipeline::TerrainMesh& mesh,
+                                              const std::vector<std::string>& texturePaths,
+                                              int tileX, int tileY,
+                                              int& chunkIndex, int maxChunksPerCall) {
+    int uploaded = 0;
+    while (chunkIndex < 256 && uploaded < maxChunksPerCall) {
+        int cy = chunkIndex / 16;
+        int cx = chunkIndex % 16;
+        chunkIndex++;
+
+        const auto& chunk = mesh.getChunk(cx, cy);
+        if (!chunk.isValid()) continue;
+
+        TerrainChunkGPU gpuChunk = uploadChunk(chunk);
+        if (!gpuChunk.isValid()) continue;
+
+        calculateBoundingSphere(gpuChunk, chunk);
+
+        if (!chunk.layers.empty()) {
+            uint32_t baseTexId = chunk.layers[0].textureId;
+            if (baseTexId < texturePaths.size()) {
+                gpuChunk.baseTexture = loadTexture(texturePaths[baseTexId]);
+            } else {
+                gpuChunk.baseTexture = whiteTexture.get();
+            }
+
+            for (size_t i = 1; i < chunk.layers.size() && i < 4; i++) {
+                const auto& layer = chunk.layers[i];
+                int li = static_cast<int>(i) - 1;
+
+                VkTexture* layerTex = whiteTexture.get();
+                if (layer.textureId < texturePaths.size()) {
+                    layerTex = loadTexture(texturePaths[layer.textureId]);
+                }
+                gpuChunk.layerTextures[li] = layerTex;
+
+                VkTexture* alphaTex = opaqueAlphaTexture.get();
+                if (!layer.alphaData.empty()) {
+                    alphaTex = createAlphaTexture(layer.alphaData);
+                }
+                gpuChunk.alphaTextures[li] = alphaTex;
+                gpuChunk.layerCount = static_cast<int>(i);
+            }
+        } else {
+            gpuChunk.baseTexture = whiteTexture.get();
+        }
+
+        gpuChunk.tileX = tileX;
+        gpuChunk.tileY = tileY;
+
+        TerrainParamsUBO params{};
+        params.layerCount = gpuChunk.layerCount;
+        params.hasLayer1 = gpuChunk.layerCount >= 1 ? 1 : 0;
+        params.hasLayer2 = gpuChunk.layerCount >= 2 ? 1 : 0;
+        params.hasLayer3 = gpuChunk.layerCount >= 3 ? 1 : 0;
+
+        VkBufferCreateInfo bufCI{};
+        bufCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+        bufCI.size = sizeof(TerrainParamsUBO);
+        bufCI.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+
+        VmaAllocationCreateInfo allocCI{};
+        allocCI.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+        allocCI.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+
+        VmaAllocationInfo mapInfo{};
+        vmaCreateBuffer(vkCtx->getAllocator(), &bufCI, &allocCI,
+                        &gpuChunk.paramsUBO, &gpuChunk.paramsAlloc, &mapInfo);
+        if (mapInfo.pMappedData) {
+            std::memcpy(mapInfo.pMappedData, &params, sizeof(params));
+        }
+
+        gpuChunk.materialSet = allocateMaterialSet();
+        if (gpuChunk.materialSet) {
+            writeMaterialDescriptors(gpuChunk.materialSet, gpuChunk);
+        }
+
+        chunks.push_back(std::move(gpuChunk));
+        uploaded++;
+    }
+
+    return chunkIndex >= 256;
+}
+
 TerrainChunkGPU TerrainRenderer::uploadChunk(const pipeline::ChunkMesh& chunk) {
     TerrainChunkGPU gpuChunk;
 

From 16b43367003334747a44f1089a619b1e1d1cb7a3 Mon Sep 17 00:00:00 2001
From: Kelsi <kelsihates2fa@gmail.com>
Date: Sat, 7 Mar 2026 12:19:59 -0800
Subject: [PATCH 3/9] Batch GPU uploads to eliminate per-upload fence waits
 (stutter fix)

Every uploadBuffer/VkTexture::upload called immediateSubmit which did a
separate vkQueueSubmit + vkWaitForFences. Loading a single creature model
with textures caused 4-8+ fence waits; terrain chunks caused 80+ per batch.

Added beginUploadBatch/endUploadBatch to VkContext: records all upload
commands into a single command buffer, submits once with one fence wait.
Staging buffers are deferred for cleanup after the batch completes.

Wrapped in batch mode:
- CharacterRenderer::loadModel (creature VB/IB + textures)
- M2Renderer::loadModel (doodad VB/IB + textures)
- TerrainRenderer::loadTerrain/loadTerrainIncremental (chunk geometry + textures)
- TerrainRenderer::uploadPreloadedTextures
- WMORenderer::loadModel (group geometry + textures)
---
 include/rendering/vk_context.hpp     | 14 ++++++++++++
 src/rendering/character_renderer.cpp |  6 +++++
 src/rendering/m2_renderer.cpp        |  6 +++++
 src/rendering/terrain_renderer.cpp   | 15 ++++++++++++
 src/rendering/vk_context.cpp         | 34 ++++++++++++++++++++++++++++
 src/rendering/vk_texture.cpp         | 12 ++++++++--
 src/rendering/vk_utils.cpp           |  8 +++++--
 src/rendering/wmo_renderer.cpp       |  6 +++++
 8 files changed, 97 insertions(+), 4 deletions(-)

diff --git a/include/rendering/vk_context.hpp b/include/rendering/vk_context.hpp
index 3a242940..dab96d2a 100644
--- a/include/rendering/vk_context.hpp
+++ b/include/rendering/vk_context.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "rendering/vk_utils.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <VkBootstrap.h>
@@ -46,6 +47,13 @@ public:
     // Immediate submit for one-off GPU work (descriptor pool creation, etc.)
     void immediateSubmit(std::function<void(VkCommandBuffer cmd)>&& function);
 
+    // Batch upload mode: records multiple upload commands into a single
+    // command buffer, then submits with ONE fence wait instead of one per upload.
+    void beginUploadBatch();
+    void endUploadBatch();
+    bool isInUploadBatch() const { return inUploadBatch_; }
+    void deferStagingCleanup(AllocatedBuffer staging);
+
     // Accessors
     VkInstance getInstance() const { return instance; }
     VkPhysicalDevice getPhysicalDevice() const { return physicalDevice; }
@@ -143,6 +151,12 @@ private:
     VkCommandPool immCommandPool = VK_NULL_HANDLE;
     VkFence immFence = VK_NULL_HANDLE;
 
+    // Batch upload state (nesting-safe via depth counter)
+    int uploadBatchDepth_ = 0;
+    bool inUploadBatch_ = false;
+    VkCommandBuffer batchCmd_ = VK_NULL_HANDLE;
+    std::vector<AllocatedBuffer> batchStagingBuffers_;
+
     // Depth buffer (shared across all framebuffers)
     VkImage depthImage = VK_NULL_HANDLE;
     VkImageView depthImageView = VK_NULL_HANDLE;
diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp
index 2126e5e5..9aa99c72 100644
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@@ -1247,6 +1247,10 @@ bool CharacterRenderer::loadModel(const pipeline::M2Model& model, uint32_t id) {
     M2ModelGPU gpuModel;
     gpuModel.data = model;
 
+    // Batch all GPU uploads (VB, IB, textures) into a single command buffer
+    // submission with one fence wait, instead of one fence wait per upload.
+    vkCtx_->beginUploadBatch();
+
     // Setup GPU buffers
     setupModelBuffers(gpuModel);
 
@@ -1259,6 +1263,8 @@ bool CharacterRenderer::loadModel(const pipeline::M2Model& model, uint32_t id) {
         gpuModel.textureIds.push_back(texPtr);
     }
 
+    vkCtx_->endUploadBatch();
+
     models[id] = std::move(gpuModel);
 
     core::Logger::getInstance().debug("Loaded M2 model ", id, " (", model.vertices.size(),
diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp
index d6df9dfe..c4e7a727 100644
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@@ -1185,6 +1185,10 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
         }
     }
 
+    // Batch all GPU uploads (VB, IB, textures) into a single command buffer
+    // submission with one fence wait, instead of one fence wait per upload.
+    vkCtx_->beginUploadBatch();
+
     if (hasGeometry) {
         // Create VBO with interleaved vertex data
         // Format: position (3), normal (3), texcoord0 (2), texcoord1 (2), boneWeights (4), boneIndices (4 as float)
@@ -1536,6 +1540,8 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
         }
     }
 
+    vkCtx_->endUploadBatch();
+
     // Allocate Vulkan descriptor sets and UBOs for each batch
     for (auto& bgpu : gpuModel.batches) {
         // Create combined UBO for M2Params (binding 1) + M2Material (binding 2)
diff --git a/src/rendering/terrain_renderer.cpp b/src/rendering/terrain_renderer.cpp
index 227178d5..fb20ce42 100644
--- a/src/rendering/terrain_renderer.cpp
+++ b/src/rendering/terrain_renderer.cpp
@@ -326,6 +326,8 @@ bool TerrainRenderer::loadTerrain(const pipeline::TerrainMesh& mesh,
     }
     LOG_DEBUG("Loading terrain mesh: ", mesh.validChunkCount, " chunks");
 
+    vkCtx->beginUploadBatch();
+
     for (int y = 0; y < 16; y++) {
         for (int x = 0; x < 16; x++) {
             const auto& chunk = mesh.getChunk(x, y);
@@ -405,6 +407,8 @@ bool TerrainRenderer::loadTerrain(const pipeline::TerrainMesh& mesh,
         }
     }
 
+    vkCtx->endUploadBatch();
+
     LOG_DEBUG("Loaded ", chunks.size(), " terrain chunks to GPU");
     return !chunks.empty();
 }
@@ -413,6 +417,10 @@ bool TerrainRenderer::loadTerrainIncremental(const pipeline::TerrainMesh& mesh,
                                               const std::vector<std::string>& texturePaths,
                                               int tileX, int tileY,
                                               int& chunkIndex, int maxChunksPerCall) {
+    // Batch all GPU uploads (VBs, IBs, textures) into a single command buffer
+    // submission with one fence wait, instead of one per buffer/texture.
+    vkCtx->beginUploadBatch();
+
     int uploaded = 0;
     while (chunkIndex < 256 && uploaded < maxChunksPerCall) {
         int cy = chunkIndex / 16;
@@ -490,6 +498,8 @@ bool TerrainRenderer::loadTerrainIncremental(const pipeline::TerrainMesh& mesh,
         uploaded++;
     }
 
+    vkCtx->endUploadBatch();
+
     return chunkIndex >= 256;
 }
 
@@ -580,6 +590,9 @@ void TerrainRenderer::uploadPreloadedTextures(
                        [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
         return key;
     };
+    // Batch all texture uploads into a single command buffer submission
+    vkCtx->beginUploadBatch();
+
     for (const auto& [path, blp] : textures) {
         std::string key = normalizeKey(path);
         if (textureCache.find(key) != textureCache.end()) continue;
@@ -599,6 +612,8 @@ void TerrainRenderer::uploadPreloadedTextures(
         textureCacheBytes_ += e.approxBytes;
         textureCache[key] = std::move(e);
     }
+
+    vkCtx->endUploadBatch();
 }
 
 VkTexture* TerrainRenderer::createAlphaTexture(const std::vector<uint8_t>& alphaData) {
diff --git a/src/rendering/vk_context.cpp b/src/rendering/vk_context.cpp
index e1a76cee..dc73c685 100644
--- a/src/rendering/vk_context.cpp
+++ b/src/rendering/vk_context.cpp
@@ -1423,10 +1423,44 @@ void VkContext::endSingleTimeCommands(VkCommandBuffer cmd) {
 }
 
 void VkContext::immediateSubmit(std::function<void(VkCommandBuffer cmd)>&& function) {
+    if (inUploadBatch_) {
+        // Record into the batch command buffer — no submit, no fence wait
+        function(batchCmd_);
+        return;
+    }
     VkCommandBuffer cmd = beginSingleTimeCommands();
     function(cmd);
     endSingleTimeCommands(cmd);
 }
 
+void VkContext::beginUploadBatch() {
+    uploadBatchDepth_++;
+    if (inUploadBatch_) return; // already in a batch (nested call)
+    inUploadBatch_ = true;
+    batchCmd_ = beginSingleTimeCommands();
+}
+
+void VkContext::endUploadBatch() {
+    if (uploadBatchDepth_ <= 0) return;
+    uploadBatchDepth_--;
+    if (uploadBatchDepth_ > 0) return; // still inside an outer batch
+
+    inUploadBatch_ = false;
+
+    // Submit all recorded commands with a single fence wait
+    endSingleTimeCommands(batchCmd_);
+    batchCmd_ = VK_NULL_HANDLE;
+
+    // Destroy all deferred staging buffers
+    for (auto& staging : batchStagingBuffers_) {
+        destroyBuffer(allocator, staging);
+    }
+    batchStagingBuffers_.clear();
+}
+
+void VkContext::deferStagingCleanup(AllocatedBuffer staging) {
+    batchStagingBuffers_.push_back(staging);
+}
+
 } // namespace rendering
 } // namespace wowee
diff --git a/src/rendering/vk_texture.cpp b/src/rendering/vk_texture.cpp
index fba6d72b..415e3d56 100644
--- a/src/rendering/vk_texture.cpp
+++ b/src/rendering/vk_texture.cpp
@@ -96,7 +96,11 @@ bool VkTexture::upload(VkContext& ctx, const uint8_t* pixels, uint32_t width, ui
         generateMipmaps(ctx, format, width, height);
     }
 
-    destroyBuffer(ctx.getAllocator(), staging);
+    if (ctx.isInUploadBatch()) {
+        ctx.deferStagingCleanup(staging);
+    } else {
+        destroyBuffer(ctx.getAllocator(), staging);
+    }
     return true;
 }
 
@@ -162,7 +166,11 @@ bool VkTexture::uploadMips(VkContext& ctx, const uint8_t* const* mipData,
             VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
     });
 
-    destroyBuffer(ctx.getAllocator(), staging);
+    if (ctx.isInUploadBatch()) {
+        ctx.deferStagingCleanup(staging);
+    } else {
+        destroyBuffer(ctx.getAllocator(), staging);
+    }
     return true;
 }
 
diff --git a/src/rendering/vk_utils.cpp b/src/rendering/vk_utils.cpp
index d105c986..3a2f51d1 100644
--- a/src/rendering/vk_utils.cpp
+++ b/src/rendering/vk_utils.cpp
@@ -198,8 +198,12 @@ AllocatedBuffer uploadBuffer(VkContext& ctx, const void* data, VkDeviceSize size
         vkCmdCopyBuffer(cmd, staging.buffer, gpuBuffer.buffer, 1, &copyRegion);
     });
 
-    // Destroy staging buffer
-    destroyBuffer(ctx.getAllocator(), staging);
+    // Destroy staging buffer (deferred if in batch mode)
+    if (ctx.isInUploadBatch()) {
+        ctx.deferStagingCleanup(staging);
+    } else {
+        destroyBuffer(ctx.getAllocator(), staging);
+    }
 
     return gpuBuffer;
 }
diff --git a/src/rendering/wmo_renderer.cpp b/src/rendering/wmo_renderer.cpp
index ff6b0035..691abaa1 100644
--- a/src/rendering/wmo_renderer.cpp
+++ b/src/rendering/wmo_renderer.cpp
@@ -419,6 +419,10 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) {
     core::Logger::getInstance().debug("  WMO bounds: min=(", model.boundingBoxMin.x, ", ", model.boundingBoxMin.y, ", ", model.boundingBoxMin.z,
                                       ") max=(", model.boundingBoxMax.x, ", ", model.boundingBoxMax.y, ", ", model.boundingBoxMax.z, ")");
 
+    // Batch all GPU uploads (textures, VBs, IBs) into a single command buffer
+    // submission with one fence wait, instead of one per upload.
+    vkCtx_->beginUploadBatch();
+
     // Load textures for this model
     core::Logger::getInstance().debug("  WMO has ", model.textures.size(), " texture paths, ", model.materials.size(), " materials");
     if (assetManager && !model.textures.empty()) {
@@ -720,6 +724,8 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) {
         groupRes.allUntextured = !anyTextured && !groupRes.mergedBatches.empty();
     }
 
+    vkCtx_->endUploadBatch();
+
     // Copy portal data for visibility culling
     modelData.portalVertices = model.portalVertices;
     for (const auto& portal : model.portals) {

From 25bb63c50ad45697bac78ca98fbe443c0d9a95de Mon Sep 17 00:00:00 2001
From: Kelsi <kelsihates2fa@gmail.com>
Date: Sat, 7 Mar 2026 12:32:39 -0800
Subject: [PATCH 4/9] Faster terrain/model loading: more workers, batched
 finalization, skip redundant I/O

- Worker threads: use (cores - 1-2) instead of cores/2, minimum 4
- Outer upload batch in processReadyTiles: ALL model/texture uploads per
  frame share a single command buffer submission + fence wait
- Upload multiple models per finalization step: 8 M2s, 4 WMOs, 16 doodads
  per call instead of 1 each (all within same GPU batch)
- Terrain chunks: 64 per step instead of 16
- Skip redundant M2 file I/O: thread-safe uploadedM2Ids_ set lets
  background workers skip re-reading+parsing models already on GPU
- processAllReadyTiles (loading screen) and processOneReadyTile also
  wrapped in outer upload batches
---
 include/rendering/terrain_manager.hpp  |   5 +
 include/rendering/terrain_renderer.hpp |   1 +
 src/rendering/terrain_manager.cpp      | 133 ++++++++++++++++++-------
 3 files changed, 105 insertions(+), 34 deletions(-)

diff --git a/include/rendering/terrain_manager.hpp b/include/rendering/terrain_manager.hpp
index efede0c9..1b2af320 100644
--- a/include/rendering/terrain_manager.hpp
+++ b/include/rendering/terrain_manager.hpp
@@ -381,6 +381,11 @@ private:
     std::unordered_set<std::string> missingAdtWarnings_;
     std::mutex missingAdtWarningsMutex_;
 
+    // Thread-safe set of M2 model IDs already uploaded to GPU
+    // (checked by workers to skip redundant file I/O + parsing)
+    std::unordered_set<uint32_t> uploadedM2Ids_;
+    std::mutex uploadedM2IdsMutex_;
+
     // Dedup set for doodad placements across tile boundaries
     std::unordered_set<uint32_t> placedDoodadIds;
 
diff --git a/include/rendering/terrain_renderer.hpp b/include/rendering/terrain_renderer.hpp
index a1d433d1..77af9a64 100644
--- a/include/rendering/terrain_renderer.hpp
+++ b/include/rendering/terrain_renderer.hpp
@@ -127,6 +127,7 @@ public:
     int getRenderedChunkCount() const { return renderedChunks; }
     int getCulledChunkCount() const { return culledChunks; }
     int getTriangleCount() const;
+    VkContext* getVkContext() const { return vkCtx; }
 
 private:
     TerrainChunkGPU uploadChunk(const pipeline::ChunkMesh& chunk);
diff --git a/src/rendering/terrain_manager.cpp b/src/rendering/terrain_manager.cpp
index 11204ca2..3eb1ba1c 100644
--- a/src/rendering/terrain_manager.cpp
+++ b/src/rendering/terrain_manager.cpp
@@ -1,5 +1,6 @@
 #include "rendering/terrain_manager.hpp"
 #include "rendering/terrain_renderer.hpp"
+#include "rendering/vk_context.hpp"
 #include "rendering/water_renderer.hpp"
 #include "rendering/m2_renderer.hpp"
 #include "rendering/wmo_renderer.hpp"
@@ -53,12 +54,12 @@ int computeTerrainWorkerCount() {
 
     unsigned hc = std::thread::hardware_concurrency();
     if (hc > 0) {
-        // Terrain streaming should leave CPU room for render/update threads.
-        const unsigned availableCores = (hc > 1u) ? (hc - 1u) : 1u;
-        const unsigned targetWorkers = std::max(2u, availableCores / 2u);
+        // Use most cores for loading — leave 1-2 for render/update threads.
+        const unsigned reserved = (hc >= 8u) ? 2u : 1u;
+        const unsigned targetWorkers = std::max(4u, hc - reserved);
         return static_cast<int>(targetWorkers);
     }
-    return 2;  // Fallback
+    return 4;  // Fallback
 }
 
 bool decodeLayerAlpha(const pipeline::MapChunk& chunk, size_t layerIdx, std::vector<uint8_t>& outAlpha) {
@@ -372,6 +373,15 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
                                    int& skippedSkinNotFound) -> bool {
         if (preparedModelIds.find(modelId) != preparedModelIds.end()) return true;
 
+        // Skip file I/O + parsing for models already uploaded to GPU from previous tiles
+        {
+            std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
+            if (uploadedM2Ids_.count(modelId)) {
+                preparedModelIds.insert(modelId);
+                return true;
+            }
+        }
+
         std::vector<uint8_t> m2Data = assetManager->readFile(m2Path);
         if (m2Data.empty()) {
             skippedFileNotFound++;
@@ -551,19 +561,30 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
                         }
 
                         uint32_t doodadModelId = static_cast<uint32_t>(std::hash<std::string>{}(m2Path));
-                        std::vector<uint8_t> m2Data = assetManager->readFile(m2Path);
-                        if (m2Data.empty()) continue;
 
-                        pipeline::M2Model m2Model = pipeline::M2Loader::load(m2Data);
-                        if (m2Model.name.empty()) {
-                            m2Model.name = m2Path;
+                        // Skip file I/O if model already uploaded from a previous tile
+                        bool modelAlreadyUploaded = false;
+                        {
+                            std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
+                            modelAlreadyUploaded = uploadedM2Ids_.count(doodadModelId) > 0;
                         }
-                        std::string skinPath = m2Path.substr(0, m2Path.size() - 3) + "00.skin";
-                        std::vector<uint8_t> skinData = assetManager->readFile(skinPath);
-                        if (!skinData.empty() && m2Model.version >= 264) {
-                            pipeline::M2Loader::loadSkin(skinData, m2Model);
+
+                        pipeline::M2Model m2Model;
+                        if (!modelAlreadyUploaded) {
+                            std::vector<uint8_t> m2Data = assetManager->readFile(m2Path);
+                            if (m2Data.empty()) continue;
+
+                            m2Model = pipeline::M2Loader::load(m2Data);
+                            if (m2Model.name.empty()) {
+                                m2Model.name = m2Path;
+                            }
+                            std::string skinPath = m2Path.substr(0, m2Path.size() - 3) + "00.skin";
+                            std::vector<uint8_t> skinData = assetManager->readFile(skinPath);
+                            if (!skinData.empty() && m2Model.version >= 264) {
+                                pipeline::M2Loader::loadSkin(skinData, m2Model);
+                            }
+                            if (!m2Model.isValid()) continue;
                         }
-                        if (!m2Model.isValid()) continue;
 
                         // Build doodad's local transform (WoW coordinates)
                         // WMO doodads use quaternion rotation
@@ -720,7 +741,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
             }
             bool allDone = terrainRenderer->loadTerrainIncremental(
                 pending->mesh, pending->terrain.textures, x, y,
-                ft.terrainChunkNext, 16);
+                ft.terrainChunkNext, 64);
             if (!allDone) {
                 return false; // More chunks remain — yield to time budget
             }
@@ -750,13 +771,21 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
     }
 
     case FinalizationPhase::M2_MODELS: {
-        // Upload ONE M2 model per call
+        // Upload multiple M2 models per call (batched GPU uploads)
         if (m2Renderer && ft.m2ModelIndex < pending->m2Models.size()) {
-            auto& m2Ready = pending->m2Models[ft.m2ModelIndex];
-            if (m2Renderer->loadModel(m2Ready.model, m2Ready.modelId)) {
-                ft.uploadedM2ModelIds.insert(m2Ready.modelId);
+            constexpr size_t kModelsPerStep = 8;
+            size_t uploaded = 0;
+            while (ft.m2ModelIndex < pending->m2Models.size() && uploaded < kModelsPerStep) {
+                auto& m2Ready = pending->m2Models[ft.m2ModelIndex];
+                if (m2Renderer->loadModel(m2Ready.model, m2Ready.modelId)) {
+                    ft.uploadedM2ModelIds.insert(m2Ready.modelId);
+                    // Track uploaded model IDs so background threads can skip re-reading
+                    std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
+                    uploadedM2Ids_.insert(m2Ready.modelId);
+                }
+                ft.m2ModelIndex++;
+                uploaded++;
             }
-            ft.m2ModelIndex++;
             // Stay in this phase until all models uploaded
             if (ft.m2ModelIndex < pending->m2Models.size()) {
                 return false;
@@ -798,22 +827,23 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
     }
 
     case FinalizationPhase::WMO_MODELS: {
-        // Upload ONE WMO model per call
+        // Upload multiple WMO models per call (batched GPU uploads)
         if (wmoRenderer && assetManager) {
             wmoRenderer->initialize(nullptr, VK_NULL_HANDLE, assetManager);
 
-            if (ft.wmoModelIndex < pending->wmoModels.size()) {
+            constexpr size_t kWmosPerStep = 4;
+            size_t uploaded = 0;
+            while (ft.wmoModelIndex < pending->wmoModels.size() && uploaded < kWmosPerStep) {
                 auto& wmoReady = pending->wmoModels[ft.wmoModelIndex];
-                // Deduplicate
                 if (wmoReady.uniqueId != 0 && placedWmoIds.count(wmoReady.uniqueId)) {
                     ft.wmoModelIndex++;
-                    if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
                 } else {
                     wmoRenderer->loadModel(wmoReady.model, wmoReady.modelId);
                     ft.wmoModelIndex++;
-                    if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
+                    uploaded++;
                 }
             }
+            if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
         }
         ft.phase = FinalizationPhase::WMO_INSTANCES;
         return false;
@@ -874,17 +904,25 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
     }
 
     case FinalizationPhase::WMO_DOODADS: {
-        // Upload ONE WMO doodad M2 per call
+        // Upload multiple WMO doodad M2s per call (batched GPU uploads)
         if (m2Renderer && ft.wmoDoodadIndex < pending->wmoDoodads.size()) {
-            auto& doodad = pending->wmoDoodads[ft.wmoDoodadIndex];
-            m2Renderer->loadModel(doodad.model, doodad.modelId);
-            uint32_t wmoDoodadInstId = m2Renderer->createInstanceWithMatrix(
-                doodad.modelId, doodad.modelMatrix, doodad.worldPosition);
-            if (wmoDoodadInstId) {
-                m2Renderer->setSkipCollision(wmoDoodadInstId, true);
-                ft.m2InstanceIds.push_back(wmoDoodadInstId);
+            constexpr size_t kDoodadsPerStep = 16;
+            size_t uploaded = 0;
+            while (ft.wmoDoodadIndex < pending->wmoDoodads.size() && uploaded < kDoodadsPerStep) {
+                auto& doodad = pending->wmoDoodads[ft.wmoDoodadIndex];
+                if (m2Renderer->loadModel(doodad.model, doodad.modelId)) {
+                    std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
+                    uploadedM2Ids_.insert(doodad.modelId);
+                }
+                uint32_t wmoDoodadInstId = m2Renderer->createInstanceWithMatrix(
+                    doodad.modelId, doodad.modelMatrix, doodad.worldPosition);
+                if (wmoDoodadInstId) {
+                    m2Renderer->setSkipCollision(wmoDoodadInstId, true);
+                    ft.m2InstanceIds.push_back(wmoDoodadInstId);
+                }
+                ft.wmoDoodadIndex++;
+                uploaded++;
             }
-            ft.wmoDoodadIndex++;
             if (ft.wmoDoodadIndex < pending->wmoDoodads.size()) return false;
         }
         ft.phase = FinalizationPhase::WATER;
@@ -1062,6 +1100,11 @@ void TerrainManager::processReadyTiles() {
         }
     }
 
+    // Outer upload batch: all GPU uploads across all advanceFinalization calls
+    // this frame share a single command buffer submission + fence wait.
+    VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
+    if (vkCtx) vkCtx->beginUploadBatch();
+
     // Drive incremental finalization within time budget
     while (!finalizingTiles_.empty()) {
         auto& ft = finalizingTiles_.front();
@@ -1077,6 +1120,8 @@ void TerrainManager::processReadyTiles() {
             break;
         }
     }
+
+    if (vkCtx) vkCtx->endUploadBatch();
 }
 
 void TerrainManager::processAllReadyTiles() {
@@ -1094,12 +1139,19 @@ void TerrainManager::processAllReadyTiles() {
             }
         }
     }
+
+    // Batch all GPU uploads across all tiles into a single submission
+    VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
+    if (vkCtx) vkCtx->beginUploadBatch();
+
     // Finalize all tiles completely (no time budget — used for loading screens)
     while (!finalizingTiles_.empty()) {
         auto& ft = finalizingTiles_.front();
         while (!advanceFinalization(ft)) {}
         finalizingTiles_.pop_front();
     }
+
+    if (vkCtx) vkCtx->endUploadBatch();
 }
 
 void TerrainManager::processOneReadyTile() {
@@ -1118,9 +1170,14 @@ void TerrainManager::processOneReadyTile() {
     }
     // Finalize ONE tile completely, then return so caller can update the screen
     if (!finalizingTiles_.empty()) {
+        VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
+        if (vkCtx) vkCtx->beginUploadBatch();
+
         auto& ft = finalizingTiles_.front();
         while (!advanceFinalization(ft)) {}
         finalizingTiles_.pop_front();
+
+        if (vkCtx) vkCtx->endUploadBatch();
     }
 }
 
@@ -1340,6 +1397,10 @@ void TerrainManager::unloadAll() {
     finalizingTiles_.clear();
     placedDoodadIds.clear();
     placedWmoIds.clear();
+    {
+        std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
+        uploadedM2Ids_.clear();
+    }
 
     LOG_INFO("Unloading all terrain tiles");
     loadedTiles.clear();
@@ -1388,6 +1449,10 @@ void TerrainManager::softReset() {
     finalizingTiles_.clear();
     placedDoodadIds.clear();
     placedWmoIds.clear();
+    {
+        std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
+        uploadedM2Ids_.clear();
+    }
 
     // Clear tile cache — keys are (x,y) without map name, so stale entries from
     // a different map with overlapping coordinates would produce wrong geometry.

From 71e8ed5b7d3003d1ac1a1eda9c56bb71a2f88b59 Mon Sep 17 00:00:00 2001
From: Kelsi <kelsihates2fa@gmail.com>
Date: Sat, 7 Mar 2026 12:39:38 -0800
Subject: [PATCH 5/9] Reduce initial load to radius 1 (~5 tiles) for fast game
 entry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Was waiting for all ~50 tiles (radius 4) to fully prepare + finalize
before entering the game. Now loads only the immediate surrounding tiles
during the loading screen, then restores the full radius for in-game
streaming. setLoadRadius just sets an int — actual loading happens lazily
via background workers during the game loop.
---
 src/core/application.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/core/application.cpp b/src/core/application.cpp
index 2a06bd5c..cabcaa01 100644
--- a/src/core/application.cpp
+++ b/src/core/application.cpp
@@ -3925,6 +3925,13 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
             auto* terrainMgr = renderer->getTerrainManager();
             auto* camera = renderer->getCamera();
 
+            // Use a small radius for the initial load (just immediate tiles),
+            // then restore the full radius after entering the game.
+            // This matches WoW's behavior: load quickly, stream the rest in-game.
+            const int savedLoadRadius = 4;
+            terrainMgr->setLoadRadius(1);
+            terrainMgr->setUnloadRadius(7);
+
             // Trigger tile streaming for surrounding area
             terrainMgr->update(*camera, 1.0f);
 
@@ -4016,6 +4023,9 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
 
             LOG_INFO("Online terrain streaming complete: ", terrainMgr->getLoadedTileCount(), " tiles loaded");
 
+            // Restore full load radius — remaining tiles stream in-game
+            terrainMgr->setLoadRadius(savedLoadRadius);
+
             // Load/precompute collision cache
             if (renderer->getWMORenderer()) {
                 showProgress("Building collision cache...", 0.88f);

From 0313bd869285ff9e21e6bf24e3ec192189be4633 Mon Sep 17 00:00:00 2001
From: Kelsi <kelsihates2fa@gmail.com>
Date: Sat, 7 Mar 2026 13:44:09 -0800
Subject: [PATCH 6/9] Performance: ring buffer UBOs, batched load screen
 uploads, background world preloader

- Replace per-frame VMA alloc/free of material UBOs with a ring buffer in
  CharacterRenderer (~500 allocations/frame eliminated)
- Batch all ready terrain tiles into a single GPU upload during load screen
  (processAllReadyTiles instead of one-at-a-time with individual fence waits)
- Lift per-frame creature/GO spawn budgets during load screen warmup phase
- Add background world preloader: saves last world position to disk, pre-warms
  AssetManager file cache with ADT files starting at app init (login screen)
  so terrain workers get instant cache hits when Enter World is clicked
- Distance-filter expensive collision guard to 8-unit melee range
- Merge 3 CharacterRenderer update loops into single pass
- Time-budget instrumentation for slow update stages (>3ms threshold)
- Count-based async creature model upload budget (max 3/frame in-game)
- 1-per-frame game object spawn + per-doodad time budget for transport loading
- Use deque for creature spawn queue to avoid O(n) front-erase
---
 include/core/application.hpp             |  22 +-
 include/rendering/character_renderer.hpp |   9 +-
 src/core/application.cpp                 | 299 +++++++++++++++++++++--
 src/game/game_handler.cpp                |   6 +
 src/rendering/character_renderer.cpp     | 160 +++++-------
 src/rendering/renderer.cpp               |  13 +
 src/rendering/terrain_manager.cpp        |   2 +-
 7 files changed, 390 insertions(+), 121 deletions(-)

diff --git a/include/core/application.hpp b/include/core/application.hpp
index 7415da18..a23e6bd8 100644
--- a/include/core/application.hpp
+++ b/include/core/application.hpp
@@ -6,12 +6,15 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include <deque>
 #include <unordered_map>
 #include <unordered_set>
 #include <array>
 #include <optional>
 #include <future>
 #include <mutex>
+#include <thread>
+#include <atomic>
 
 namespace wowee {
 
@@ -282,7 +285,7 @@ private:
         uint32_t displayId;
         float x, y, z, orientation;
     };
-    std::vector<PendingCreatureSpawn> pendingCreatureSpawns_;
+    std::deque<PendingCreatureSpawn> pendingCreatureSpawns_;
     static constexpr int MAX_SPAWNS_PER_FRAME = 3;
     static constexpr int MAX_NEW_CREATURE_MODELS_PER_FRAME = 1;
     static constexpr uint16_t MAX_CREATURE_SPAWN_RETRIES = 300;
@@ -353,6 +356,23 @@ private:
     // Quest marker billboard sprites (above NPCs)
     void loadQuestMarkerModels();  // Now loads BLP textures
     void updateQuestMarkers();     // Updates billboard positions
+
+    // Background world preloader — warms AssetManager file cache for the
+    // expected world before the user clicks Enter World.
+    struct WorldPreload {
+        uint32_t mapId = 0;
+        std::string mapName;
+        int centerTileX = 0;
+        int centerTileY = 0;
+        std::atomic<bool> cancel{false};
+        std::vector<std::thread> workers;
+    };
+    std::unique_ptr<WorldPreload> worldPreload_;
+    void startWorldPreload(uint32_t mapId, const std::string& mapName, float serverX, float serverY);
+    void cancelWorldPreload();
+    void saveLastWorldInfo(uint32_t mapId, const std::string& mapName, float serverX, float serverY);
+    struct LastWorldInfo { uint32_t mapId = 0; std::string mapName; float x = 0, y = 0; bool valid = false; };
+    LastWorldInfo loadLastWorldInfo() const;
 };
 
 } // namespace core
diff --git a/include/rendering/character_renderer.hpp b/include/rendering/character_renderer.hpp
index c6f63451..52813cf4 100644
--- a/include/rendering/character_renderer.hpp
+++ b/include/rendering/character_renderer.hpp
@@ -254,7 +254,14 @@ private:
     VkDescriptorPool materialDescPools_[2] = {VK_NULL_HANDLE, VK_NULL_HANDLE};
     VkDescriptorPool boneDescPool_ = VK_NULL_HANDLE;
     uint32_t lastMaterialPoolResetFrame_ = 0xFFFFFFFFu;
-    std::vector<std::pair<VkBuffer, VmaAllocation>> transientMaterialUbos_[2];
+
+    // Material UBO ring buffer — pre-allocated per frame slot, sub-allocated each draw
+    VkBuffer materialRingBuffer_[2] = {VK_NULL_HANDLE, VK_NULL_HANDLE};
+    VmaAllocation materialRingAlloc_[2] = {VK_NULL_HANDLE, VK_NULL_HANDLE};
+    void* materialRingMapped_[2] = {nullptr, nullptr};
+    uint32_t materialRingOffset_[2] = {0, 0};
+    uint32_t materialUboAlignment_ = 256;  // minUniformBufferOffsetAlignment
+    static constexpr uint32_t MATERIAL_RING_CAPACITY = 4096;
 
     // Texture cache
     struct TextureCacheEntry {
diff --git a/src/core/application.cpp b/src/core/application.cpp
index cabcaa01..f0c22a2c 100644
--- a/src/core/application.cpp
+++ b/src/core/application.cpp
@@ -56,6 +56,7 @@
 #include <sstream>
 #include <set>
 #include <filesystem>
+#include <fstream>
 
 #include <thread>
 #ifdef __linux__
@@ -314,6 +315,15 @@ bool Application::initialize() {
             gameHandler->getTransportManager()->loadTaxiPathNodeDBC(assetManager.get());
         }
 
+        // Start background preload for last-played character's world.
+        // Warms the file cache so terrain tile loading is faster at Enter World.
+        {
+            auto lastWorld = loadLastWorldInfo();
+            if (lastWorld.valid) {
+                startWorldPreload(lastWorld.mapId, lastWorld.mapName, lastWorld.x, lastWorld.y);
+            }
+        }
+
     } else {
         LOG_WARNING("Failed to initialize asset manager - asset loading will be unavailable");
         LOG_WARNING("Set WOW_DATA_PATH environment variable to your WoW Data directory");
@@ -521,6 +531,9 @@ void Application::run() {
 void Application::shutdown() {
     LOG_WARNING("Shutting down application...");
 
+    // Stop background world preloader before destroying AssetManager
+    cancelWorldPreload();
+
     // Save floor cache before renderer is destroyed
     if (renderer && renderer->getWMORenderer()) {
         size_t cacheSize = renderer->getWMORenderer()->getFloorCacheSize();
@@ -843,6 +856,7 @@ void Application::update(float deltaTime) {
             const char* inGameStep = "begin";
             try {
             auto runInGameStage = [&](const char* stageName, auto&& fn) {
+                auto stageStart = std::chrono::steady_clock::now();
                 try {
                     fn();
                 } catch (const std::bad_alloc& e) {
@@ -852,6 +866,11 @@ void Application::update(float deltaTime) {
                     LOG_ERROR("Exception during IN_GAME update stage '", stageName, "': ", e.what());
                     throw;
                 }
+                auto stageEnd = std::chrono::steady_clock::now();
+                float stageMs = std::chrono::duration<float, std::milli>(stageEnd - stageStart).count();
+                if (stageMs > 3.0f) {
+                    LOG_WARNING("SLOW update stage '", stageName, "': ", stageMs, "ms");
+                }
             };
             inGameStep = "gameHandler update";
             updateCheckpoint = "in_game: gameHandler update";
@@ -1289,6 +1308,7 @@ void Application::update(float deltaTime) {
             // creature models remain at stale spawn positions.
             inGameStep = "creature render sync";
             updateCheckpoint = "in_game: creature render sync";
+            auto creatureSyncStart = std::chrono::steady_clock::now();
             if (renderer && gameHandler && renderer->getCharacterRenderer()) {
                 auto* charRenderer = renderer->getCharacterRenderer();
                 static float npcWeaponRetryTimer = 0.0f;
@@ -1333,24 +1353,31 @@ void Application::update(float deltaTime) {
                     }
 
                     glm::vec3 canonical(entity->getX(), entity->getY(), entity->getZ());
+                    float canonDistSq = 0.0f;
                     if (havePlayerPos) {
                         glm::vec3 d = canonical - playerPos;
-                        if (glm::dot(d, d) > syncRadiusSq) continue;
+                        canonDistSq = glm::dot(d, d);
+                        if (canonDistSq > syncRadiusSq) continue;
                     }
 
                     glm::vec3 renderPos = core::coords::canonicalToRender(canonical);
 
                     // Visual collision guard: keep hostile melee units from rendering inside the
                     // player's model while attacking. This is client-side only (no server position change).
-                    auto unit = std::static_pointer_cast<game::Unit>(entity);
-                    const uint64_t currentTargetGuid = gameHandler->hasTarget() ? gameHandler->getTargetGuid() : 0;
-                    const uint64_t autoAttackGuid = gameHandler->getAutoAttackTargetGuid();
-                    const bool isCombatTarget = (guid == currentTargetGuid || guid == autoAttackGuid);
-                    bool clipGuardEligible = havePlayerPos &&
-                                             unit->getHealth() > 0 &&
-                                             (unit->isHostile() ||
-                                              gameHandler->isAggressiveTowardPlayer(guid) ||
-                                              isCombatTarget);
+                    // Only check for creatures within 8 units (melee range) — saves expensive
+                    // getRenderBoundsForGuid/getModelData calls for distant creatures.
+                    bool clipGuardEligible = false;
+                    bool isCombatTarget = false;
+                    if (havePlayerPos && canonDistSq < 64.0f) { // 8² = melee range
+                        auto unit = std::static_pointer_cast<game::Unit>(entity);
+                        const uint64_t currentTargetGuid = gameHandler->hasTarget() ? gameHandler->getTargetGuid() : 0;
+                        const uint64_t autoAttackGuid = gameHandler->getAutoAttackTargetGuid();
+                        isCombatTarget = (guid == currentTargetGuid || guid == autoAttackGuid);
+                        clipGuardEligible = unit->getHealth() > 0 &&
+                                            (unit->isHostile() ||
+                                             gameHandler->isAggressiveTowardPlayer(guid) ||
+                                             isCombatTarget);
+                    }
                     if (clipGuardEligible) {
                         float creatureCollisionRadius = 0.8f;
                         glm::vec3 cc;
@@ -1410,7 +1437,8 @@ void Application::update(float deltaTime) {
                         float planarDist = glm::length(delta2);
                         float dz = std::abs(renderPos.z - prevPos.z);
 
-                        const bool deadOrCorpse = unit->getHealth() == 0;
+                        auto unitPtr = std::static_pointer_cast<game::Unit>(entity);
+                        const bool deadOrCorpse = unitPtr->getHealth() == 0;
                         const bool largeCorrection = (planarDist > 6.0f) || (dz > 3.0f);
                         if (deadOrCorpse || largeCorrection) {
                             charRenderer->setInstancePosition(instanceId, renderPos);
@@ -1425,6 +1453,14 @@ void Application::update(float deltaTime) {
                     charRenderer->setInstanceRotation(instanceId, glm::vec3(0.0f, 0.0f, renderYaw));
                 }
             }
+            {
+                float csMs = std::chrono::duration<float, std::milli>(
+                    std::chrono::steady_clock::now() - creatureSyncStart).count();
+                if (csMs > 5.0f) {
+                    LOG_WARNING("SLOW update stage 'creature render sync': ", csMs, "ms (",
+                                creatureInstances_.size(), " creatures)");
+                }
+            }
 
             // Movement heartbeat is sent from GameHandler::update() to avoid
             // duplicate packets from multiple update loops.
@@ -1447,6 +1483,7 @@ void Application::update(float deltaTime) {
     // Update renderer (camera, etc.) only when in-game
     updateCheckpoint = "renderer update";
     if (renderer && state == AppState::IN_GAME) {
+        auto rendererUpdateStart = std::chrono::steady_clock::now();
         try {
             renderer->update(deltaTime);
         } catch (const std::bad_alloc& e) {
@@ -1456,6 +1493,11 @@ void Application::update(float deltaTime) {
             LOG_ERROR("Exception during Application::update stage 'renderer->update': ", e.what());
             throw;
         }
+        float ruMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - rendererUpdateStart).count();
+        if (ruMs > 5.0f) {
+            LOG_WARNING("SLOW update stage 'renderer->update': ", ruMs, "ms");
+        }
     }
     // Update UI
     updateCheckpoint = "ui update";
@@ -3537,6 +3579,21 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
     }
     LOG_INFO("Loading online world terrain for map '", mapName, "' (ID ", mapId, ")");
 
+    // Cancel any stale preload (if it was for a different map, the file cache
+    // still retains whatever was loaded — it doesn't hurt).
+    if (worldPreload_) {
+        if (worldPreload_->mapId == mapId) {
+            LOG_INFO("World preload: cache-warm hit for map '", mapName, "'");
+        } else {
+            LOG_INFO("World preload: map mismatch (preloaded ", worldPreload_->mapName,
+                     ", entering ", mapName, ")");
+        }
+    }
+    cancelWorldPreload();
+
+    // Save this world info for next session's early preload
+    saveLastWorldInfo(mapId, mapName, x, y);
+
     // Convert server coordinates to canonical WoW coordinates
     // Server sends: X=West (canonical.Y), Y=North (canonical.X), Z=Up
     glm::vec3 spawnCanonical = core::coords::serverToCanonical(glm::vec3(x, y, z));
@@ -3967,8 +4024,11 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
                 // Trigger new streaming — enqueue tiles for background workers
                 terrainMgr->update(*camera, 0.016f);
 
-                // Process ONE tile per iteration so loading screen updates after each
-                terrainMgr->processOneReadyTile();
+                // Process ALL available ready tiles per iteration — batches GPU
+                // uploads into a single command buffer + fence wait instead of
+                // one fence per tile.  Loading screen still updates between
+                // iterations while workers parse more tiles.
+                terrainMgr->processAllReadyTiles();
 
                 int remaining = terrainMgr->getRemainingTileCount();
                 int loaded = terrainMgr->getLoadedTileCount();
@@ -4126,9 +4186,64 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
 
             if (world) world->update(1.0f / 60.0f);
             processPlayerSpawnQueue();
+
+            // During load screen warmup: lift per-frame budgets so GPU uploads
+            // happen in bulk while the loading screen is still visible.
+            // Process ALL async creature model uploads (no 3-per-frame cap).
+            {
+                for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) {
+                    if (!it->future.valid() ||
+                        it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
+                        ++it;
+                        continue;
+                    }
+                    auto result = it->future.get();
+                    it = asyncCreatureLoads_.erase(it);
+                    if (result.permanent_failure) {
+                        nonRenderableCreatureDisplayIds_.insert(result.displayId);
+                        creaturePermanentFailureGuids_.insert(result.guid);
+                        pendingCreatureSpawnGuids_.erase(result.guid);
+                        creatureSpawnRetryCounts_.erase(result.guid);
+                        continue;
+                    }
+                    if (!result.valid || !result.model) {
+                        pendingCreatureSpawnGuids_.erase(result.guid);
+                        creatureSpawnRetryCounts_.erase(result.guid);
+                        continue;
+                    }
+                    auto* charRenderer = renderer ? renderer->getCharacterRenderer() : nullptr;
+                    if (!charRenderer) { pendingCreatureSpawnGuids_.erase(result.guid); continue; }
+                    if (!charRenderer->loadModel(*result.model, result.modelId)) {
+                        nonRenderableCreatureDisplayIds_.insert(result.displayId);
+                        creaturePermanentFailureGuids_.insert(result.guid);
+                        pendingCreatureSpawnGuids_.erase(result.guid);
+                        creatureSpawnRetryCounts_.erase(result.guid);
+                        continue;
+                    }
+                    displayIdModelCache_[result.displayId] = result.modelId;
+                    pendingCreatureSpawnGuids_.erase(result.guid);
+                    creatureSpawnRetryCounts_.erase(result.guid);
+                    if (!creatureInstances_.count(result.guid) &&
+                        !creaturePermanentFailureGuids_.count(result.guid)) {
+                        PendingCreatureSpawn s{};
+                        s.guid = result.guid; s.displayId = result.displayId;
+                        s.x = result.x; s.y = result.y; s.z = result.z;
+                        s.orientation = result.orientation;
+                        pendingCreatureSpawns_.push_back(s);
+                        pendingCreatureSpawnGuids_.insert(result.guid);
+                    }
+                }
+            }
             processCreatureSpawnQueue();
             processDeferredEquipmentQueue();
-            processGameObjectSpawnQueue();
+
+            // Process ALL pending game object spawns (no 1-per-frame cap during load screen).
+            while (!pendingGameObjectSpawns_.empty()) {
+                auto& s = pendingGameObjectSpawns_.front();
+                spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
+                pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
+            }
+
             processPendingTransportDoodads();
             processPendingMount();
             updateQuestMarkers();
@@ -6767,12 +6882,25 @@ void Application::spawnOnlineGameObject(uint64_t guid, uint32_t entry, uint32_t
 
 void Application::processAsyncCreatureResults() {
     // Check completed async model loads and finalize on main thread (GPU upload + instance creation).
+    // Limit GPU model uploads per frame to avoid spikes, but always drain cheap bookkeeping.
+    static constexpr int kMaxModelUploadsPerFrame = 3;
+    int modelUploads = 0;
+
     for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) {
         if (!it->future.valid() ||
             it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
             ++it;
             continue;
         }
+
+        // Peek: if this result needs a NEW model upload (not cached) and we've hit
+        // the upload budget, defer to next frame without consuming the future.
+        if (modelUploads >= kMaxModelUploadsPerFrame) {
+            // Check if this displayId already has a cached model (cheap spawn, no GPU upload).
+            // We can't peek the displayId without getting the future, so just break.
+            break;
+        }
+
         auto result = it->future.get();
         it = asyncCreatureLoads_.erase(it);
 
@@ -6805,6 +6933,7 @@ void Application::processAsyncCreatureResults() {
             continue;
         }
         displayIdModelCache_[result.displayId] = result.modelId;
+        modelUploads++;
 
         pendingCreatureSpawnGuids_.erase(result.guid);
         creatureSpawnRetryCounts_.erase(result.guid);
@@ -6854,7 +6983,7 @@ void Application::processCreatureSpawnQueue() {
         }
 
         PendingCreatureSpawn s = pendingCreatureSpawns_.front();
-        pendingCreatureSpawns_.erase(pendingCreatureSpawns_.begin());
+        pendingCreatureSpawns_.pop_front();
 
         if (nonRenderableCreatureDisplayIds_.count(s.displayId)) {
             pendingCreatureSpawnGuids_.erase(s.guid);
@@ -7035,13 +7164,11 @@ void Application::processDeferredEquipmentQueue() {
 void Application::processGameObjectSpawnQueue() {
     if (pendingGameObjectSpawns_.empty()) return;
 
-    int spawned = 0;
-    while (!pendingGameObjectSpawns_.empty() && spawned < MAX_SPAWNS_PER_FRAME) {
-        auto& s = pendingGameObjectSpawns_.front();
-        spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
-        pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
-        spawned++;
-    }
+    // Only spawn 1 game object per frame — each can involve heavy synchronous
+    // WMO loading (root + groups from disk + GPU upload), easily 100ms+.
+    auto& s = pendingGameObjectSpawns_.front();
+    spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
+    pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
 }
 
 void Application::processPendingTransportDoodads() {
@@ -7052,9 +7179,16 @@ void Application::processPendingTransportDoodads() {
     auto* m2Renderer = renderer->getM2Renderer();
     if (!wmoRenderer || !m2Renderer) return;
 
+    auto startTime = std::chrono::steady_clock::now();
+    static constexpr float kDoodadBudgetMs = 4.0f;
+
     size_t budgetLeft = MAX_TRANSPORT_DOODADS_PER_FRAME;
     for (auto it = pendingTransportDoodadBatches_.begin();
          it != pendingTransportDoodadBatches_.end() && budgetLeft > 0;) {
+        // Time budget check
+        float elapsedMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - startTime).count();
+        if (elapsedMs >= kDoodadBudgetMs) break;
         auto goIt = gameObjectInstances_.find(it->guid);
         if (goIt == gameObjectInstances_.end() || !goIt->second.isWmo ||
             goIt->second.instanceId != it->instanceId || goIt->second.modelId != it->modelId) {
@@ -7070,6 +7204,11 @@ void Application::processPendingTransportDoodads() {
 
         const size_t maxIndex = std::min(it->doodadBudget, doodadTemplates->size());
         while (it->nextIndex < maxIndex && budgetLeft > 0) {
+            // Per-doodad time budget (each does synchronous file I/O + parse + GPU upload)
+            float innerMs = std::chrono::duration<float, std::milli>(
+                std::chrono::steady_clock::now() - startTime).count();
+            if (innerMs >= kDoodadBudgetMs) { budgetLeft = 0; break; }
+
             const auto& doodadTemplate = (*doodadTemplates)[it->nextIndex];
             it->nextIndex++;
             budgetLeft--;
@@ -7729,5 +7868,121 @@ void Application::setupTestTransport() {
     LOG_INFO("========================================");
 }
 
+// ─── World Preloader ─────────────────────────────────────────────────────────
+// Pre-warms AssetManager file cache with ADT files (and their _obj0 variants)
+// for tiles around the expected spawn position.  Runs in background so that
+// when loadOnlineWorldTerrain eventually asks TerrainManager workers to parse
+// the same files, every readFile() is an instant cache hit instead of disk I/O.
+
+void Application::startWorldPreload(uint32_t mapId, const std::string& mapName,
+                                     float serverX, float serverY) {
+    cancelWorldPreload();
+    if (!assetManager || !assetManager->isInitialized() || mapName.empty()) return;
+
+    glm::vec3 canonical = core::coords::serverToCanonical(glm::vec3(serverX, serverY, 0.0f));
+    auto [tileX, tileY] = core::coords::canonicalToTile(canonical.x, canonical.y);
+
+    worldPreload_ = std::make_unique<WorldPreload>();
+    worldPreload_->mapId = mapId;
+    worldPreload_->mapName = mapName;
+    worldPreload_->centerTileX = tileX;
+    worldPreload_->centerTileY = tileY;
+
+    LOG_INFO("World preload: starting for map '", mapName, "' tile [", tileX, ",", tileY, "]");
+
+    // Build list of tiles to preload (radius 1 = 3x3 = 9 tiles, matching load screen)
+    struct TileJob { int x, y; };
+    auto jobs = std::make_shared<std::vector<TileJob>>();
+    // Center tile first (most important)
+    jobs->push_back({tileX, tileY});
+    for (int dx = -1; dx <= 1; dx++) {
+        for (int dy = -1; dy <= 1; dy++) {
+            if (dx == 0 && dy == 0) continue;
+            int tx = tileX + dx, ty = tileY + dy;
+            if (tx < 0 || tx > 63 || ty < 0 || ty > 63) continue;
+            jobs->push_back({tx, ty});
+        }
+    }
+
+    // Spawn worker threads (one per tile for maximum parallelism)
+    auto cancelFlag = &worldPreload_->cancel;
+    auto* am = assetManager.get();
+    std::string mn = mapName;
+
+    int numWorkers = std::min(static_cast<int>(jobs->size()), 4);
+    auto nextJob = std::make_shared<std::atomic<int>>(0);
+
+    for (int w = 0; w < numWorkers; w++) {
+        worldPreload_->workers.emplace_back([am, mn, jobs, nextJob, cancelFlag]() {
+            while (!cancelFlag->load(std::memory_order_relaxed)) {
+                int idx = nextJob->fetch_add(1, std::memory_order_relaxed);
+                if (idx >= static_cast<int>(jobs->size())) break;
+
+                int tx = (*jobs)[idx].x;
+                int ty = (*jobs)[idx].y;
+
+                // Read ADT file (warms file cache)
+                std::string adtPath = "World\\Maps\\" + mn + "\\" + mn + "_" +
+                                      std::to_string(tx) + "_" + std::to_string(ty) + ".adt";
+                am->readFile(adtPath);
+                if (cancelFlag->load(std::memory_order_relaxed)) break;
+
+                // Read obj0 variant
+                std::string objPath = "World\\Maps\\" + mn + "\\" + mn + "_" +
+                                      std::to_string(tx) + "_" + std::to_string(ty) + "_obj0.adt";
+                am->readFile(objPath);
+            }
+            LOG_DEBUG("World preload worker finished");
+        });
+    }
+}
+
+void Application::cancelWorldPreload() {
+    if (!worldPreload_) return;
+    worldPreload_->cancel.store(true, std::memory_order_relaxed);
+    for (auto& t : worldPreload_->workers) {
+        if (t.joinable()) t.join();
+    }
+    LOG_INFO("World preload: cancelled (map=", worldPreload_->mapName,
+             " tile=[", worldPreload_->centerTileX, ",", worldPreload_->centerTileY, "])");
+    worldPreload_.reset();
+}
+
+void Application::saveLastWorldInfo(uint32_t mapId, const std::string& mapName,
+                                     float serverX, float serverY) {
+#ifdef _WIN32
+    const char* base = std::getenv("APPDATA");
+    std::string dir = base ? std::string(base) + "\\wowee" : ".";
+#else
+    const char* home = std::getenv("HOME");
+    std::string dir = home ? std::string(home) + "/.wowee" : ".";
+#endif
+    std::filesystem::create_directories(dir);
+    std::ofstream f(dir + "/last_world.cfg");
+    if (f) {
+        f << mapId << "\n" << mapName << "\n" << serverX << "\n" << serverY << "\n";
+    }
+}
+
+Application::LastWorldInfo Application::loadLastWorldInfo() const {
+#ifdef _WIN32
+    const char* base = std::getenv("APPDATA");
+    std::string dir = base ? std::string(base) + "\\wowee" : ".";
+#else
+    const char* home = std::getenv("HOME");
+    std::string dir = home ? std::string(home) + "/.wowee" : ".";
+#endif
+    LastWorldInfo info;
+    std::ifstream f(dir + "/last_world.cfg");
+    if (!f) return info;
+    std::string line;
+    if (std::getline(f, line)) info.mapId = static_cast<uint32_t>(std::stoul(line));
+    if (std::getline(f, line)) info.mapName = line;
+    if (std::getline(f, line)) info.x = std::stof(line);
+    if (std::getline(f, line)) info.y = std::stof(line);
+    info.valid = !info.mapName.empty();
+    return info;
+}
+
 } // namespace core
 } // namespace wowee
diff --git a/src/game/game_handler.cpp b/src/game/game_handler.cpp
index e80e727f..9a7aed97 100644
--- a/src/game/game_handler.cpp
+++ b/src/game/game_handler.cpp
@@ -541,7 +541,13 @@ void GameHandler::update(float deltaTime) {
 
     // Update socket (processes incoming data and triggers callbacks)
     if (socket) {
+        auto socketStart = std::chrono::steady_clock::now();
         socket->update();
+        float socketMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - socketStart).count();
+        if (socketMs > 3.0f) {
+            LOG_WARNING("SLOW socket->update: ", socketMs, "ms");
+        }
     }
 
     // Detect server-side disconnect (socket closed during update)
diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp
index 9aa99c72..f735dd7d 100644
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@@ -197,6 +197,29 @@ bool CharacterRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFram
         vkCreateDescriptorPool(device, &ci, nullptr, &boneDescPool_);
     }
 
+    // --- Material UBO ring buffers (one per frame slot) ---
+    {
+        VkPhysicalDeviceProperties props;
+        vkGetPhysicalDeviceProperties(ctx->getPhysicalDevice(), &props);
+        materialUboAlignment_ = static_cast<uint32_t>(props.limits.minUniformBufferOffsetAlignment);
+        if (materialUboAlignment_ < 1) materialUboAlignment_ = 1;
+        // Round up UBO size to alignment
+        uint32_t alignedUboSize = (sizeof(CharMaterialUBO) + materialUboAlignment_ - 1) & ~(materialUboAlignment_ - 1);
+        uint32_t ringSize = alignedUboSize * MATERIAL_RING_CAPACITY;
+        for (int i = 0; i < 2; i++) {
+            VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
+            bci.size = ringSize;
+            bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+            VmaAllocationCreateInfo aci{};
+            aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+            aci.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
+            VmaAllocationInfo allocInfo{};
+            vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
+                            &materialRingBuffer_[i], &materialRingAlloc_[i], &allocInfo);
+            materialRingMapped_[i] = allocInfo.pMappedData;
+        }
+    }
+
     // --- Pipeline layout ---
     // set 0 = perFrame, set 1 = material, set 2 = bones
     // Push constant: mat4 model = 64 bytes
@@ -352,14 +375,15 @@ void CharacterRenderer::shutdown() {
 
     if (pipelineLayout_) { vkDestroyPipelineLayout(device, pipelineLayout_, nullptr); pipelineLayout_ = VK_NULL_HANDLE; }
 
-    // Release any deferred transient material UBOs.
+    // Destroy material ring buffers
     for (int i = 0; i < 2; i++) {
-        for (const auto& b : transientMaterialUbos_[i]) {
-            if (b.first) {
-                vmaDestroyBuffer(alloc, b.first, b.second);
-            }
+        if (materialRingBuffer_[i]) {
+            vmaDestroyBuffer(alloc, materialRingBuffer_[i], materialRingAlloc_[i]);
+            materialRingBuffer_[i] = VK_NULL_HANDLE;
+            materialRingAlloc_[i] = VK_NULL_HANDLE;
+            materialRingMapped_[i] = nullptr;
         }
-        transientMaterialUbos_[i].clear();
+        materialRingOffset_[i] = 0;
     }
 
     // Destroy descriptor pools and layouts
@@ -391,7 +415,6 @@ void CharacterRenderer::clear() {
 
     vkDeviceWaitIdle(vkCtx_->getDevice());
     VkDevice device = vkCtx_->getDevice();
-    VmaAllocator alloc = vkCtx_->getAllocator();
 
     // Destroy GPU resources for all models
     for (auto& pair : models) {
@@ -441,14 +464,9 @@ void CharacterRenderer::clear() {
     models.clear();
     instances.clear();
 
-    // Release deferred transient material UBOs
+    // Reset material ring buffer offsets (buffers persist, just reset write position)
     for (int i = 0; i < 2; i++) {
-        for (const auto& b : transientMaterialUbos_[i]) {
-            if (b.first) {
-                vmaDestroyBuffer(alloc, b.first, b.second);
-            }
-        }
-        transientMaterialUbos_[i].clear();
+        materialRingOffset_[i] = 0;
     }
 
     // Reset descriptor pools (don't destroy — reuse for new allocations)
@@ -1454,8 +1472,14 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
     const float animUpdateRadius = static_cast<float>(envSizeOrDefault("WOWEE_CHAR_ANIM_RADIUS", 120));
     const float animUpdateRadiusSq = animUpdateRadius * animUpdateRadius;
 
-    // Update fade-in opacity
-    for (auto& [id, inst] : instances) {
+    // Single pass: fade-in, movement, and animation bone collection
+    std::vector<std::reference_wrapper<CharacterInstance>> toUpdate;
+    toUpdate.reserve(instances.size());
+
+    for (auto& pair : instances) {
+        auto& inst = pair.second;
+
+        // Update fade-in opacity
         if (inst.fadeInDuration > 0.0f && inst.opacity < 1.0f) {
             inst.fadeInTime += deltaTime;
             inst.opacity = std::min(1.0f, inst.fadeInTime / inst.fadeInDuration);
@@ -1463,10 +1487,8 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
                 inst.fadeInDuration = 0.0f;
             }
         }
-    }
 
-    // Interpolate creature movement
-    for (auto& [id, inst] : instances) {
+        // Interpolate creature movement
         if (inst.isMoving) {
             inst.moveElapsed += deltaTime;
             float t = inst.moveElapsed / inst.moveDuration;
@@ -1475,23 +1497,14 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
                 inst.isMoving = false;
                 // Return to idle when movement completes
                 if (inst.currentAnimationId == 4 || inst.currentAnimationId == 5) {
-                    playAnimation(id, 0, true);
+                    playAnimation(pair.first, 0, true);
                 }
             } else {
                 inst.position = glm::mix(inst.moveStart, inst.moveEnd, t);
             }
         }
-    }
 
-    // Only update animations for nearby characters (performance optimization)
-    // Collect instances that need bone recomputation, with distance-based throttling
-    std::vector<std::reference_wrapper<CharacterInstance>> toUpdate;
-    toUpdate.reserve(instances.size());
-
-    for (auto& pair : instances) {
-        auto& inst = pair.second;
-
-        // Skip weapon instances — their transforms are set by parent bones
+        // Skip weapon instances for animation — their transforms are set by parent bones
         if (inst.hasOverrideModelMatrix) continue;
 
         float distSq = glm::distance2(inst.position, cameraPos);
@@ -1533,7 +1546,7 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
     // Thread bone matrix computation in chunks
     if (updatedCount >= 8 && numAnimThreads_ > 1) {
         static const size_t minAnimWorkPerThread = std::max<size_t>(
-            16, envSizeOrDefault("WOWEE_CHAR_ANIM_WORK_PER_THREAD", 64));
+            8, envSizeOrDefault("WOWEE_CHAR_ANIM_WORK_PER_THREAD", 16));
         const size_t maxUsefulThreads = std::max<size_t>(
             1, (updatedCount + minAnimWorkPerThread - 1) / minAnimWorkPerThread);
         const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), maxUsefulThreads);
@@ -1728,8 +1741,6 @@ void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
     size_t numBones = model.bones.size();
     instance.boneMatrices.resize(numBones);
 
-    static bool dumpedOnce = false;
-
     for (size_t i = 0; i < numBones; i++) {
         const auto& bone = model.bones[i];
 
@@ -1737,19 +1748,6 @@ void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
         // At rest this is identity, so no separate bind pose is needed
         glm::mat4 localTransform = getBoneTransform(bone, instance.animationTime, instance.currentSequenceIndex);
 
-        // Debug: dump first frame bone data
-        if (!dumpedOnce && i < 5) {
-            glm::vec3 t = interpolateVec3(bone.translation, instance.currentSequenceIndex, instance.animationTime, glm::vec3(0.0f));
-            glm::quat r = interpolateQuat(bone.rotation, instance.currentSequenceIndex, instance.animationTime);
-            glm::vec3 s = interpolateVec3(bone.scale, instance.currentSequenceIndex, instance.animationTime, glm::vec3(1.0f));
-            core::Logger::getInstance().info("Bone ", i, " parent=", bone.parentBone,
-                " pivot=(", bone.pivot.x, ",", bone.pivot.y, ",", bone.pivot.z, ")",
-                " t=(", t.x, ",", t.y, ",", t.z, ")",
-                " r=(", r.w, ",", r.x, ",", r.y, ",", r.z, ")",
-                " s=(", s.x, ",", s.y, ",", s.z, ")",
-                " seqIdx=", instance.currentSequenceIndex);
-        }
-
         // Compose with parent
         if (bone.parentBone >= 0 && static_cast<size_t>(bone.parentBone) < numBones) {
             instance.boneMatrices[i] = instance.boneMatrices[bone.parentBone] * localTransform;
@@ -1757,12 +1755,6 @@ void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
             instance.boneMatrices[i] = localTransform;
         }
     }
-    if (!dumpedOnce) {
-        dumpedOnce = true;
-        // Dump final matrix for bone 0
-        auto& m = instance.boneMatrices[0];
-        core::Logger::getInstance().info("Bone 0 final matrix row0=(", m[0][0], ",", m[1][0], ",", m[2][0], ",", m[3][0], ")");
-    }
 }
 
 glm::mat4 CharacterRenderer::getBoneTransform(const pipeline::M2Bone& bone, float time, int sequenceIndex) {
@@ -1797,22 +1789,19 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
     uint32_t frameIndex = vkCtx_->getCurrentFrame();
     uint32_t frameSlot = frameIndex % 2u;
 
-    // Reset transient material allocations once per frame slot.
-    // beginFrame() waits on this slot's fence before recording.
+    // Reset material ring buffer and descriptor pool once per frame slot.
     if (lastMaterialPoolResetFrame_ != frameIndex) {
-        VmaAllocator alloc = vkCtx_->getAllocator();
-        for (const auto& b : transientMaterialUbos_[frameSlot]) {
-            if (b.first) {
-                vmaDestroyBuffer(alloc, b.first, b.second);
-            }
-        }
-        transientMaterialUbos_[frameSlot].clear();
+        materialRingOffset_[frameSlot] = 0;
         if (materialDescPools_[frameSlot]) {
             vkResetDescriptorPool(vkCtx_->getDevice(), materialDescPools_[frameSlot], 0);
         }
         lastMaterialPoolResetFrame_ = frameIndex;
     }
 
+    // Pre-compute aligned UBO stride for ring buffer sub-allocation
+    const uint32_t uboStride = (sizeof(CharMaterialUBO) + materialUboAlignment_ - 1) & ~(materialUboAlignment_ - 1);
+    const uint32_t ringCapacityBytes = uboStride * MATERIAL_RING_CAPACITY;
+
     // Bind per-frame descriptor set (set 0) -- shared across all draws
     vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
                             pipelineLayout_, 0, 1, &perFrameSet, 0, nullptr);
@@ -2182,27 +2171,18 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
                 matData.heightMapVariance = batchHeightVariance;
                 matData.normalMapStrength = normalMapStrength_;
 
-                // Create a small UBO for this batch's material
-                VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
-                bci.size = sizeof(CharMaterialUBO);
-                bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
-                VmaAllocationCreateInfo aci{};
-                aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
-                aci.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
-                VmaAllocationInfo allocInfo{};
-                ::VkBuffer matUBO = VK_NULL_HANDLE;
-                VmaAllocation matUBOAlloc = VK_NULL_HANDLE;
-                vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &matUBO, &matUBOAlloc, &allocInfo);
-                if (allocInfo.pMappedData) {
-                    memcpy(allocInfo.pMappedData, &matData, sizeof(CharMaterialUBO));
-                }
+                // Sub-allocate material UBO from ring buffer
+                uint32_t matOffset = materialRingOffset_[frameSlot];
+                if (matOffset + uboStride > ringCapacityBytes) continue; // ring exhausted
+                memcpy(static_cast<char*>(materialRingMapped_[frameSlot]) + matOffset, &matData, sizeof(CharMaterialUBO));
+                materialRingOffset_[frameSlot] = matOffset + uboStride;
 
                 // Write descriptor set: binding 0 = texture, binding 1 = material UBO, binding 2 = normal/height map
                 VkTexture* bindTex = (texPtr && texPtr->isValid()) ? texPtr : whiteTexture_.get();
                 VkDescriptorImageInfo imgInfo = bindTex->descriptorInfo();
                 VkDescriptorBufferInfo bufInfo{};
-                bufInfo.buffer = matUBO;
-                bufInfo.offset = 0;
+                bufInfo.buffer = materialRingBuffer_[frameSlot];
+                bufInfo.offset = matOffset;
                 bufInfo.range = sizeof(CharMaterialUBO);
                 VkDescriptorImageInfo nhImgInfo = normalMap->descriptorInfo();
 
@@ -2235,8 +2215,6 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
                                         pipelineLayout_, 1, 1, &materialSet, 0, nullptr);
 
                 vkCmdDrawIndexed(cmd, batch.indexCount, 1, batch.indexStart, 0, 0);
-
-                transientMaterialUbos_[frameSlot].emplace_back(matUBO, matUBOAlloc);
             }
         } else {
             // Draw entire model with first texture
@@ -2277,24 +2255,16 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
             matData.heightMapVariance = 0.0f;
             matData.normalMapStrength = normalMapStrength_;
 
-            VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
-            bci.size = sizeof(CharMaterialUBO);
-            bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
-            VmaAllocationCreateInfo aci{};
-            aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
-            aci.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
-            VmaAllocationInfo allocInfo{};
-            ::VkBuffer matUBO = VK_NULL_HANDLE;
-            VmaAllocation matUBOAlloc = VK_NULL_HANDLE;
-            vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &matUBO, &matUBOAlloc, &allocInfo);
-            if (allocInfo.pMappedData) {
-                memcpy(allocInfo.pMappedData, &matData, sizeof(CharMaterialUBO));
-            }
+            // Sub-allocate material UBO from ring buffer
+            uint32_t matOffset2 = materialRingOffset_[frameSlot];
+            if (matOffset2 + uboStride > ringCapacityBytes) continue; // ring exhausted
+            memcpy(static_cast<char*>(materialRingMapped_[frameSlot]) + matOffset2, &matData, sizeof(CharMaterialUBO));
+            materialRingOffset_[frameSlot] = matOffset2 + uboStride;
 
             VkDescriptorImageInfo imgInfo = texPtr->descriptorInfo();
             VkDescriptorBufferInfo bufInfo{};
-            bufInfo.buffer = matUBO;
-            bufInfo.offset = 0;
+            bufInfo.buffer = materialRingBuffer_[frameSlot];
+            bufInfo.offset = matOffset2;
             bufInfo.range = sizeof(CharMaterialUBO);
             VkDescriptorImageInfo nhImgInfo2 = flatNormalTexture_->descriptorInfo();
 
@@ -2326,8 +2296,6 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
                                     pipelineLayout_, 1, 1, &materialSet, 0, nullptr);
 
             vkCmdDrawIndexed(cmd, gpuModel.indexCount, 1, 0, 0, 0);
-
-            transientMaterialUbos_[frameSlot].emplace_back(matUBO, matUBOAlloc);
         }
     }
 }
diff --git a/src/rendering/renderer.cpp b/src/rendering/renderer.cpp
index 5f3e48ae..69bfecdb 100644
--- a/src/rendering/renderer.cpp
+++ b/src/rendering/renderer.cpp
@@ -2527,7 +2527,13 @@ void Renderer::update(float deltaTime) {
 
     // Update terrain streaming
     if (terrainManager && camera) {
+        auto terrStart = std::chrono::steady_clock::now();
         terrainManager->update(*camera, deltaTime);
+        float terrMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - terrStart).count();
+        if (terrMs > 5.0f) {
+            LOG_WARNING("SLOW terrainManager->update: ", terrMs, "ms");
+        }
     }
 
     // Update sky system (skybox time, star twinkle, clouds, celestial moon phases)
@@ -2579,7 +2585,14 @@ void Renderer::update(float deltaTime) {
 
     // Update character animations
     if (characterRenderer && camera) {
+        auto charAnimStart = std::chrono::steady_clock::now();
         characterRenderer->update(deltaTime, camera->getPosition());
+        float charAnimMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - charAnimStart).count();
+        if (charAnimMs > 5.0f) {
+            LOG_WARNING("SLOW characterRenderer->update: ", charAnimMs, "ms (",
+                        characterRenderer->getInstanceCount(), " instances)");
+        }
     }
 
     // Update AudioEngine (cleanup finished sounds, etc.)
diff --git a/src/rendering/terrain_manager.cpp b/src/rendering/terrain_manager.cpp
index 3eb1ba1c..20a2e9a1 100644
--- a/src/rendering/terrain_manager.cpp
+++ b/src/rendering/terrain_manager.cpp
@@ -1082,7 +1082,7 @@ void TerrainManager::workerLoop() {
 void TerrainManager::processReadyTiles() {
     // Process tiles with time budget to avoid frame spikes
     // Taxi mode gets a slightly larger budget to avoid visible late-pop terrain/models.
-    const float timeBudgetMs = taxiStreamingMode_ ? 8.0f : 5.0f;
+    const float timeBudgetMs = taxiStreamingMode_ ? 8.0f : 3.0f;
     auto startTime = std::chrono::high_resolution_clock::now();
 
     // Move newly ready tiles into the finalizing deque.

From 7ac990cff43028e26f6674b90950286f7817131b Mon Sep 17 00:00:00 2001
From: Kelsi <kelsihates2fa@gmail.com>
Date: Sat, 7 Mar 2026 15:46:56 -0800
Subject: [PATCH 7/9] Background BLP texture pre-decoding + deferred WMO normal
 maps (12x streaming perf)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move CPU-heavy BLP texture decoding from main thread to background worker
threads for all hot paths: terrain M2 models, WMO doodad M2s, WMO textures,
creature models, and gameobject WMOs. Each renderer (M2, WMO, Character) now
accepts a pre-decoded BLP cache that loadTexture() checks before falling back
to synchronous decode.

Defer WMO normal/height map generation (3 per-pixel passes: luminance, box
blur, Sobel) during terrain streaming finalization — this was the dominant
remaining bottleneck after BLP pre-decoding.

Terrain streaming stalls: 1576ms → 124ms worst case.
---
 include/core/application.hpp             |  22 ++-
 include/rendering/character_renderer.hpp |   6 +
 include/rendering/m2_renderer.hpp        |   8 +
 include/rendering/terrain_manager.hpp    |   6 +
 include/rendering/vk_context.hpp         |  13 +-
 include/rendering/wmo_renderer.hpp       |   9 +
 src/core/application.cpp                 | 231 +++++++++++++++++++++--
 src/rendering/character_renderer.cpp     |  52 ++---
 src/rendering/m2_renderer.cpp            |  93 +++++----
 src/rendering/renderer.cpp               |  10 +
 src/rendering/terrain_manager.cpp        | 115 ++++++++---
 src/rendering/vk_context.cpp             |  89 ++++++++-
 src/rendering/wmo_renderer.cpp           |  28 ++-
 13 files changed, 573 insertions(+), 109 deletions(-)

diff --git a/include/core/application.hpp b/include/core/application.hpp
index a23e6bd8..c97bfaf6 100644
--- a/include/core/application.hpp
+++ b/include/core/application.hpp
@@ -3,6 +3,7 @@
 #include "core/window.hpp"
 #include "core/input.hpp"
 #include "game/character.hpp"
+#include "pipeline/blp_loader.hpp"
 #include <memory>
 #include <string>
 #include <vector>
@@ -23,7 +24,7 @@ namespace rendering { class Renderer; }
 namespace ui { class UIManager; }
 namespace auth { class AuthHandler; }
 namespace game { class GameHandler; class World; class ExpansionRegistry; }
-namespace pipeline { class AssetManager; class DBCLayout; struct M2Model; }
+namespace pipeline { class AssetManager; class DBCLayout; struct M2Model; struct WMOModel; }
 namespace audio { enum class VoiceType; }
 
 namespace core {
@@ -206,6 +207,7 @@ private:
         uint32_t modelId;
         float x, y, z, orientation;
         std::shared_ptr<pipeline::M2Model> model; // parsed on background thread
+        std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures; // decoded on bg thread
         bool valid = false;
         bool permanent_failure = false;
     };
@@ -337,6 +339,24 @@ private:
     };
     std::vector<PendingGameObjectSpawn> pendingGameObjectSpawns_;
     void processGameObjectSpawnQueue();
+
+    // Async WMO loading for game objects (file I/O + parse on background thread)
+    struct PreparedGameObjectWMO {
+        uint64_t guid;
+        uint32_t entry;
+        uint32_t displayId;
+        float x, y, z, orientation;
+        std::shared_ptr<pipeline::WMOModel> wmoModel;
+        std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures; // decoded on bg thread
+        bool valid = false;
+        bool isWmo = false;
+        std::string modelPath;
+    };
+    struct AsyncGameObjectLoad {
+        std::future<PreparedGameObjectWMO> future;
+    };
+    std::vector<AsyncGameObjectLoad> asyncGameObjectLoads_;
+    void processAsyncGameObjectResults();
     struct PendingTransportDoodadBatch {
         uint64_t guid = 0;
         uint32_t modelId = 0;
diff --git a/include/rendering/character_renderer.hpp b/include/rendering/character_renderer.hpp
index 52813cf4..c7cae0d7 100644
--- a/include/rendering/character_renderer.hpp
+++ b/include/rendering/character_renderer.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "pipeline/m2_loader.hpp"
+#include "pipeline/blp_loader.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <glm/glm.hpp>
@@ -114,7 +115,11 @@ public:
     void setShadowMap(VkTexture*, const glm::mat4&) {}
     void clearShadowMap() {}
 
+    // Pre-decoded BLP cache: set before calling loadModel() to skip main-thread BLP decode
+    void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }
+
 private:
+    std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
     // GPU representation of M2 model
     struct M2ModelGPU {
         VkBuffer vertexBuffer = VK_NULL_HANDLE;
@@ -180,6 +185,7 @@ private:
 
         // Bone update throttling (skip frames for distant characters)
         uint32_t boneUpdateCounter = 0;
+        const M2ModelGPU* cachedModel = nullptr;  // Avoid per-frame hash lookups
 
         // Per-instance bone SSBO (double-buffered per frame)
         VkBuffer boneBuffer[2] = {};
diff --git a/include/rendering/m2_renderer.hpp b/include/rendering/m2_renderer.hpp
index 91616a28..1c35e34b 100644
--- a/include/rendering/m2_renderer.hpp
+++ b/include/rendering/m2_renderer.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "pipeline/m2_loader.hpp"
+#include "pipeline/blp_loader.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <glm/glm.hpp>
@@ -188,6 +189,7 @@ struct M2Instance {
     bool skipCollision = false;    // WMO interior doodads — skip player wall collision
     float cachedBoundRadius = 0.0f;
     float portalSpinAngle = 0.0f;  // Accumulated spin angle for portal rotation
+    const M2ModelGPU* cachedModel = nullptr;  // Avoid per-frame hash lookups
 
     // Frame-skip optimization (update distant animations less frequently)
     uint8_t frameSkipCounter = 0;
@@ -328,6 +330,10 @@ public:
 
     std::vector<glm::vec3> getWaterVegetationPositions(const glm::vec3& camPos, float maxDist) const;
 
+    // Pre-decoded BLP cache: set by terrain manager before calling loadModel()
+    // so loadTexture() can skip the expensive assetManager->loadTexture() call.
+    void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }
+
 private:
     bool initialized_ = false;
     bool insideInterior = false;
@@ -414,6 +420,8 @@ private:
     uint32_t modelLimitRejectWarnings_ = 0;
 
     VkTexture* loadTexture(const std::string& path, uint32_t texFlags = 0);
+    std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
+
     struct TextureCacheEntry {
         std::unique_ptr<VkTexture> texture;
         size_t approxBytes = 0;
diff --git a/include/rendering/terrain_manager.hpp b/include/rendering/terrain_manager.hpp
index 1b2af320..6f732721 100644
--- a/include/rendering/terrain_manager.hpp
+++ b/include/rendering/terrain_manager.hpp
@@ -121,6 +121,12 @@ struct PendingTile {
     // Pre-loaded terrain texture BLP data (loaded on background thread to avoid
     // blocking file I/O on the main thread during finalizeTile)
     std::unordered_map<std::string, pipeline::BLPImage> preloadedTextures;
+
+    // Pre-decoded M2 model textures (decoded on background thread)
+    std::unordered_map<std::string, pipeline::BLPImage> preloadedM2Textures;
+
+    // Pre-decoded WMO textures (decoded on background thread)
+    std::unordered_map<std::string, pipeline::BLPImage> preloadedWMOTextures;
 };
 
 /**
diff --git a/include/rendering/vk_context.hpp b/include/rendering/vk_context.hpp
index dab96d2a..907e21bf 100644
--- a/include/rendering/vk_context.hpp
+++ b/include/rendering/vk_context.hpp
@@ -50,9 +50,12 @@ public:
     // Batch upload mode: records multiple upload commands into a single
     // command buffer, then submits with ONE fence wait instead of one per upload.
     void beginUploadBatch();
-    void endUploadBatch();
+    void endUploadBatch();       // Async: submits but does NOT wait for fence
+    void endUploadBatchSync();   // Sync: submits and waits (for load screens)
     bool isInUploadBatch() const { return inUploadBatch_; }
     void deferStagingCleanup(AllocatedBuffer staging);
+    void pollUploadBatches();    // Check completed async uploads, free staging buffers
+    void waitAllUploads();       // Block until all in-flight uploads complete
 
     // Accessors
     VkInstance getInstance() const { return instance; }
@@ -157,6 +160,14 @@ private:
     VkCommandBuffer batchCmd_ = VK_NULL_HANDLE;
     std::vector<AllocatedBuffer> batchStagingBuffers_;
 
+    // Async upload: in-flight batches awaiting GPU completion
+    struct InFlightBatch {
+        VkFence fence = VK_NULL_HANDLE;
+        VkCommandBuffer cmd = VK_NULL_HANDLE;
+        std::vector<AllocatedBuffer> stagingBuffers;
+    };
+    std::vector<InFlightBatch> inFlightBatches_;
+
     // Depth buffer (shared across all framebuffers)
     VkImage depthImage = VK_NULL_HANDLE;
     VkImageView depthImageView = VK_NULL_HANDLE;
diff --git a/include/rendering/wmo_renderer.hpp b/include/rendering/wmo_renderer.hpp
index 095a354d..f0d3b36f 100644
--- a/include/rendering/wmo_renderer.hpp
+++ b/include/rendering/wmo_renderer.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "pipeline/blp_loader.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <glm/glm.hpp>
@@ -325,6 +326,12 @@ public:
     // Pre-compute floor cache for all loaded WMO instances
     void precomputeFloorCache();
 
+    // Pre-decoded BLP cache: set before calling loadModel() to skip main-thread BLP decode
+    void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }
+
+    // Defer normal/height map generation during streaming to avoid CPU stalls
+    void setDeferNormalMaps(bool defer) { deferNormalMaps_ = defer; }
+
 private:
     // WMO material UBO — matches WMOMaterial in wmo.frag.glsl
     struct WMOMaterialUBO {
@@ -558,6 +565,7 @@ private:
      * Load a texture from path
      */
     VkTexture* loadTexture(const std::string& path);
+    std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
 
     /**
      * Generate normal+height map from diffuse RGBA8 pixels
@@ -670,6 +678,7 @@ private:
 
     // Normal mapping / POM settings
     bool normalMappingEnabled_ = true;   // on by default
+    bool deferNormalMaps_ = false;       // skip normal map gen during streaming
     float normalMapStrength_ = 0.8f;     // 0.0 = flat, 1.0 = full, 2.0 = exaggerated
     bool pomEnabled_ = true;             // on by default
     int pomQuality_ = 1;                 // 0=Low(16), 1=Medium(32), 2=High(64)
diff --git a/src/core/application.cpp b/src/core/application.cpp
index f0c22a2c..f4712613 100644
--- a/src/core/application.cpp
+++ b/src/core/application.cpp
@@ -6883,7 +6883,7 @@ void Application::spawnOnlineGameObject(uint64_t guid, uint32_t entry, uint32_t
 void Application::processAsyncCreatureResults() {
     // Check completed async model loads and finalize on main thread (GPU upload + instance creation).
     // Limit GPU model uploads per frame to avoid spikes, but always drain cheap bookkeeping.
-    static constexpr int kMaxModelUploadsPerFrame = 3;
+    static constexpr int kMaxModelUploadsPerFrame = 1;
     int modelUploads = 0;
 
     for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) {
@@ -6925,13 +6925,17 @@ void Application::processAsyncCreatureResults() {
         }
 
         // Upload model to GPU (must happen on main thread)
+        // Use pre-decoded BLP cache to skip main-thread texture decode
+        charRenderer->setPredecodedBLPCache(&result.predecodedTextures);
         if (!charRenderer->loadModel(*result.model, result.modelId)) {
+            charRenderer->setPredecodedBLPCache(nullptr);
             nonRenderableCreatureDisplayIds_.insert(result.displayId);
             creaturePermanentFailureGuids_.insert(result.guid);
             pendingCreatureSpawnGuids_.erase(result.guid);
             creatureSpawnRetryCounts_.erase(result.guid);
             continue;
         }
+        charRenderer->setPredecodedBLPCache(nullptr);
         displayIdModelCache_[result.displayId] = result.modelId;
         modelUploads++;
 
@@ -6956,6 +6960,10 @@ void Application::processAsyncCreatureResults() {
 }
 
 void Application::processCreatureSpawnQueue() {
+    auto startTime = std::chrono::steady_clock::now();
+    // Budget: max 2ms per frame for creature spawning to prevent stutter.
+    static constexpr float kSpawnBudgetMs = 2.0f;
+
     // First, finalize any async model loads that completed on background threads.
     processAsyncCreatureResults();
 
@@ -6965,18 +6973,15 @@ void Application::processCreatureSpawnQueue() {
         if (!creatureLookupsBuilt_) return;
     }
 
-    auto startTime = std::chrono::steady_clock::now();
-    // Budget: max 4ms per frame for creature spawning to prevent stutter.
-    static constexpr float kSpawnBudgetMs = 4.0f;
-
     int processed = 0;
     int asyncLaunched = 0;
     size_t rotationsLeft = pendingCreatureSpawns_.size();
     while (!pendingCreatureSpawns_.empty() &&
            processed < MAX_SPAWNS_PER_FRAME &&
            rotationsLeft > 0) {
-        // Check time budget after each spawn (not for the first one, always process at least 1)
-        if (processed > 0) {
+        // Check time budget every iteration (including first — async results may
+        // have already consumed the budget via GPU model uploads).
+        {
             auto now = std::chrono::steady_clock::now();
             float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
             if (elapsedMs >= kSpawnBudgetMs) break;
@@ -7081,6 +7086,20 @@ void Application::processCreatureSpawnQueue() {
                         }
                     }
 
+                    // Pre-decode model textures on background thread
+                    for (const auto& tex : model->textures) {
+                        if (tex.filename.empty()) continue;
+                        std::string texKey = tex.filename;
+                        std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                        std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                        if (result.predecodedTextures.find(texKey) != result.predecodedTextures.end()) continue;
+                        auto blp = am->loadTexture(texKey);
+                        if (blp.isValid()) {
+                            result.predecodedTextures[texKey] = std::move(blp);
+                        }
+                    }
+
                     result.model = std::move(model);
                     result.valid = true;
                     return result;
@@ -7161,14 +7180,202 @@ void Application::processDeferredEquipmentQueue() {
     setOnlinePlayerEquipment(guid, equipData.first, equipData.second);
 }
 
+void Application::processAsyncGameObjectResults() {
+    for (auto it = asyncGameObjectLoads_.begin(); it != asyncGameObjectLoads_.end(); ) {
+        if (!it->future.valid() ||
+            it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
+            ++it;
+            continue;
+        }
+
+        auto result = it->future.get();
+        it = asyncGameObjectLoads_.erase(it);
+
+        if (!result.valid || !result.isWmo || !result.wmoModel) {
+            // Fallback: spawn via sync path (likely an M2 or failed WMO)
+            spawnOnlineGameObject(result.guid, result.entry, result.displayId,
+                                 result.x, result.y, result.z, result.orientation);
+            continue;
+        }
+
+        // WMO parsed on background thread — do GPU upload + instance creation on main thread
+        auto* wmoRenderer = renderer ? renderer->getWMORenderer() : nullptr;
+        if (!wmoRenderer) continue;
+
+        uint32_t modelId = 0;
+        auto itCache = gameObjectDisplayIdWmoCache_.find(result.displayId);
+        if (itCache != gameObjectDisplayIdWmoCache_.end()) {
+            modelId = itCache->second;
+        } else {
+            modelId = nextGameObjectWmoModelId_++;
+            wmoRenderer->setPredecodedBLPCache(&result.predecodedTextures);
+            if (!wmoRenderer->loadModel(*result.wmoModel, modelId)) {
+                wmoRenderer->setPredecodedBLPCache(nullptr);
+                LOG_WARNING("Failed to load async gameobject WMO: ", result.modelPath);
+                continue;
+            }
+            wmoRenderer->setPredecodedBLPCache(nullptr);
+            gameObjectDisplayIdWmoCache_[result.displayId] = modelId;
+        }
+
+        glm::vec3 renderPos = core::coords::canonicalToRender(
+            glm::vec3(result.x, result.y, result.z));
+        uint32_t instanceId = wmoRenderer->createInstance(
+            modelId, renderPos, glm::vec3(0.0f, 0.0f, result.orientation), 1.0f);
+        if (instanceId == 0) continue;
+
+        gameObjectInstances_[result.guid] = {modelId, instanceId, true};
+
+        // Queue transport doodad loading if applicable
+        std::string lowerPath = result.modelPath;
+        std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        if (lowerPath.find("transport") != std::string::npos) {
+            const auto* doodadTemplates = wmoRenderer->getDoodadTemplates(modelId);
+            if (doodadTemplates && !doodadTemplates->empty()) {
+                PendingTransportDoodadBatch batch;
+                batch.guid = result.guid;
+                batch.modelId = modelId;
+                batch.instanceId = instanceId;
+                batch.x = result.x;
+                batch.y = result.y;
+                batch.z = result.z;
+                batch.orientation = result.orientation;
+                batch.doodadBudget = doodadTemplates->size();
+                pendingTransportDoodadBatches_.push_back(batch);
+            }
+        }
+    }
+}
+
 void Application::processGameObjectSpawnQueue() {
+    // Finalize any completed async WMO loads first
+    processAsyncGameObjectResults();
+
     if (pendingGameObjectSpawns_.empty()) return;
 
-    // Only spawn 1 game object per frame — each can involve heavy synchronous
-    // WMO loading (root + groups from disk + GPU upload), easily 100ms+.
-    auto& s = pendingGameObjectSpawns_.front();
-    spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
-    pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
+    // Process spawns: cached WMOs and M2s go sync (cheap), uncached WMOs go async
+    auto startTime = std::chrono::steady_clock::now();
+    static constexpr float kBudgetMs = 2.0f;
+    static constexpr int kMaxAsyncLoads = 2;
+
+    while (!pendingGameObjectSpawns_.empty()) {
+        float elapsedMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - startTime).count();
+        if (elapsedMs >= kBudgetMs) break;
+
+        auto& s = pendingGameObjectSpawns_.front();
+
+        // Check if this is an uncached WMO that needs async loading
+        std::string modelPath;
+        if (gameObjectLookupsBuilt_) {
+            // Check transport overrides first
+            bool isTransport = gameHandler && gameHandler->isTransportGuid(s.guid);
+            if (isTransport) {
+                if (s.entry == 20808 || s.entry == 176231 || s.entry == 176310)
+                    modelPath = "World\\wmo\\transports\\transport_ship\\transportship.wmo";
+                else if (s.displayId == 807 || s.displayId == 808 || s.displayId == 175080 || s.displayId == 176495 || s.displayId == 164871)
+                    modelPath = "World\\wmo\\transports\\transport_zeppelin\\transport_zeppelin.wmo";
+                else if (s.displayId == 1587)
+                    modelPath = "World\\wmo\\transports\\transport_horde_zeppelin\\Transport_Horde_Zeppelin.wmo";
+                else if (s.displayId == 2454 || s.displayId == 181688 || s.displayId == 190536)
+                    modelPath = "World\\wmo\\transports\\icebreaker\\Transport_Icebreaker_ship.wmo";
+            }
+            if (modelPath.empty())
+                modelPath = getGameObjectModelPathForDisplayId(s.displayId);
+        }
+
+        std::string lowerPath = modelPath;
+        std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        bool isWmo = lowerPath.size() >= 4 && lowerPath.substr(lowerPath.size() - 4) == ".wmo";
+        bool isCached = isWmo && gameObjectDisplayIdWmoCache_.count(s.displayId);
+
+        if (isWmo && !isCached && !modelPath.empty() &&
+            static_cast<int>(asyncGameObjectLoads_.size()) < kMaxAsyncLoads) {
+            // Launch async WMO load — file I/O + parse on background thread
+            auto* am = assetManager.get();
+            PendingGameObjectSpawn capture = s;
+            std::string capturePath = modelPath;
+            AsyncGameObjectLoad load;
+            load.future = std::async(std::launch::async,
+                [am, capture, capturePath]() -> PreparedGameObjectWMO {
+                    PreparedGameObjectWMO result;
+                    result.guid = capture.guid;
+                    result.entry = capture.entry;
+                    result.displayId = capture.displayId;
+                    result.x = capture.x;
+                    result.y = capture.y;
+                    result.z = capture.z;
+                    result.orientation = capture.orientation;
+                    result.modelPath = capturePath;
+                    result.isWmo = true;
+
+                    auto wmoData = am->readFile(capturePath);
+                    if (wmoData.empty()) return result;
+
+                    auto wmo = std::make_shared<pipeline::WMOModel>(
+                        pipeline::WMOLoader::load(wmoData));
+
+                    // Load groups
+                    if (wmo->nGroups > 0) {
+                        std::string basePath = capturePath;
+                        std::string ext;
+                        if (basePath.size() > 4) {
+                            ext = basePath.substr(basePath.size() - 4);
+                            basePath = basePath.substr(0, basePath.size() - 4);
+                        }
+                        for (uint32_t gi = 0; gi < wmo->nGroups; gi++) {
+                            char suffix[16];
+                            snprintf(suffix, sizeof(suffix), "_%03u%s", gi, ext.c_str());
+                            auto groupData = am->readFile(basePath + suffix);
+                            if (groupData.empty()) {
+                                snprintf(suffix, sizeof(suffix), "_%03u.wmo", gi);
+                                groupData = am->readFile(basePath + suffix);
+                            }
+                            if (!groupData.empty()) {
+                                pipeline::WMOLoader::loadGroup(groupData, *wmo, gi);
+                            }
+                        }
+                    }
+
+                    // Pre-decode WMO textures on background thread
+                    for (const auto& texPath : wmo->textures) {
+                        if (texPath.empty()) continue;
+                        std::string texKey = texPath;
+                        size_t nul = texKey.find('\0');
+                        if (nul != std::string::npos) texKey.resize(nul);
+                        std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                        std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                        if (texKey.empty()) continue;
+                        // Convert to .blp extension
+                        if (texKey.size() >= 4) {
+                            std::string ext = texKey.substr(texKey.size() - 4);
+                            if (ext == ".tga" || ext == ".dds") {
+                                texKey = texKey.substr(0, texKey.size() - 4) + ".blp";
+                            }
+                        }
+                        if (result.predecodedTextures.find(texKey) != result.predecodedTextures.end()) continue;
+                        auto blp = am->loadTexture(texKey);
+                        if (blp.isValid()) {
+                            result.predecodedTextures[texKey] = std::move(blp);
+                        }
+                    }
+
+                    result.wmoModel = wmo;
+                    result.valid = true;
+                    return result;
+                });
+            asyncGameObjectLoads_.push_back(std::move(load));
+            pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
+            continue;
+        }
+
+        // Cached WMO or M2 — spawn synchronously (cheap)
+        spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
+        pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
+    }
 }
 
 void Application::processPendingTransportDoodads() {
diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp
index f735dd7d..040a301d 100644
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@@ -625,7 +625,18 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
         return whiteTexture_.get();
     }
 
-    auto blpImage = assetManager->loadTexture(key);
+    // Check pre-decoded BLP cache first (populated by background threads)
+    pipeline::BLPImage blpImage;
+    if (predecodedBLPCache_) {
+        auto pit = predecodedBLPCache_->find(key);
+        if (pit != predecodedBLPCache_->end()) {
+            blpImage = std::move(pit->second);
+            predecodedBLPCache_->erase(pit);
+        }
+    }
+    if (!blpImage.isValid()) {
+        blpImage = assetManager->loadTexture(key);
+    }
     if (!blpImage.isValid()) {
         // Return white fallback but don't cache the failure — allow retry
         // on next character load in case the asset becomes available.
@@ -1412,8 +1423,9 @@ uint32_t CharacterRenderer::createInstance(uint32_t modelId, const glm::vec3& po
     instance.scale = scale;
 
     // Initialize bone matrices to identity
-    auto& model = models[modelId].data;
-    instance.boneMatrices.resize(std::max(static_cast<size_t>(1), model.bones.size()), glm::mat4(1.0f));
+    auto& gpuRef = models[modelId];
+    instance.boneMatrices.resize(std::max(static_cast<size_t>(1), gpuRef.data.bones.size()), glm::mat4(1.0f));
+    instance.cachedModel = &gpuRef;
 
     uint32_t id = instance.id;
     instances[id] = std::move(instance);
@@ -1511,13 +1523,12 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
         if (distSq >= animUpdateRadiusSq) continue;
 
         // Always advance animation time (cheap)
-        auto modelIt = models.find(inst.modelId);
-        if (modelIt != models.end() && !modelIt->second.data.sequences.empty()) {
+        if (inst.cachedModel && !inst.cachedModel->data.sequences.empty()) {
             if (inst.currentSequenceIndex < 0) {
                 inst.currentSequenceIndex = 0;
-                inst.currentAnimationId = modelIt->second.data.sequences[0].id;
+                inst.currentAnimationId = inst.cachedModel->data.sequences[0].id;
             }
-            const auto& seq = modelIt->second.data.sequences[inst.currentSequenceIndex];
+            const auto& seq = inst.cachedModel->data.sequences[inst.currentSequenceIndex];
             inst.animationTime += deltaTime * 1000.0f;
             if (seq.duration > 0 && inst.animationTime >= static_cast<float>(seq.duration)) {
                 if (inst.animationLoop) {
@@ -1528,10 +1539,11 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
             }
         }
 
-        // Distance-tiered bone throttling: near=every frame, mid=every 3rd, far=every 6th
+        // Distance-tiered bone throttling: near=every frame, mid=every 4th, far=every 8th
         uint32_t boneInterval = 1;
-        if (distSq > 60.0f * 60.0f) boneInterval = 6;
-        else if (distSq > 30.0f * 30.0f) boneInterval = 3;
+        if (distSq > 40.0f * 40.0f) boneInterval = 8;
+        else if (distSq > 20.0f * 20.0f) boneInterval = 4;
+        else if (distSq > 10.0f * 10.0f) boneInterval = 2;
 
         inst.boneUpdateCounter++;
         bool needsBones = (inst.boneUpdateCounter >= boneInterval) || inst.boneMatrices.empty();
@@ -1615,11 +1627,8 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
 }
 
 void CharacterRenderer::updateAnimation(CharacterInstance& instance, float deltaTime) {
-    auto modelIt = models.find(instance.modelId);
-    if (modelIt == models.end()) {
-        return;
-    }
-    const auto& model = modelIt->second.data;
+    if (!instance.cachedModel) return;
+    const auto& model = instance.cachedModel->data;
 
     if (model.sequences.empty()) {
         return;
@@ -1732,7 +1741,8 @@ glm::quat CharacterRenderer::interpolateQuat(const pipeline::M2AnimationTrack& t
 // --- Bone transform calculation ---
 
 void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
-    auto& model = models[instance.modelId].data;
+    if (!instance.cachedModel) return;
+    auto& model = instance.cachedModel->data;
 
     if (model.bones.empty()) {
         return;
@@ -1833,9 +1843,8 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
             }
         }
 
-        auto modelIt = models.find(instance.modelId);
-        if (modelIt == models.end()) continue;
-        const auto& gpuModel = modelIt->second;
+        if (!instance.cachedModel) continue;
+        const auto& gpuModel = *instance.cachedModel;
 
         // Skip models without GPU buffers
         if (!gpuModel.vertexBuffer) continue;
@@ -2487,9 +2496,8 @@ void CharacterRenderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& light
         glm::vec3 diff = inst.position - shadowCenter;
         if (glm::dot(diff, diff) > shadowRadiusSq) continue;
 
-        auto modelIt = models.find(inst.modelId);
-        if (modelIt == models.end()) continue;
-        const M2ModelGPU& gpuModel = modelIt->second;
+        if (!inst.cachedModel) continue;
+        const M2ModelGPU& gpuModel = *inst.cachedModel;
         if (!gpuModel.vertexBuffer) continue;
 
         glm::mat4 modelMat = inst.hasOverrideModelMatrix
diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp
index c4e7a727..d455e494 100644
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@@ -1657,6 +1657,7 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
     instance.cachedIsInvisibleTrap = mdlRef.isInvisibleTrap;
     instance.cachedIsInstancePortal = mdlRef.isInstancePortal;
     instance.cachedIsValid = mdlRef.isValid();
+    instance.cachedModel = &mdlRef;
 
     // Initialize animation: play first sequence (usually Stand/Idle)
     const auto& mdl = mdlRef;
@@ -1748,6 +1749,7 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
     instance.cachedIsGroundDetail = mdl2.isGroundDetail;
     instance.cachedIsInvisibleTrap = mdl2.isInvisibleTrap;
     instance.cachedIsValid = mdl2.isValid();
+    instance.cachedModel = &mdl2;
 
     // Initialize animation
     if (mdl2.hasAnimation && !mdl2.disableAnimation && !mdl2.sequences.empty()) {
@@ -2026,9 +2028,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
         instance.animTime += dtMs * (instance.animSpeed - 1.0f);
 
         // For animation looping/variation, we need the actual model data.
-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
-        const M2ModelGPU& model = it->second;
+        if (!instance.cachedModel) continue;
+        const M2ModelGPU& model = *instance.cachedModel;
 
         // Validate sequence index
         if (instance.currentSequenceIndex < 0 ||
@@ -2084,6 +2085,14 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
         float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f);
         if (cullRadius > 0.0f && !updateFrustum.intersectsSphere(instance.position, paddedRadius)) continue;
 
+        // Distance-based frame skipping: update distant bones less frequently
+        uint32_t boneInterval = 1;
+        if (distSq > 200.0f * 200.0f) boneInterval = 8;
+        else if (distSq > 100.0f * 100.0f) boneInterval = 4;
+        else if (distSq > 50.0f * 50.0f) boneInterval = 2;
+        instance.frameSkipCounter++;
+        if ((instance.frameSkipCounter % boneInterval) != 0) continue;
+
         boneWorkIndices_.push_back(idx);
     }
 
@@ -2097,9 +2106,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
             for (size_t i : boneWorkIndices_) {
                 if (i >= instances.size()) continue;
                 auto& inst = instances[i];
-                auto mdlIt = models.find(inst.modelId);
-                if (mdlIt == models.end()) continue;
-                computeBoneMatrices(mdlIt->second, inst);
+                if (!inst.cachedModel) continue;
+                computeBoneMatrices(*inst.cachedModel, inst);
             }
         } else {
             // Parallel — dispatch across worker threads
@@ -2112,9 +2120,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
                 for (size_t i : boneWorkIndices_) {
                     if (i >= instances.size()) continue;
                     auto& inst = instances[i];
-                    auto mdlIt = models.find(inst.modelId);
-                    if (mdlIt == models.end()) continue;
-                    computeBoneMatrices(mdlIt->second, inst);
+                    if (!inst.cachedModel) continue;
+                    computeBoneMatrices(*inst.cachedModel, inst);
                 }
             } else {
                 const size_t chunkSize = animCount / numThreads;
@@ -2135,9 +2142,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
                                 size_t idx = boneWorkIndices_[j];
                                 if (idx >= instances.size()) continue;
                                 auto& inst = instances[idx];
-                                auto mdlIt = models.find(inst.modelId);
-                                if (mdlIt == models.end()) continue;
-                                computeBoneMatrices(mdlIt->second, inst);
+                                if (!inst.cachedModel) continue;
+                                computeBoneMatrices(*inst.cachedModel, inst);
                             }
                         }));
                     start = end;
@@ -2159,9 +2165,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
         glm::vec3 toCam = instance.position - cachedCamPos_;
         float distSq = glm::dot(toCam, toCam);
         if (distSq > cachedMaxRenderDistSq_) continue;
-        auto mdlIt = models.find(instance.modelId);
-        if (mdlIt == models.end()) continue;
-        emitParticles(instance, mdlIt->second, deltaTime);
+        if (!instance.cachedModel) continue;
+        emitParticles(instance, *instance.cachedModel, deltaTime);
         updateParticles(instance, deltaTime);
     }
 
@@ -2865,9 +2870,8 @@ void M2Renderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& lightSpaceMa
             glm::vec3 diff = instance.position - shadowCenter;
             if (glm::dot(diff, diff) > shadowRadiusSq) continue;
 
-            auto modelIt = models.find(instance.modelId);
-            if (modelIt == models.end()) continue;
-            const M2ModelGPU& model = modelIt->second;
+            if (!instance.cachedModel) continue;
+            const M2ModelGPU& model = *instance.cachedModel;
 
             // Filter: only draw foliage models in foliage pass, non-foliage in non-foliage pass
             if (model.shadowWindFoliage != foliagePass) continue;
@@ -2973,8 +2977,7 @@ std::vector<glm::vec3> M2Renderer::getWaterVegetationPositions(const glm::vec3&
     std::vector<glm::vec3> result;
     float maxDistSq = maxDist * maxDist;
     for (const auto& inst : instances) {
-        auto it = models.find(inst.modelId);
-        if (it == models.end() || !it->second.isWaterVegetation) continue;
+        if (!inst.cachedModel || !inst.cachedModel->isWaterVegetation) continue;
         glm::vec3 diff = inst.position - camPos;
         if (glm::dot(diff, diff) <= maxDistSq) {
             result.push_back(inst.position);
@@ -3085,9 +3088,8 @@ void M2Renderer::emitParticles(M2Instance& inst, const M2ModelGPU& gpu, float dt
 }
 
 void M2Renderer::updateParticles(M2Instance& inst, float dt) {
-    auto it = models.find(inst.modelId);
-    if (it == models.end()) return;
-    const auto& gpu = it->second;
+    if (!inst.cachedModel) return;
+    const auto& gpu = *inst.cachedModel;
 
     for (size_t i = 0; i < inst.particles.size(); ) {
         auto& p = inst.particles[i];
@@ -3162,9 +3164,8 @@ void M2Renderer::renderM2Particles(VkCommandBuffer cmd, VkDescriptorSet perFrame
 
     for (auto& inst : instances) {
         if (inst.particles.empty()) continue;
-        auto it = models.find(inst.modelId);
-        if (it == models.end()) continue;
-        const auto& gpu = it->second;
+        if (!inst.cachedModel) continue;
+        const auto& gpu = *inst.cachedModel;
 
         for (const auto& p : inst.particles) {
             if (p.emitterIndex < 0 || p.emitterIndex >= static_cast<int>(gpu.particleEmitters.size())) continue;
@@ -3549,9 +3550,13 @@ void M2Renderer::rebuildSpatialIndex() {
     particleInstanceIndices_.clear();
 
     for (size_t i = 0; i < instances.size(); i++) {
-        const auto& inst = instances[i];
+        auto& inst = instances[i];
         instanceIndexById[inst.id] = i;
 
+        // Re-cache model pointer (may have changed after model map modifications)
+        auto mdlIt = models.find(inst.modelId);
+        inst.cachedModel = (mdlIt != models.end()) ? &mdlIt->second : nullptr;
+
         // Rebuild dedup map (skip ground detail)
         if (!inst.cachedIsGroundDetail) {
             DedupKey dk{inst.modelId,
@@ -3684,8 +3689,18 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) {
         containsToken(key, "campfire") ||
         containsToken(key, "bonfire");
 
-    // Load BLP texture
-    pipeline::BLPImage blp = assetManager->loadTexture(key);
+    // Check pre-decoded BLP cache first (populated by background worker threads)
+    pipeline::BLPImage blp;
+    if (predecodedBLPCache_) {
+        auto pit = predecodedBLPCache_->find(key);
+        if (pit != predecodedBLPCache_->end()) {
+            blp = std::move(pit->second);
+            predecodedBLPCache_->erase(pit);
+        }
+    }
+    if (!blp.isValid()) {
+        blp = assetManager->loadTexture(key);
+    }
     if (!blp.isValid()) {
         // Return white fallback but don't cache the failure — MPQ reads can
         // fail transiently during streaming; allow retry on next model load.
@@ -3751,9 +3766,8 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) {
 uint32_t M2Renderer::getTotalTriangleCount() const {
     uint32_t total = 0;
     for (const auto& instance : instances) {
-        auto it = models.find(instance.modelId);
-        if (it != models.end()) {
-            total += it->second.indexCount / 3;
+        if (instance.cachedModel) {
+            total += instance.cachedModel->indexCount / 3;
         }
     }
     return total;
@@ -3775,11 +3789,10 @@ std::optional<float> M2Renderer::getFloorHeight(float glX, float glY, float glZ,
             continue;
         }
 
-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
+        if (!instance.cachedModel) continue;
         if (instance.scale <= 0.001f) continue;
 
-        const M2ModelGPU& model = it->second;
+        const M2ModelGPU& model = *instance.cachedModel;
         if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
         if (instance.skipCollision) continue;
 
@@ -3931,10 +3944,9 @@ bool M2Renderer::checkCollision(const glm::vec3& from, const glm::vec3& to,
         if (from.z > instance.worldBoundsMax.z + 2.5f && adjustedPos.z > instance.worldBoundsMax.z + 2.5f) continue;
         if (from.z + 2.5f < instance.worldBoundsMin.z && adjustedPos.z + 2.5f < instance.worldBoundsMin.z) continue;
 
-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
+        if (!instance.cachedModel) continue;
 
-        const M2ModelGPU& model = it->second;
+        const M2ModelGPU& model = *instance.cachedModel;
         if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
         if (instance.skipCollision) continue;
         if (instance.scale <= 0.001f) continue;
@@ -4172,10 +4184,9 @@ float M2Renderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3&
             continue;
         }
 
-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
+        if (!instance.cachedModel) continue;
 
-        const M2ModelGPU& model = it->second;
+        const M2ModelGPU& model = *instance.cachedModel;
         if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
         glm::vec3 localMin, localMax;
         getTightCollisionBounds(model, localMin, localMax);
diff --git a/src/rendering/renderer.cpp b/src/rendering/renderer.cpp
index 69bfecdb..55ba1370 100644
--- a/src/rendering/renderer.cpp
+++ b/src/rendering/renderer.cpp
@@ -2434,6 +2434,9 @@ void Renderer::update(float deltaTime) {
         cameraController->update(deltaTime);
         auto cameraEnd = std::chrono::steady_clock::now();
         lastCameraUpdateMs = std::chrono::duration<double, std::milli>(cameraEnd - cameraStart).count();
+        if (lastCameraUpdateMs > 3.0) {
+            LOG_WARNING("SLOW cameraController->update: ", lastCameraUpdateMs, "ms");
+        }
 
         // Update 3D audio listener position/orientation to match camera
         if (camera) {
@@ -2779,8 +2782,15 @@ void Renderer::update(float deltaTime) {
 
     // Update M2 doodad animations (pass camera for frustum-culling bone computation)
     if (m2Renderer && camera) {
+        auto m2Start = std::chrono::steady_clock::now();
         m2Renderer->update(deltaTime, camera->getPosition(),
                            camera->getProjectionMatrix() * camera->getViewMatrix());
+        float m2Ms = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - m2Start).count();
+        if (m2Ms > 3.0f) {
+            LOG_WARNING("SLOW m2Renderer->update: ", m2Ms, "ms (",
+                        m2Renderer->getInstanceCount(), " instances)");
+        }
     }
 
     // Helper: play zone music, dispatching local files (file: prefix) vs MPQ paths
diff --git a/src/rendering/terrain_manager.cpp b/src/rendering/terrain_manager.cpp
index 20a2e9a1..97527c8c 100644
--- a/src/rendering/terrain_manager.cpp
+++ b/src/rendering/terrain_manager.cpp
@@ -231,9 +231,14 @@ bool TerrainManager::loadTile(int x, int y) {
         return false;
     }
 
+    VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
+    if (vkCtx) vkCtx->beginUploadBatch();
+
     FinalizingTile ft;
     ft.pending = std::move(pending);
     while (!advanceFinalization(ft)) {}
+
+    if (vkCtx) vkCtx->endUploadBatchSync();  // Sync — caller expects tile ready
     return true;
 }
 
@@ -407,6 +412,20 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
             return false;
         }
 
+        // Pre-decode M2 model textures on background thread
+        for (const auto& tex : m2Model.textures) {
+            if (tex.filename.empty()) continue;
+            std::string texKey = tex.filename;
+            std::replace(texKey.begin(), texKey.end(), '/', '\\');
+            std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+            if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue;
+            auto blp = assetManager->loadTexture(texKey);
+            if (blp.isValid()) {
+                pending->preloadedM2Textures[texKey] = std::move(blp);
+            }
+        }
+
         PendingTile::M2Ready ready;
         ready.modelId = modelId;
         ready.model = std::move(m2Model);
@@ -584,6 +603,20 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
                                 pipeline::M2Loader::loadSkin(skinData, m2Model);
                             }
                             if (!m2Model.isValid()) continue;
+
+                            // Pre-decode doodad M2 textures on background thread
+                            for (const auto& tex : m2Model.textures) {
+                                if (tex.filename.empty()) continue;
+                                std::string texKey = tex.filename;
+                                std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                                std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                               [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                                if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue;
+                                auto blp = assetManager->loadTexture(texKey);
+                                if (blp.isValid()) {
+                                    pending->preloadedM2Textures[texKey] = std::move(blp);
+                                }
+                            }
                         }
 
                         // Build doodad's local transform (WoW coordinates)
@@ -654,6 +687,32 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
                     }
                 }
 
+                // Pre-decode WMO textures on background thread
+                for (const auto& texPath : wmoModel.textures) {
+                    if (texPath.empty()) continue;
+                    std::string texKey = texPath;
+                    // Truncate at NUL (WMO paths can have stray bytes)
+                    size_t nul = texKey.find('\0');
+                    if (nul != std::string::npos) texKey.resize(nul);
+                    std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                    std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                    if (texKey.empty()) continue;
+                    if (pending->preloadedWMOTextures.find(texKey) != pending->preloadedWMOTextures.end()) continue;
+                    // Try .blp variant
+                    std::string blpKey = texKey;
+                    if (blpKey.size() >= 4) {
+                        std::string ext = blpKey.substr(blpKey.size() - 4);
+                        if (ext == ".tga" || ext == ".dds") {
+                            blpKey = blpKey.substr(0, blpKey.size() - 4) + ".blp";
+                        }
+                    }
+                    auto blp = assetManager->loadTexture(blpKey);
+                    if (blp.isValid()) {
+                        pending->preloadedWMOTextures[blpKey] = std::move(blp);
+                    }
+                }
+
                 PendingTile::WMOReady ready;
                 // Cache WMO model uploads by path; placement dedup uses uniqueId separately.
                 ready.modelId = static_cast<uint32_t>(std::hash<std::string>{}(wmoPath));
@@ -741,7 +800,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
             }
             bool allDone = terrainRenderer->loadTerrainIncremental(
                 pending->mesh, pending->terrain.textures, x, y,
-                ft.terrainChunkNext, 64);
+                ft.terrainChunkNext, 32);
             if (!allDone) {
                 return false; // More chunks remain — yield to time budget
             }
@@ -773,7 +832,9 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
     case FinalizationPhase::M2_MODELS: {
         // Upload multiple M2 models per call (batched GPU uploads)
         if (m2Renderer && ft.m2ModelIndex < pending->m2Models.size()) {
-            constexpr size_t kModelsPerStep = 8;
+            // Set pre-decoded BLP cache so loadTexture() skips main-thread BLP decode
+            m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
+            constexpr size_t kModelsPerStep = 4;
             size_t uploaded = 0;
             while (ft.m2ModelIndex < pending->m2Models.size() && uploaded < kModelsPerStep) {
                 auto& m2Ready = pending->m2Models[ft.m2ModelIndex];
@@ -786,6 +847,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
                 ft.m2ModelIndex++;
                 uploaded++;
             }
+            m2Renderer->setPredecodedBLPCache(nullptr);
             // Stay in this phase until all models uploaded
             if (ft.m2ModelIndex < pending->m2Models.size()) {
                 return false;
@@ -830,8 +892,11 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
         // Upload multiple WMO models per call (batched GPU uploads)
         if (wmoRenderer && assetManager) {
             wmoRenderer->initialize(nullptr, VK_NULL_HANDLE, assetManager);
+            // Set pre-decoded BLP cache and defer normal maps during streaming
+            wmoRenderer->setPredecodedBLPCache(&pending->preloadedWMOTextures);
+            wmoRenderer->setDeferNormalMaps(true);
 
-            constexpr size_t kWmosPerStep = 4;
+            constexpr size_t kWmosPerStep = 1;
             size_t uploaded = 0;
             while (ft.wmoModelIndex < pending->wmoModels.size() && uploaded < kWmosPerStep) {
                 auto& wmoReady = pending->wmoModels[ft.wmoModelIndex];
@@ -843,6 +908,8 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
                     uploaded++;
                 }
             }
+            wmoRenderer->setDeferNormalMaps(false);
+            wmoRenderer->setPredecodedBLPCache(nullptr);
             if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
         }
         ft.phase = FinalizationPhase::WMO_INSTANCES;
@@ -906,7 +973,9 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
     case FinalizationPhase::WMO_DOODADS: {
         // Upload multiple WMO doodad M2s per call (batched GPU uploads)
         if (m2Renderer && ft.wmoDoodadIndex < pending->wmoDoodads.size()) {
-            constexpr size_t kDoodadsPerStep = 16;
+            // Set pre-decoded BLP cache for doodad M2 textures
+            m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
+            constexpr size_t kDoodadsPerStep = 4;
             size_t uploaded = 0;
             while (ft.wmoDoodadIndex < pending->wmoDoodads.size() && uploaded < kDoodadsPerStep) {
                 auto& doodad = pending->wmoDoodads[ft.wmoDoodadIndex];
@@ -923,6 +992,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
                 ft.wmoDoodadIndex++;
                 uploaded++;
             }
+            m2Renderer->setPredecodedBLPCache(nullptr);
             if (ft.wmoDoodadIndex < pending->wmoDoodads.size()) return false;
         }
         ft.phase = FinalizationPhase::WATER;
@@ -1080,11 +1150,6 @@ void TerrainManager::workerLoop() {
 }
 
 void TerrainManager::processReadyTiles() {
-    // Process tiles with time budget to avoid frame spikes
-    // Taxi mode gets a slightly larger budget to avoid visible late-pop terrain/models.
-    const float timeBudgetMs = taxiStreamingMode_ ? 8.0f : 3.0f;
-    auto startTime = std::chrono::high_resolution_clock::now();
-
     // Move newly ready tiles into the finalizing deque.
     // Keep them in pendingTiles so streamTiles() won't re-enqueue them.
     {
@@ -1100,28 +1165,32 @@ void TerrainManager::processReadyTiles() {
         }
     }
 
-    // Outer upload batch: all GPU uploads across all advanceFinalization calls
-    // this frame share a single command buffer submission + fence wait.
     VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
+
+    // Reclaim completed async uploads from previous frames (non-blocking)
+    if (vkCtx) vkCtx->pollUploadBatches();
+
+    // Nothing to finalize — done.
+    if (finalizingTiles_.empty()) return;
+
+    // Async upload batch: record GPU copies into a command buffer, submit with
+    // a fence, but DON'T wait.  The fence is polled on subsequent frames.
+    // This eliminates the main-thread stall from vkWaitForFences entirely.
+    const int maxSteps = taxiStreamingMode_ ? 8 : 2;
+    int steps = 0;
+
     if (vkCtx) vkCtx->beginUploadBatch();
 
-    // Drive incremental finalization within time budget
-    while (!finalizingTiles_.empty()) {
+    while (!finalizingTiles_.empty() && steps < maxSteps) {
         auto& ft = finalizingTiles_.front();
         bool done = advanceFinalization(ft);
-
         if (done) {
             finalizingTiles_.pop_front();
         }
-
-        auto now = std::chrono::high_resolution_clock::now();
-        float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
-        if (elapsedMs >= timeBudgetMs) {
-            break;
-        }
+        steps++;
     }
 
-    if (vkCtx) vkCtx->endUploadBatch();
+    if (vkCtx) vkCtx->endUploadBatch();  // Async — submits but doesn't wait
 }
 
 void TerrainManager::processAllReadyTiles() {
@@ -1151,7 +1220,7 @@ void TerrainManager::processAllReadyTiles() {
         finalizingTiles_.pop_front();
     }
 
-    if (vkCtx) vkCtx->endUploadBatch();
+    if (vkCtx) vkCtx->endUploadBatchSync();  // Sync — load screen needs data ready
 }
 
 void TerrainManager::processOneReadyTile() {
@@ -1177,7 +1246,7 @@ void TerrainManager::processOneReadyTile() {
         while (!advanceFinalization(ft)) {}
         finalizingTiles_.pop_front();
 
-        if (vkCtx) vkCtx->endUploadBatch();
+        if (vkCtx) vkCtx->endUploadBatchSync();  // Sync — load screen needs data ready
     }
 }
 
diff --git a/src/rendering/vk_context.cpp b/src/rendering/vk_context.cpp
index dc73c685..79e7eac3 100644
--- a/src/rendering/vk_context.cpp
+++ b/src/rendering/vk_context.cpp
@@ -67,6 +67,14 @@ void VkContext::shutdown() {
         frame = {};
     }
 
+    // Clean up any in-flight async upload batches (device already idle)
+    for (auto& batch : inFlightBatches_) {
+        // Staging buffers: skip destroy — allocator is about to be torn down
+        vkDestroyFence(device, batch.fence, nullptr);
+        // Command buffer freed when pool is destroyed below
+    }
+    inFlightBatches_.clear();
+
     if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; }
     if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; }
 
@@ -1447,17 +1455,94 @@ void VkContext::endUploadBatch() {
 
     inUploadBatch_ = false;
 
-    // Submit all recorded commands with a single fence wait
+    if (batchStagingBuffers_.empty()) {
+        // No GPU copies were recorded — skip the submit entirely.
+        vkEndCommandBuffer(batchCmd_);
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
+        batchCmd_ = VK_NULL_HANDLE;
+        return;
+    }
+
+    // Submit commands with a NEW fence — don't wait, let GPU work in parallel.
+    vkEndCommandBuffer(batchCmd_);
+
+    VkFenceCreateInfo fenceInfo{};
+    fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+    VkFence fence = VK_NULL_HANDLE;
+    vkCreateFence(device, &fenceInfo, nullptr, &fence);
+
+    VkSubmitInfo submitInfo{};
+    submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    submitInfo.commandBufferCount = 1;
+    submitInfo.pCommandBuffers = &batchCmd_;
+    vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence);
+
+    // Stash everything for later cleanup when fence signals
+    InFlightBatch batch;
+    batch.fence = fence;
+    batch.cmd = batchCmd_;
+    batch.stagingBuffers = std::move(batchStagingBuffers_);
+    inFlightBatches_.push_back(std::move(batch));
+
+    batchCmd_ = VK_NULL_HANDLE;
+    batchStagingBuffers_.clear();
+}
+
+void VkContext::endUploadBatchSync() {
+    if (uploadBatchDepth_ <= 0) return;
+    uploadBatchDepth_--;
+    if (uploadBatchDepth_ > 0) return;
+
+    inUploadBatch_ = false;
+
+    if (batchStagingBuffers_.empty()) {
+        vkEndCommandBuffer(batchCmd_);
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
+        batchCmd_ = VK_NULL_HANDLE;
+        return;
+    }
+
+    // Synchronous path for load screens — submit and wait
     endSingleTimeCommands(batchCmd_);
     batchCmd_ = VK_NULL_HANDLE;
 
-    // Destroy all deferred staging buffers
     for (auto& staging : batchStagingBuffers_) {
         destroyBuffer(allocator, staging);
     }
     batchStagingBuffers_.clear();
 }
 
+void VkContext::pollUploadBatches() {
+    if (inFlightBatches_.empty()) return;
+
+    for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ) {
+        VkResult result = vkGetFenceStatus(device, it->fence);
+        if (result == VK_SUCCESS) {
+            // GPU finished — free resources
+            for (auto& staging : it->stagingBuffers) {
+                destroyBuffer(allocator, staging);
+            }
+            vkFreeCommandBuffers(device, immCommandPool, 1, &it->cmd);
+            vkDestroyFence(device, it->fence, nullptr);
+            it = inFlightBatches_.erase(it);
+        } else {
+            ++it;
+        }
+    }
+}
+
+void VkContext::waitAllUploads() {
+    for (auto& batch : inFlightBatches_) {
+        vkWaitForFences(device, 1, &batch.fence, VK_TRUE, UINT64_MAX);
+        for (auto& staging : batch.stagingBuffers) {
+            destroyBuffer(allocator, staging);
+        }
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batch.cmd);
+        vkDestroyFence(device, batch.fence, nullptr);
+    }
+    inFlightBatches_.clear();
+}
+
 void VkContext::deferStagingCleanup(AllocatedBuffer staging) {
     batchStagingBuffers_.push_back(staging);
 }
diff --git a/src/rendering/wmo_renderer.cpp b/src/rendering/wmo_renderer.cpp
index 691abaa1..5dec0e3e 100644
--- a/src/rendering/wmo_renderer.cpp
+++ b/src/rendering/wmo_renderer.cpp
@@ -2325,13 +2325,27 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
     const auto& attemptedCandidates = uniqueCandidates;
 
     // Try loading all candidates until one succeeds
+    // Check pre-decoded BLP cache first (populated by background worker threads)
     pipeline::BLPImage blp;
     std::string resolvedKey;
-    for (const auto& c : attemptedCandidates) {
-        blp = assetManager->loadTexture(c);
-        if (blp.isValid()) {
-            resolvedKey = c;
-            break;
+    if (predecodedBLPCache_) {
+        for (const auto& c : uniqueCandidates) {
+            auto pit = predecodedBLPCache_->find(c);
+            if (pit != predecodedBLPCache_->end()) {
+                blp = std::move(pit->second);
+                predecodedBLPCache_->erase(pit);
+                resolvedKey = c;
+                break;
+            }
+        }
+    }
+    if (!blp.isValid()) {
+        for (const auto& c : attemptedCandidates) {
+            blp = assetManager->loadTexture(c);
+            if (blp.isValid()) {
+                resolvedKey = c;
+                break;
+            }
         }
     }
     if (!blp.isValid()) {
@@ -2369,10 +2383,10 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
     texture->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
                             VK_SAMPLER_ADDRESS_MODE_REPEAT);
 
-    // Generate normal+height map from diffuse pixels
+    // Generate normal+height map from diffuse pixels (skip during streaming to avoid CPU stalls)
     float nhVariance = 0.0f;
     std::unique_ptr<VkTexture> nhMap;
-    if (normalMappingEnabled_ || pomEnabled_) {
+    if ((normalMappingEnabled_ || pomEnabled_) && !deferNormalMaps_) {
         nhMap = generateNormalHeightMap(blp.data.data(), blp.width, blp.height, nhVariance);
         if (nhMap) {
             approxBytes *= 2;  // account for normal map in budget

From faca22ac5fe1bd8bac93a08d7bb74ee6eef6879d Mon Sep 17 00:00:00 2001
From: Kelsi <kelsihates2fa@gmail.com>
Date: Sat, 7 Mar 2026 16:54:58 -0800
Subject: [PATCH 8/9] Async humanoid NPC texture pipeline to eliminate 30-150ms
 main-thread stalls

Move all DBC lookups (CharSections, ItemDisplayInfo), texture path resolution,
and BLP decoding for humanoid NPCs to background threads. Only GPU texture
uploads remain on the main thread via pre-decoded BLP cache.
---
 include/core/application.hpp         |  44 ++
 src/core/application.cpp             | 916 +++++++++++++++++----------
 src/rendering/character_renderer.cpp |  70 +-
 3 files changed, 703 insertions(+), 327 deletions(-)

diff --git a/include/core/application.hpp b/include/core/application.hpp
index c97bfaf6..84b89f32 100644
--- a/include/core/application.hpp
+++ b/include/core/application.hpp
@@ -220,6 +220,7 @@ private:
     std::unordered_set<uint64_t> deadCreatureGuids_;            // GUIDs that should spawn in corpse/death pose
     std::unordered_map<uint32_t, uint32_t> displayIdModelCache_; // displayId → modelId (model caching)
     std::unordered_set<uint32_t> displayIdTexturesApplied_;    // displayIds with per-model textures applied
+    std::unordered_map<uint32_t, std::unordered_map<std::string, pipeline::BLPImage>> displayIdPredecodedTextures_; // displayId → pre-decoded skin textures
     mutable std::unordered_set<uint32_t> warnedMissingDisplayDataIds_; // displayIds already warned
     mutable std::unordered_set<uint32_t> warnedMissingModelPathIds_;   // modelIds/displayIds already warned
     uint32_t nextCreatureModelId_ = 5000;  // Model IDs for online creatures
@@ -312,6 +313,49 @@ private:
     // Deferred equipment compositing queue — processes max 1 per frame to avoid stutter
     std::vector<std::pair<uint64_t, std::pair<std::array<uint32_t, 19>, std::array<uint8_t, 19>>>> deferredEquipmentQueue_;
     void processDeferredEquipmentQueue();
+    // Async equipment texture pre-decode: BLP decode on background thread, composite on main thread
+    struct PreparedEquipmentUpdate {
+        uint64_t guid;
+        std::array<uint32_t, 19> displayInfoIds;
+        std::array<uint8_t, 19> inventoryTypes;
+        std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures;
+    };
+    struct AsyncEquipmentLoad {
+        std::future<PreparedEquipmentUpdate> future;
+    };
+    std::vector<AsyncEquipmentLoad> asyncEquipmentLoads_;
+    void processAsyncEquipmentResults();
+    std::vector<std::string> resolveEquipmentTexturePaths(uint64_t guid,
+        const std::array<uint32_t, 19>& displayInfoIds,
+        const std::array<uint8_t, 19>& inventoryTypes) const;
+    // Deferred NPC texture setup — async DBC lookups + BLP pre-decode to avoid main-thread stalls
+    struct DeferredNpcComposite {
+        uint32_t modelId;
+        uint32_t displayId;
+        // Skin compositing (type-1 slots)
+        std::string basePath;                     // CharSections skin base texture
+        std::vector<std::string> overlayPaths;    // face + underwear overlays
+        std::vector<std::pair<int, std::string>> regionLayers;  // equipment region overlays
+        std::vector<uint32_t> skinTextureSlots;   // model texture slots needing skin composite
+        bool hasComposite = false;                // needs compositing (overlays or equipment regions)
+        bool hasSimpleSkin = false;               // just base skin, no compositing needed
+        // Baked skin (type-1 slots)
+        std::string bakedSkinPath;                // baked texture path (if available)
+        bool hasBakedSkin = false;                // baked skin resolved successfully
+        // Hair (type-6 slots)
+        std::vector<uint32_t> hairTextureSlots;   // model texture slots needing hair texture
+        std::string hairTexturePath;              // resolved hair texture path
+        bool useBakedForHair = false;             // bald NPC: use baked skin for type-6
+    };
+    struct PreparedNpcComposite {
+        DeferredNpcComposite info;
+        std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures;
+    };
+    struct AsyncNpcCompositeLoad {
+        std::future<PreparedNpcComposite> future;
+    };
+    std::vector<AsyncNpcCompositeLoad> asyncNpcCompositeLoads_;
+    void processAsyncNpcCompositeResults();
     // Cache base player model geometry by (raceId, genderId)
     std::unordered_map<uint32_t, uint32_t> playerModelCache_; // key=(race<<8)|gender → modelId
     struct PlayerTextureSlots { int skin = -1; int hair = -1; int underwear = -1; };
diff --git a/src/core/application.cpp b/src/core/application.cpp
index f4712613..b003af53 100644
--- a/src/core/application.cpp
+++ b/src/core/application.cpp
@@ -913,11 +913,24 @@ void Application::update(float deltaTime) {
             inGameStep = "spawn/equipment queues";
             updateCheckpoint = "in_game: spawn/equipment queues";
             runInGameStage("spawn/equipment queues", [&] {
+                auto t0 = std::chrono::steady_clock::now();
                 processPlayerSpawnQueue();
-                // Process deferred online creature spawns (throttled)
+                auto t1 = std::chrono::steady_clock::now();
                 processCreatureSpawnQueue();
-                // Process deferred equipment compositing (max 1 per frame to avoid stutter)
+                auto t2 = std::chrono::steady_clock::now();
+                processAsyncNpcCompositeResults();
+                auto t3 = std::chrono::steady_clock::now();
                 processDeferredEquipmentQueue();
+                auto t4 = std::chrono::steady_clock::now();
+                float pMs = std::chrono::duration<float, std::milli>(t1 - t0).count();
+                float cMs = std::chrono::duration<float, std::milli>(t2 - t1).count();
+                float nMs = std::chrono::duration<float, std::milli>(t3 - t2).count();
+                float eMs = std::chrono::duration<float, std::milli>(t4 - t3).count();
+                float total = pMs + cMs + nMs + eMs;
+                if (total > 4.0f) {
+                    LOG_WARNING("spawn/equip breakdown: player=", pMs, "ms creature=", cMs,
+                                "ms npcComposite=", nMs, "ms equip=", eMs, "ms");
+                }
             });
             // Self-heal missing creature visuals: if a nearby UNIT exists in
             // entity state but has no render instance, queue a spawn retry.
@@ -4235,6 +4248,7 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
                 }
             }
             processCreatureSpawnQueue();
+            processAsyncNpcCompositeResults();
             processDeferredEquipmentQueue();
 
             // Process ALL pending game object spawns (no 1-per-frame cap during load screen).
@@ -4792,9 +4806,17 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
     auto itDisplayData = displayDataMap_.find(displayId);
     bool needsTextures = (displayIdTexturesApplied_.find(displayId) == displayIdTexturesApplied_.end());
     if (needsTextures && itDisplayData != displayDataMap_.end()) {
+        auto texStart = std::chrono::steady_clock::now();
         displayIdTexturesApplied_.insert(displayId);
         const auto& dispData = itDisplayData->second;
 
+        // Use pre-decoded textures from async creature load (if available)
+        auto itPreDec = displayIdPredecodedTextures_.find(displayId);
+        bool hasPreDec = (itPreDec != displayIdPredecodedTextures_.end());
+        if (hasPreDec) {
+            charRenderer->setPredecodedBLPCache(&itPreDec->second);
+        }
+
         // Get model directory for texture path construction
         std::string modelDir;
         size_t lastSlash = m2Path.find_last_of("\\/");
@@ -4827,336 +4849,217 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
                 LOG_DEBUG("  Found humanoid extra: raceId=", (int)extra.raceId, " sexId=", (int)extra.sexId,
                           " hairStyle=", (int)extra.hairStyleId, " hairColor=", (int)extra.hairColorId,
                           " bakeName='", extra.bakeName, "'");
-                LOG_DEBUG("NPC equip: chest=", extra.equipDisplayId[3],
-                          " legs=", extra.equipDisplayId[5],
-                          " feet=", extra.equipDisplayId[6],
-                          " hands=", extra.equipDisplayId[8],
-                          " bake='", extra.bakeName, "'");
 
-                // Build equipment texture region layers from NPC equipment display IDs
-                // (texture-only compositing — no geoset changes to avoid invisibility bugs)
-                std::vector<std::pair<int, std::string>> npcRegionLayers;
-                std::string npcCapeTexturePath;
-                auto npcItemDisplayDbc = assetManager->loadDBC("ItemDisplayInfo.dbc");
-                    if (npcItemDisplayDbc) {
-                        static const char* npcComponentDirs[] = {
-                            "ArmUpperTexture", "ArmLowerTexture", "HandTexture",
-                            "TorsoUpperTexture", "TorsoLowerTexture",
-                            "LegUpperTexture", "LegLowerTexture", "FootTexture",
-                        };
-                        const auto* idiL = pipeline::getActiveDBCLayout()
-                            ? pipeline::getActiveDBCLayout()->getLayout("ItemDisplayInfo") : nullptr;
-                        // Texture component region fields (8 regions: ArmUpper..Foot)
-                        // Binary DBC (23 fields) has textures at 14+
-                        const uint32_t texRegionFields[8] = {
-                            idiL ? (*idiL)["TextureArmUpper"]  : 14u,
-                            idiL ? (*idiL)["TextureArmLower"]  : 15u,
-                            idiL ? (*idiL)["TextureHand"]      : 16u,
-                            idiL ? (*idiL)["TextureTorsoUpper"]: 17u,
-                            idiL ? (*idiL)["TextureTorsoLower"]: 18u,
-                            idiL ? (*idiL)["TextureLegUpper"]  : 19u,
-                            idiL ? (*idiL)["TextureLegLower"]  : 20u,
-                            idiL ? (*idiL)["TextureFoot"]      : 21u,
-                        };
-                        const bool npcIsFemale = (extra.sexId == 1);
-                        const bool npcHasArmArmor = (extra.equipDisplayId[7] != 0 || extra.equipDisplayId[8] != 0);
-
-                        auto regionAllowedForNpcSlot = [](int eqSlot, int region) -> bool {
-                            // Regions: 0 ArmUpper, 1 ArmLower, 2 Hand, 3 TorsoUpper, 4 TorsoLower,
-                            //          5 LegUpper, 6 LegLower, 7 Foot
-                            switch (eqSlot) {
-                                case 2: // shirt
-                                case 3: // chest
-                                    return region <= 4;
-                                case 4: // belt
-                                    // TODO(#npc-belt-region): belt torso-lower overlay can
-                                    // cut out male abdomen on some humanoid NPCs.
-                                    // Keep disabled until region compositing is fixed.
-                                    return false;
-                                case 5: // legs
-                                    return region == 5 || region == 6;
-                                case 6: // feet
-                                    return region == 7;
-                                case 7: // wrist
-                                    // Bracer overlays on NPCs often produce bad arm artifacts.
-                                    // Keep disabled until slot-accurate arm compositing is implemented.
-                                    return false;
-                                case 8: // hands
-                                    // Keep glove textures to hand region only; arm regions from glove
-                                    // items can produce furry/looping forearm artifacts on some NPCs.
-                                    return region == 2;
-                                case 9: // tabard
-                                    return region == 3 || region == 4;
-                                default:
-                                    return false;
-                            }
-                        };
-                        auto regionAllowedForNpcSlotCtx = [&](int eqSlot, int region) -> bool {
-                            // Shirt (slot 2) without arm armor: restrict to torso only
-                            // to avoid bare-skin shirt textures bleeding onto arms.
-                            // Chest (slot 3) always paints arms — plate/mail chest armor
-                            // must cover the full upper body even without separate gloves.
-                            if (eqSlot == 2 && !npcHasArmArmor) {
-                                return (region == 3 || region == 4);
-                            }
-                            return regionAllowedForNpcSlot(eqSlot, region);
-                        };
-
-                        // Iterate all 11 NPC equipment slots; use slot-aware region filtering
-                        for (int eqSlot = 0; eqSlot < 11; eqSlot++) {
-                            uint32_t did = extra.equipDisplayId[eqSlot];
-                            if (did == 0) continue;
-                            int32_t recIdx = npcItemDisplayDbc->findRecordById(did);
-                            if (recIdx < 0) continue;
-
-                            for (int region = 0; region < 8; region++) {
-                                if (!regionAllowedForNpcSlotCtx(eqSlot, region)) continue;
-                                std::string texName = npcItemDisplayDbc->getString(
-                                    static_cast<uint32_t>(recIdx), texRegionFields[region]);
-                                if (texName.empty()) continue;
-
-                                std::string base = "Item\\TextureComponents\\" +
-                                    std::string(npcComponentDirs[region]) + "\\" + texName;
-                                std::string genderPath = base + (npcIsFemale ? "_F.blp" : "_M.blp");
-                                std::string unisexPath = base + "_U.blp";
-                                std::string basePath = base + ".blp";
-                                std::string fullPath;
-                                if (assetManager->fileExists(genderPath)) fullPath = genderPath;
-                                else if (assetManager->fileExists(unisexPath)) fullPath = unisexPath;
-                                else if (assetManager->fileExists(basePath)) fullPath = basePath;
-                                else continue;
-
-                                npcRegionLayers.emplace_back(region, fullPath);
-                            }
-                        }
-
-                        // Cloak/cape texture is separate from the body atlas.
-                        // Read equipped cape displayId (slot 10) and resolve the best cape texture path.
-                        uint32_t capeDisplayId = extra.equipDisplayId[10];
-                        if (capeDisplayId != 0) {
-                            int32_t capeRecIdx = npcItemDisplayDbc->findRecordById(capeDisplayId);
-                            if (capeRecIdx >= 0) {
-                                const uint32_t leftTexField = idiL ? (*idiL)["LeftModelTexture"] : 3u;
-                                const uint32_t rightTexField = leftTexField + 1u; // modelTexture_2 in 3.3.5a
-
-                                std::vector<std::string> capeNames;
-                                auto addName = [&](const std::string& n) {
-                                    if (!n.empty() && std::find(capeNames.begin(), capeNames.end(), n) == capeNames.end()) {
-                                        capeNames.push_back(n);
-                                    }
-                                };
-                                std::string leftName = npcItemDisplayDbc->getString(
-                                    static_cast<uint32_t>(capeRecIdx), leftTexField);
-                                std::string rightName = npcItemDisplayDbc->getString(
-                                    static_cast<uint32_t>(capeRecIdx), rightTexField);
-                                // Female models often prefer modelTexture_2.
-                                if (npcIsFemale) {
-                                    addName(rightName);
-                                    addName(leftName);
-                                } else {
-                                    addName(leftName);
-                                    addName(rightName);
-                                }
-
-                                auto hasBlpExt = [](const std::string& p) {
-                                    if (p.size() < 4) return false;
-                                    std::string ext = p.substr(p.size() - 4);
-                                    std::transform(ext.begin(), ext.end(), ext.begin(),
-                                                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
-                                    return ext == ".blp";
-                                };
-
-                                std::vector<std::string> capeCandidates;
-                                auto addCapeCandidate = [&](const std::string& p) {
-                                    if (p.empty()) return;
-                                    if (std::find(capeCandidates.begin(), capeCandidates.end(), p) == capeCandidates.end()) {
-                                        capeCandidates.push_back(p);
-                                    }
-                                };
-
-                                for (const auto& nameRaw : capeNames) {
-                                    std::string name = nameRaw;
-                                    std::replace(name.begin(), name.end(), '/', '\\');
-                                    bool hasDir = (name.find('\\') != std::string::npos);
-                                    bool hasExt = hasBlpExt(name);
-                                    if (hasDir) {
-                                        addCapeCandidate(name);
-                                        if (!hasExt) addCapeCandidate(name + ".blp");
-                                    } else {
-                                        std::string base = "Item\\ObjectComponents\\Cape\\" + name;
-                                        addCapeCandidate(base);
-                                        if (!hasExt) addCapeCandidate(base + ".blp");
-                                        // Some data sets use gender/unisex suffix variants.
-                                        addCapeCandidate(base + (npcIsFemale ? "_F.blp" : "_M.blp"));
-                                        addCapeCandidate(base + "_U.blp");
-                                    }
-                                }
-
-                                for (const auto& candidate : capeCandidates) {
-                                    if (assetManager->fileExists(candidate)) {
-                                        npcCapeTexturePath = candidate;
-                                        break;
-                                    }
-                                }
-                            }
-                        }
-                    }
-
-                // Use baked texture for body skin (types 1, 2)
-                // Type 6 (hair) needs its own texture from CharSections.dbc
-                const bool allowNpcRegionComposite = true;
-                rendering::VkTexture* bakedSkinTex = nullptr;
-                if (!extra.bakeName.empty()) {
-                    std::string bakePath = "Textures\\BakedNpcTextures\\" + extra.bakeName;
-                    rendering::VkTexture* finalTex = charRenderer->loadTexture(bakePath);
-                    bakedSkinTex = finalTex;
-                    if (finalTex && modelData) {
-                        for (size_t ti = 0; ti < modelData->textures.size(); ti++) {
-                            uint32_t texType = modelData->textures[ti].type;
-                            if (texType == 1) {
-                                charRenderer->setModelTexture(modelId, static_cast<uint32_t>(ti), finalTex);
-                                hasHumanoidTexture = true;
-                                LOG_DEBUG("NPC baked type1 slot=", ti, " modelId=", modelId,
-                                            " tex=", bakePath);
-                            }
-                        }
-                    }
-                }
-                // Fallback: if baked texture failed or bakeName was empty, build from CharSections
-                if (!hasHumanoidTexture) {
-                    LOG_DEBUG("  Trying CharSections fallback for NPC skin");
-
-                    // Build skin texture from CharSections.dbc (same as player character)
-                    auto csFallbackDbc = assetManager->loadDBC("CharSections.dbc");
-                    if (csFallbackDbc) {
-                        const auto* csFL = pipeline::getActiveDBCLayout()
-                            ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
-                        uint32_t npcRace = static_cast<uint32_t>(extra.raceId);
-                        uint32_t npcSex = static_cast<uint32_t>(extra.sexId);
-                        uint32_t npcSkin = static_cast<uint32_t>(extra.skinId);
-                        uint32_t npcFace = static_cast<uint32_t>(extra.faceId);
-                        std::string npcSkinPath, npcFaceLower, npcFaceUpper;
-                        std::vector<std::string> npcUnderwear;
-
-                        for (uint32_t r = 0; r < csFallbackDbc->getRecordCount(); r++) {
-                            uint32_t rId = csFallbackDbc->getUInt32(r, csFL ? (*csFL)["RaceID"] : 1);
-                            uint32_t sId = csFallbackDbc->getUInt32(r, csFL ? (*csFL)["SexID"] : 2);
-                            if (rId != npcRace || sId != npcSex) continue;
-
-                            uint32_t section = csFallbackDbc->getUInt32(r, csFL ? (*csFL)["BaseSection"] : 3);
-                            uint32_t variation = csFallbackDbc->getUInt32(r, csFL ? (*csFL)["VariationIndex"] : 8);
-                            uint32_t color = csFallbackDbc->getUInt32(r, csFL ? (*csFL)["ColorIndex"] : 9);
-                            uint32_t tex1F = csFL ? (*csFL)["Texture1"] : 4;
-
-                            // Section 0 = skin: match colorIndex = skinId
-                            if (section == 0 && npcSkinPath.empty() && color == npcSkin) {
-                                npcSkinPath = csFallbackDbc->getString(r, tex1F);
-                            }
-                            // Section 1 = face: match variation=faceId, color=skinId
-                            else if (section == 1 && npcFaceLower.empty() &&
-                                     variation == npcFace && color == npcSkin) {
-                                npcFaceLower = csFallbackDbc->getString(r, tex1F);
-                                npcFaceUpper = csFallbackDbc->getString(r, tex1F + 1);
-                            }
-                            // Section 4 = underwear: match color=skinId
-                            else if (section == 4 && npcUnderwear.empty() && color == npcSkin) {
-                                for (uint32_t f = tex1F; f <= tex1F + 2; f++) {
-                                    std::string tex = csFallbackDbc->getString(r, f);
-                                    if (!tex.empty()) npcUnderwear.push_back(tex);
-                                }
-                            }
-                        }
-
-                        LOG_DEBUG("NPC CharSections lookup: race=", npcRace, " sex=", npcSex,
-                                    " skin=", npcSkin, " face=", npcFace,
-                                    " skinPath='", npcSkinPath, "' faceLower='", npcFaceLower, "'");
-                        if (!npcSkinPath.empty()) {
-                            // Composite skin + face + underwear
-                            std::vector<std::string> skinLayers;
-                            skinLayers.push_back(npcSkinPath);
-                            if (!npcFaceLower.empty()) skinLayers.push_back(npcFaceLower);
-                            if (!npcFaceUpper.empty()) skinLayers.push_back(npcFaceUpper);
-                            for (const auto& uw : npcUnderwear) skinLayers.push_back(uw);
-
-                            rendering::VkTexture* npcSkinTex = nullptr;
-                            if (allowNpcRegionComposite && !npcRegionLayers.empty()) {
-                                npcSkinTex = charRenderer->compositeWithRegions(npcSkinPath,
-                                    std::vector<std::string>(skinLayers.begin() + 1, skinLayers.end()),
-                                    npcRegionLayers);
-                            } else if (skinLayers.size() > 1) {
-                                npcSkinTex = charRenderer->compositeTextures(skinLayers);
-                            } else {
-                                npcSkinTex = charRenderer->loadTexture(npcSkinPath);
-                            }
-
-                            if (npcSkinTex && modelData) {
-                                int slotsSet = 0;
-                                for (size_t ti = 0; ti < modelData->textures.size(); ti++) {
-                                    uint32_t texType = modelData->textures[ti].type;
-                                    if (texType == 1 || texType == 11 || texType == 12 || texType == 13) {
-                                        charRenderer->setModelTexture(modelId, static_cast<uint32_t>(ti), npcSkinTex);
-                                        hasHumanoidTexture = true;
-                                        slotsSet++;
-                                    }
-                                }
-                                LOG_DEBUG("NPC CharSections: skin='", npcSkinPath, "' regions=",
-                                            npcRegionLayers.size(), " applied=", hasHumanoidTexture,
-                                            " slots=", slotsSet,
-                                            " modelId=", modelId, " texCount=", modelData->textures.size());
-                            }
-                        }
+                // Collect model texture slot info (type 1 = skin, type 6 = hair)
+                std::vector<uint32_t> skinSlots, hairSlots;
+                if (modelData) {
+                    for (size_t ti = 0; ti < modelData->textures.size(); ti++) {
+                        uint32_t texType = modelData->textures[ti].type;
+                        if (texType == 1 || texType == 11 || texType == 12 || texType == 13)
+                            skinSlots.push_back(static_cast<uint32_t>(ti));
+                        if (texType == 6)
+                            hairSlots.push_back(static_cast<uint32_t>(ti));
                     }
                 }
 
-                // Load hair texture from CharSections.dbc (section 3)
-                auto charSectionsDbc = assetManager->loadDBC("CharSections.dbc");
-                if (charSectionsDbc) {
-                    const auto* csL2 = pipeline::getActiveDBCLayout() ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
-                    uint32_t targetRace = static_cast<uint32_t>(extra.raceId);
-                    uint32_t targetSex = static_cast<uint32_t>(extra.sexId);
-                    std::string hairTexPath;
+                // Copy extra data for the async task (avoid dangling reference)
+                HumanoidDisplayExtra extraCopy = extra;
 
-                    for (uint32_t r = 0; r < charSectionsDbc->getRecordCount(); r++) {
-                        uint32_t raceId = charSectionsDbc->getUInt32(r, csL2 ? (*csL2)["RaceID"] : 1);
-                        uint32_t sexId = charSectionsDbc->getUInt32(r, csL2 ? (*csL2)["SexID"] : 2);
-                        uint32_t section = charSectionsDbc->getUInt32(r, csL2 ? (*csL2)["BaseSection"] : 3);
-                        uint32_t variation = charSectionsDbc->getUInt32(r, csL2 ? (*csL2)["VariationIndex"] : 4);
-                        uint32_t colorIdx = charSectionsDbc->getUInt32(r, csL2 ? (*csL2)["ColorIndex"] : 5);
+                // Launch async task: ALL DBC lookups, path resolution, and BLP pre-decode
+                // happen on a background thread. Only GPU texture upload runs on main thread
+                // (in processAsyncNpcCompositeResults).
+                auto* am = assetManager.get();
+                AsyncNpcCompositeLoad load;
+                load.future = std::async(std::launch::async,
+                    [am, extraCopy, skinSlots = std::move(skinSlots),
+                     hairSlots = std::move(hairSlots), modelId, displayId]() mutable -> PreparedNpcComposite {
+                        PreparedNpcComposite result;
+                        DeferredNpcComposite& def = result.info;
+                        def.modelId = modelId;
+                        def.displayId = displayId;
+                        def.skinTextureSlots = std::move(skinSlots);
+                        def.hairTextureSlots = std::move(hairSlots);
 
-                        if (raceId != targetRace || sexId != targetSex) continue;
-                        if (section != 3) continue;  // Section 3 = hair
-                        if (variation != static_cast<uint32_t>(extra.hairStyleId)) continue;
-                        if (colorIdx != static_cast<uint32_t>(extra.hairColorId)) continue;
+                        std::vector<std::string> allPaths;  // paths to pre-decode
 
-                        hairTexPath = charSectionsDbc->getString(r, csL2 ? (*csL2)["Texture1"] : 6);
-                        break;
-                    }
+                        // --- Baked skin texture ---
+                        if (!extraCopy.bakeName.empty()) {
+                            def.bakedSkinPath = "Textures\\BakedNpcTextures\\" + extraCopy.bakeName;
+                            def.hasBakedSkin = true;
+                            allPaths.push_back(def.bakedSkinPath);
+                        }
 
-                    if (!hairTexPath.empty()) {
-                        rendering::VkTexture* hairTex = charRenderer->loadTexture(hairTexPath);
-                        rendering::VkTexture* whTex = charRenderer->loadTexture("");
-                        if (hairTex && hairTex != whTex && modelData) {
-                            for (size_t ti = 0; ti < modelData->textures.size(); ti++) {
-                                if (modelData->textures[ti].type == 6) {
-                                    charRenderer->setModelTexture(modelId, static_cast<uint32_t>(ti), hairTex);
+                        // --- CharSections fallback (skin/face/underwear) ---
+                        if (!def.hasBakedSkin) {
+                            auto csDbc = am->loadDBC("CharSections.dbc");
+                            if (csDbc) {
+                                const auto* csL = pipeline::getActiveDBCLayout()
+                                    ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
+                                uint32_t npcRace = static_cast<uint32_t>(extraCopy.raceId);
+                                uint32_t npcSex = static_cast<uint32_t>(extraCopy.sexId);
+                                uint32_t npcSkin = static_cast<uint32_t>(extraCopy.skinId);
+                                uint32_t npcFace = static_cast<uint32_t>(extraCopy.faceId);
+                                std::string npcFaceLower, npcFaceUpper;
+                                std::vector<std::string> npcUnderwear;
+
+                                for (uint32_t r = 0; r < csDbc->getRecordCount(); r++) {
+                                    uint32_t rId = csDbc->getUInt32(r, csL ? (*csL)["RaceID"] : 1);
+                                    uint32_t sId = csDbc->getUInt32(r, csL ? (*csL)["SexID"] : 2);
+                                    if (rId != npcRace || sId != npcSex) continue;
+
+                                    uint32_t section = csDbc->getUInt32(r, csL ? (*csL)["BaseSection"] : 3);
+                                    uint32_t variation = csDbc->getUInt32(r, csL ? (*csL)["VariationIndex"] : 4);
+                                    uint32_t color = csDbc->getUInt32(r, csL ? (*csL)["ColorIndex"] : 5);
+                                    uint32_t tex1F = csL ? (*csL)["Texture1"] : 6;
+
+                                    if (section == 0 && def.basePath.empty() && color == npcSkin) {
+                                        def.basePath = csDbc->getString(r, tex1F);
+                                    } else if (section == 1 && npcFaceLower.empty() &&
+                                               variation == npcFace && color == npcSkin) {
+                                        npcFaceLower = csDbc->getString(r, tex1F);
+                                        npcFaceUpper = csDbc->getString(r, tex1F + 1);
+                                    } else if (section == 4 && npcUnderwear.empty() && color == npcSkin) {
+                                        for (uint32_t f = tex1F; f <= tex1F + 2; f++) {
+                                            std::string tex = csDbc->getString(r, f);
+                                            if (!tex.empty()) npcUnderwear.push_back(tex);
+                                        }
+                                    }
+                                }
+
+                                if (!def.basePath.empty()) {
+                                    allPaths.push_back(def.basePath);
+                                    if (!npcFaceLower.empty()) { def.overlayPaths.push_back(npcFaceLower); allPaths.push_back(npcFaceLower); }
+                                    if (!npcFaceUpper.empty()) { def.overlayPaths.push_back(npcFaceUpper); allPaths.push_back(npcFaceUpper); }
+                                    for (const auto& uw : npcUnderwear) { def.overlayPaths.push_back(uw); allPaths.push_back(uw); }
                                 }
                             }
                         }
-                    }
-                    // Bald NPCs (hairStyle=0 or no CharSections match): set type-6 to
-                    // the skin/baked texture so the scalp cap renders with skin color.
-                    if (hairTexPath.empty() && bakedSkinTex && modelData) {
-                        for (size_t ti = 0; ti < modelData->textures.size(); ti++) {
-                            if (modelData->textures[ti].type == 6) {
-                                charRenderer->setModelTexture(modelId, static_cast<uint32_t>(ti), bakedSkinTex);
+
+                        // --- Equipment region layers (ItemDisplayInfo DBC) ---
+                        auto idiDbc = am->loadDBC("ItemDisplayInfo.dbc");
+                        if (idiDbc) {
+                            static const char* componentDirs[] = {
+                                "ArmUpperTexture", "ArmLowerTexture", "HandTexture",
+                                "TorsoUpperTexture", "TorsoLowerTexture",
+                                "LegUpperTexture", "LegLowerTexture", "FootTexture",
+                            };
+                            const auto* idiL = pipeline::getActiveDBCLayout()
+                                ? pipeline::getActiveDBCLayout()->getLayout("ItemDisplayInfo") : nullptr;
+                            const uint32_t texRegionFields[8] = {
+                                idiL ? (*idiL)["TextureArmUpper"]  : 14u,
+                                idiL ? (*idiL)["TextureArmLower"]  : 15u,
+                                idiL ? (*idiL)["TextureHand"]      : 16u,
+                                idiL ? (*idiL)["TextureTorsoUpper"]: 17u,
+                                idiL ? (*idiL)["TextureTorsoLower"]: 18u,
+                                idiL ? (*idiL)["TextureLegUpper"]  : 19u,
+                                idiL ? (*idiL)["TextureLegLower"]  : 20u,
+                                idiL ? (*idiL)["TextureFoot"]      : 21u,
+                            };
+                            const bool npcIsFemale = (extraCopy.sexId == 1);
+                            const bool npcHasArmArmor = (extraCopy.equipDisplayId[7] != 0 || extraCopy.equipDisplayId[8] != 0);
+
+                            auto regionAllowedForNpcSlot = [](int eqSlot, int region) -> bool {
+                                switch (eqSlot) {
+                                    case 2: case 3: return region <= 4;
+                                    case 4: return false;
+                                    case 5: return region == 5 || region == 6;
+                                    case 6: return region == 7;
+                                    case 7: return false;
+                                    case 8: return region == 2;
+                                    case 9: return region == 3 || region == 4;
+                                    default: return false;
+                                }
+                            };
+
+                            for (int eqSlot = 0; eqSlot < 11; eqSlot++) {
+                                uint32_t did = extraCopy.equipDisplayId[eqSlot];
+                                if (did == 0) continue;
+                                int32_t recIdx = idiDbc->findRecordById(did);
+                                if (recIdx < 0) continue;
+
+                                for (int region = 0; region < 8; region++) {
+                                    if (!regionAllowedForNpcSlot(eqSlot, region)) continue;
+                                    if (eqSlot == 2 && !npcHasArmArmor && !(region == 3 || region == 4)) continue;
+                                    std::string texName = idiDbc->getString(
+                                        static_cast<uint32_t>(recIdx), texRegionFields[region]);
+                                    if (texName.empty()) continue;
+
+                                    std::string base = "Item\\TextureComponents\\" +
+                                        std::string(componentDirs[region]) + "\\" + texName;
+                                    std::string genderPath = base + (npcIsFemale ? "_F.blp" : "_M.blp");
+                                    std::string unisexPath = base + "_U.blp";
+                                    std::string basePath = base + ".blp";
+                                    std::string fullPath;
+                                    if (am->fileExists(genderPath)) fullPath = genderPath;
+                                    else if (am->fileExists(unisexPath)) fullPath = unisexPath;
+                                    else if (am->fileExists(basePath)) fullPath = basePath;
+                                    else continue;
+
+                                    def.regionLayers.emplace_back(region, fullPath);
+                                    allPaths.push_back(fullPath);
+                                }
                             }
                         }
-                    }
-                }
 
-                // Do not apply cape textures at model scope here. Type-2 texture slots are
-                // shared per model and this can leak cape textures/white fallbacks onto
-                // unrelated humanoid NPCs that use the same modelId.
+                        // Determine compositing mode
+                        if (!def.basePath.empty()) {
+                            bool needsComposite = !def.overlayPaths.empty() || !def.regionLayers.empty();
+                            if (needsComposite && !def.skinTextureSlots.empty()) {
+                                def.hasComposite = true;
+                            } else if (!def.skinTextureSlots.empty()) {
+                                def.hasSimpleSkin = true;
+                            }
+                        }
+
+                        // --- Hair texture from CharSections (section 3) ---
+                        {
+                            auto csDbc = am->loadDBC("CharSections.dbc");
+                            if (csDbc) {
+                                const auto* csL = pipeline::getActiveDBCLayout()
+                                    ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
+                                uint32_t targetRace = static_cast<uint32_t>(extraCopy.raceId);
+                                uint32_t targetSex = static_cast<uint32_t>(extraCopy.sexId);
+
+                                for (uint32_t r = 0; r < csDbc->getRecordCount(); r++) {
+                                    uint32_t raceId = csDbc->getUInt32(r, csL ? (*csL)["RaceID"] : 1);
+                                    uint32_t sexId = csDbc->getUInt32(r, csL ? (*csL)["SexID"] : 2);
+                                    if (raceId != targetRace || sexId != targetSex) continue;
+                                    uint32_t section = csDbc->getUInt32(r, csL ? (*csL)["BaseSection"] : 3);
+                                    if (section != 3) continue;
+                                    uint32_t variation = csDbc->getUInt32(r, csL ? (*csL)["VariationIndex"] : 4);
+                                    uint32_t colorIdx = csDbc->getUInt32(r, csL ? (*csL)["ColorIndex"] : 5);
+                                    if (variation != static_cast<uint32_t>(extraCopy.hairStyleId)) continue;
+                                    if (colorIdx != static_cast<uint32_t>(extraCopy.hairColorId)) continue;
+                                    def.hairTexturePath = csDbc->getString(r, csL ? (*csL)["Texture1"] : 6);
+                                    break;
+                                }
+
+                                if (!def.hairTexturePath.empty()) {
+                                    allPaths.push_back(def.hairTexturePath);
+                                } else if (def.hasBakedSkin && !def.hairTextureSlots.empty()) {
+                                    def.useBakedForHair = true;
+                                    // bakedSkinPath already in allPaths
+                                }
+                            }
+                        }
+
+                        // --- Pre-decode all BLP textures on this background thread ---
+                        for (const auto& path : allPaths) {
+                            std::string key = path;
+                            std::replace(key.begin(), key.end(), '/', '\\');
+                            std::transform(key.begin(), key.end(), key.begin(),
+                                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                            if (result.predecodedTextures.count(key)) continue;
+                            auto blp = am->loadTexture(key);
+                            if (blp.isValid()) {
+                                result.predecodedTextures[key] = std::move(blp);
+                            }
+                        }
+
+                        return result;
+                    });
+                asyncNpcCompositeLoads_.push_back(std::move(load));
+                hasHumanoidTexture = true;  // skip non-humanoid skin block
             } else {
                 LOG_WARNING("  extraDisplayId ", dispData.extraDisplayId, " not found in humanoidExtraMap");
             }
@@ -5235,6 +5138,18 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
                 }
             }
         }
+
+        // Clear pre-decoded cache after applying all display textures
+        charRenderer->setPredecodedBLPCache(nullptr);
+        displayIdPredecodedTextures_.erase(displayId);
+        {
+            auto texEnd = std::chrono::steady_clock::now();
+            float texMs = std::chrono::duration<float, std::milli>(texEnd - texStart).count();
+            if (texMs > 3.0f) {
+                LOG_WARNING("spawnCreature texture setup took ", texMs, "ms displayId=", displayId,
+                            " hasPreDec=", hasPreDec, " extra=", dispData.extraDisplayId);
+            }
+        }
     }
 
     // Use the entity's latest server-authoritative position rather than the stale spawn
@@ -6926,6 +6841,7 @@ void Application::processAsyncCreatureResults() {
 
         // Upload model to GPU (must happen on main thread)
         // Use pre-decoded BLP cache to skip main-thread texture decode
+        auto uploadStart = std::chrono::steady_clock::now();
         charRenderer->setPredecodedBLPCache(&result.predecodedTextures);
         if (!charRenderer->loadModel(*result.model, result.modelId)) {
             charRenderer->setPredecodedBLPCache(nullptr);
@@ -6936,6 +6852,18 @@ void Application::processAsyncCreatureResults() {
             continue;
         }
         charRenderer->setPredecodedBLPCache(nullptr);
+        {
+            auto uploadEnd = std::chrono::steady_clock::now();
+            float uploadMs = std::chrono::duration<float, std::milli>(uploadEnd - uploadStart).count();
+            if (uploadMs > 3.0f) {
+                LOG_WARNING("charRenderer->loadModel took ", uploadMs, "ms displayId=", result.displayId,
+                            " preDecoded=", result.predecodedTextures.size());
+            }
+        }
+        // Save remaining pre-decoded textures (display skins) for spawnOnlineCreature
+        if (!result.predecodedTextures.empty()) {
+            displayIdPredecodedTextures_[result.displayId] = std::move(result.predecodedTextures);
+        }
         displayIdModelCache_[result.displayId] = result.modelId;
         modelUploads++;
 
@@ -6959,6 +6887,77 @@ void Application::processAsyncCreatureResults() {
     }
 }
 
+void Application::processAsyncNpcCompositeResults() {
+    auto* charRenderer = renderer ? renderer->getCharacterRenderer() : nullptr;
+    if (!charRenderer) return;
+
+    for (auto it = asyncNpcCompositeLoads_.begin(); it != asyncNpcCompositeLoads_.end(); ) {
+        if (!it->future.valid() ||
+            it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
+            ++it;
+            continue;
+        }
+        auto result = it->future.get();
+        it = asyncNpcCompositeLoads_.erase(it);
+
+        const auto& info = result.info;
+
+        // Set pre-decoded cache so texture loads skip synchronous BLP decode
+        charRenderer->setPredecodedBLPCache(&result.predecodedTextures);
+
+        // --- Apply skin to type-1 slots ---
+        rendering::VkTexture* skinTex = nullptr;
+
+        if (info.hasBakedSkin) {
+            // Baked skin: load from pre-decoded cache
+            skinTex = charRenderer->loadTexture(info.bakedSkinPath);
+        }
+
+        if (info.hasComposite) {
+            // Composite with face/underwear/equipment regions on top of base skin
+            rendering::VkTexture* compositeTex = nullptr;
+            if (!info.regionLayers.empty()) {
+                compositeTex = charRenderer->compositeWithRegions(info.basePath,
+                    info.overlayPaths, info.regionLayers);
+            } else if (!info.overlayPaths.empty()) {
+                std::vector<std::string> skinLayers;
+                skinLayers.push_back(info.basePath);
+                for (const auto& op : info.overlayPaths) skinLayers.push_back(op);
+                compositeTex = charRenderer->compositeTextures(skinLayers);
+            }
+            if (compositeTex) skinTex = compositeTex;
+        } else if (info.hasSimpleSkin) {
+            // Simple skin: just base texture, no compositing
+            auto* baseTex = charRenderer->loadTexture(info.basePath);
+            if (baseTex) skinTex = baseTex;
+        }
+
+        if (skinTex) {
+            for (uint32_t slot : info.skinTextureSlots) {
+                charRenderer->setModelTexture(info.modelId, slot, skinTex);
+            }
+        }
+
+        // --- Apply hair texture to type-6 slots ---
+        if (!info.hairTexturePath.empty()) {
+            rendering::VkTexture* hairTex = charRenderer->loadTexture(info.hairTexturePath);
+            rendering::VkTexture* whTex = charRenderer->loadTexture("");
+            if (hairTex && hairTex != whTex) {
+                for (uint32_t slot : info.hairTextureSlots) {
+                    charRenderer->setModelTexture(info.modelId, slot, hairTex);
+                }
+            }
+        } else if (info.useBakedForHair && skinTex) {
+            // Bald NPC: use skin/baked texture for scalp cap
+            for (uint32_t slot : info.hairTextureSlots) {
+                charRenderer->setModelTexture(info.modelId, slot, skinTex);
+            }
+        }
+
+        charRenderer->setPredecodedBLPCache(nullptr);
+    }
+}
+
 void Application::processCreatureSpawnQueue() {
     auto startTime = std::chrono::steady_clock::now();
     // Budget: max 2ms per frame for creature spawning to prevent stutter.
@@ -6966,6 +6965,13 @@ void Application::processCreatureSpawnQueue() {
 
     // First, finalize any async model loads that completed on background threads.
     processAsyncCreatureResults();
+    {
+        auto now = std::chrono::steady_clock::now();
+        float asyncMs = std::chrono::duration<float, std::milli>(now - startTime).count();
+        if (asyncMs > 3.0f) {
+            LOG_WARNING("processAsyncCreatureResults took ", asyncMs, "ms");
+        }
+    }
 
     if (pendingCreatureSpawns_.empty()) return;
     if (!creatureLookupsBuilt_) {
@@ -7039,9 +7045,136 @@ void Application::processCreatureSpawnQueue() {
             // Launch async M2 load — file I/O and parsing happen off the main thread.
             uint32_t modelId = nextCreatureModelId_++;
             auto* am = assetManager.get();
+
+            // Collect display skin texture paths for background pre-decode
+            std::vector<std::string> displaySkinPaths;
+            {
+                auto itDD = displayDataMap_.find(s.displayId);
+                if (itDD != displayDataMap_.end()) {
+                    std::string modelDir;
+                    size_t lastSlash = m2Path.find_last_of("\\/");
+                    if (lastSlash != std::string::npos) modelDir = m2Path.substr(0, lastSlash + 1);
+
+                    auto resolveForAsync = [&](const std::string& skinField) {
+                        if (skinField.empty()) return;
+                        std::string raw = skinField;
+                        std::replace(raw.begin(), raw.end(), '/', '\\');
+                        while (!raw.empty() && std::isspace(static_cast<unsigned char>(raw.front()))) raw.erase(raw.begin());
+                        while (!raw.empty() && std::isspace(static_cast<unsigned char>(raw.back()))) raw.pop_back();
+                        if (raw.empty()) return;
+                        bool hasExt = raw.size() >= 4 && raw.substr(raw.size()-4) == ".blp";
+                        bool hasDir = raw.find('\\') != std::string::npos;
+                        std::vector<std::string> candidates;
+                        if (hasDir) {
+                            candidates.push_back(raw);
+                            if (!hasExt) candidates.push_back(raw + ".blp");
+                        } else {
+                            candidates.push_back(modelDir + raw);
+                            if (!hasExt) candidates.push_back(modelDir + raw + ".blp");
+                            candidates.push_back(raw);
+                            if (!hasExt) candidates.push_back(raw + ".blp");
+                        }
+                        for (const auto& c : candidates) {
+                            if (am->fileExists(c)) { displaySkinPaths.push_back(c); return; }
+                        }
+                    };
+                    resolveForAsync(itDD->second.skin1);
+                    resolveForAsync(itDD->second.skin2);
+                    resolveForAsync(itDD->second.skin3);
+
+                    // Pre-decode humanoid NPC textures (bake, skin, face, underwear, hair, equipment)
+                    if (itDD->second.extraDisplayId != 0) {
+                        auto itHE = humanoidExtraMap_.find(itDD->second.extraDisplayId);
+                        if (itHE != humanoidExtraMap_.end()) {
+                            const auto& he = itHE->second;
+                            // Baked texture
+                            if (!he.bakeName.empty()) {
+                                displaySkinPaths.push_back("Textures\\BakedNpcTextures\\" + he.bakeName);
+                            }
+                            // CharSections: skin, face, underwear
+                            auto csDbc = am->loadDBC("CharSections.dbc");
+                            if (csDbc) {
+                                const auto* csL = pipeline::getActiveDBCLayout()
+                                    ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
+                                uint32_t nRace = static_cast<uint32_t>(he.raceId);
+                                uint32_t nSex = static_cast<uint32_t>(he.sexId);
+                                uint32_t nSkin = static_cast<uint32_t>(he.skinId);
+                                uint32_t nFace = static_cast<uint32_t>(he.faceId);
+                                for (uint32_t r = 0; r < csDbc->getRecordCount(); r++) {
+                                    uint32_t rId = csDbc->getUInt32(r, csL ? (*csL)["RaceID"] : 1);
+                                    uint32_t sId = csDbc->getUInt32(r, csL ? (*csL)["SexID"] : 2);
+                                    if (rId != nRace || sId != nSex) continue;
+                                    uint32_t section = csDbc->getUInt32(r, csL ? (*csL)["BaseSection"] : 3);
+                                    uint32_t variation = csDbc->getUInt32(r, csL ? (*csL)["VariationIndex"] : 4);
+                                    uint32_t color = csDbc->getUInt32(r, csL ? (*csL)["ColorIndex"] : 5);
+                                    uint32_t tex1F = csL ? (*csL)["Texture1"] : 6;
+                                    if (section == 0 && color == nSkin) {
+                                        std::string t = csDbc->getString(r, tex1F);
+                                        if (!t.empty()) displaySkinPaths.push_back(t);
+                                    } else if (section == 1 && variation == nFace && color == nSkin) {
+                                        std::string t1 = csDbc->getString(r, tex1F);
+                                        std::string t2 = csDbc->getString(r, tex1F + 1);
+                                        if (!t1.empty()) displaySkinPaths.push_back(t1);
+                                        if (!t2.empty()) displaySkinPaths.push_back(t2);
+                                    } else if (section == 3 && variation == static_cast<uint32_t>(he.hairStyleId)
+                                               && color == static_cast<uint32_t>(he.hairColorId)) {
+                                        std::string t = csDbc->getString(r, tex1F);
+                                        if (!t.empty()) displaySkinPaths.push_back(t);
+                                    } else if (section == 4 && color == nSkin) {
+                                        for (uint32_t f = tex1F; f <= tex1F + 2; f++) {
+                                            std::string t = csDbc->getString(r, f);
+                                            if (!t.empty()) displaySkinPaths.push_back(t);
+                                        }
+                                    }
+                                }
+                            }
+                            // Equipment region textures
+                            auto idiDbc = am->loadDBC("ItemDisplayInfo.dbc");
+                            if (idiDbc) {
+                                static const char* compDirs[] = {
+                                    "ArmUpperTexture", "ArmLowerTexture", "HandTexture",
+                                    "TorsoUpperTexture", "TorsoLowerTexture",
+                                    "LegUpperTexture", "LegLowerTexture", "FootTexture",
+                                };
+                                const auto* idiL = pipeline::getActiveDBCLayout()
+                                    ? pipeline::getActiveDBCLayout()->getLayout("ItemDisplayInfo") : nullptr;
+                                const uint32_t trf[8] = {
+                                    idiL ? (*idiL)["TextureArmUpper"]  : 14u,
+                                    idiL ? (*idiL)["TextureArmLower"]  : 15u,
+                                    idiL ? (*idiL)["TextureHand"]      : 16u,
+                                    idiL ? (*idiL)["TextureTorsoUpper"]: 17u,
+                                    idiL ? (*idiL)["TextureTorsoLower"]: 18u,
+                                    idiL ? (*idiL)["TextureLegUpper"]  : 19u,
+                                    idiL ? (*idiL)["TextureLegLower"]  : 20u,
+                                    idiL ? (*idiL)["TextureFoot"]      : 21u,
+                                };
+                                const bool isFem = (he.sexId == 1);
+                                for (int eq = 0; eq < 11; eq++) {
+                                    uint32_t did = he.equipDisplayId[eq];
+                                    if (did == 0) continue;
+                                    int32_t recIdx = idiDbc->findRecordById(did);
+                                    if (recIdx < 0) continue;
+                                    for (int region = 0; region < 8; region++) {
+                                        std::string texName = idiDbc->getString(static_cast<uint32_t>(recIdx), trf[region]);
+                                        if (texName.empty()) continue;
+                                        std::string base = "Item\\TextureComponents\\" +
+                                            std::string(compDirs[region]) + "\\" + texName;
+                                        std::string gp = base + (isFem ? "_F.blp" : "_M.blp");
+                                        std::string up = base + "_U.blp";
+                                        if (am->fileExists(gp)) displaySkinPaths.push_back(gp);
+                                        else if (am->fileExists(up)) displaySkinPaths.push_back(up);
+                                        else displaySkinPaths.push_back(base + ".blp");
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
             AsyncCreatureLoad load;
             load.future = std::async(std::launch::async,
-                [am, m2Path, modelId, s]() -> PreparedCreatureModel {
+                [am, m2Path, modelId, s, skinPaths = std::move(displaySkinPaths)]() -> PreparedCreatureModel {
                     PreparedCreatureModel result;
                     result.guid = s.guid;
                     result.displayId = s.displayId;
@@ -7100,6 +7233,19 @@ void Application::processCreatureSpawnQueue() {
                         }
                     }
 
+                    // Pre-decode display skin textures (skin1/skin2/skin3 from CreatureDisplayInfo)
+                    for (const auto& sp : skinPaths) {
+                        std::string key = sp;
+                        std::replace(key.begin(), key.end(), '/', '\\');
+                        std::transform(key.begin(), key.end(), key.begin(),
+                                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                        if (result.predecodedTextures.count(key)) continue;
+                        auto blp = am->loadTexture(key);
+                        if (blp.isValid()) {
+                            result.predecodedTextures[key] = std::move(blp);
+                        }
+                    }
+
                     result.model = std::move(model);
                     result.valid = true;
                     return result;
@@ -7113,7 +7259,15 @@ void Application::processCreatureSpawnQueue() {
         }
 
         // Cached model — spawn is fast (no file I/O, just instance creation + texture setup)
-        spawnOnlineCreature(s.guid, s.displayId, s.x, s.y, s.z, s.orientation);
+        {
+            auto spawnStart = std::chrono::steady_clock::now();
+            spawnOnlineCreature(s.guid, s.displayId, s.x, s.y, s.z, s.orientation);
+            auto spawnEnd = std::chrono::steady_clock::now();
+            float spawnMs = std::chrono::duration<float, std::milli>(spawnEnd - spawnStart).count();
+            if (spawnMs > 3.0f) {
+                LOG_WARNING("spawnOnlineCreature took ", spawnMs, "ms displayId=", s.displayId);
+            }
+        }
         pendingCreatureSpawnGuids_.erase(s.guid);
 
         // If spawn still failed, retry for a limited number of frames.
@@ -7172,12 +7326,130 @@ void Application::processPlayerSpawnQueue() {
     }
 }
 
+std::vector<std::string> Application::resolveEquipmentTexturePaths(uint64_t guid,
+    const std::array<uint32_t, 19>& displayInfoIds,
+    const std::array<uint8_t, 19>& /*inventoryTypes*/) const {
+    std::vector<std::string> paths;
+
+    auto it = onlinePlayerAppearance_.find(guid);
+    if (it == onlinePlayerAppearance_.end()) return paths;
+    const OnlinePlayerAppearanceState& st = it->second;
+
+    // Add base skin + underwear paths
+    if (!st.bodySkinPath.empty()) paths.push_back(st.bodySkinPath);
+    for (const auto& up : st.underwearPaths) {
+        if (!up.empty()) paths.push_back(up);
+    }
+
+    // Resolve equipment region texture paths (same logic as setOnlinePlayerEquipment)
+    auto displayInfoDbc = assetManager->loadDBC("ItemDisplayInfo.dbc");
+    if (!displayInfoDbc) return paths;
+    const auto* idiL = pipeline::getActiveDBCLayout()
+        ? pipeline::getActiveDBCLayout()->getLayout("ItemDisplayInfo") : nullptr;
+
+    static const char* componentDirs[] = {
+        "ArmUpperTexture", "ArmLowerTexture", "HandTexture",
+        "TorsoUpperTexture", "TorsoLowerTexture",
+        "LegUpperTexture", "LegLowerTexture", "FootTexture",
+    };
+    const uint32_t texRegionFields[8] = {
+        idiL ? (*idiL)["TextureArmUpper"]  : 14u,
+        idiL ? (*idiL)["TextureArmLower"]  : 15u,
+        idiL ? (*idiL)["TextureHand"]      : 16u,
+        idiL ? (*idiL)["TextureTorsoUpper"]: 17u,
+        idiL ? (*idiL)["TextureTorsoLower"]: 18u,
+        idiL ? (*idiL)["TextureLegUpper"]  : 19u,
+        idiL ? (*idiL)["TextureLegLower"]  : 20u,
+        idiL ? (*idiL)["TextureFoot"]      : 21u,
+    };
+    const bool isFemale = (st.genderId == 1);
+
+    for (int s = 0; s < 19; s++) {
+        uint32_t did = displayInfoIds[s];
+        if (did == 0) continue;
+        int32_t recIdx = displayInfoDbc->findRecordById(did);
+        if (recIdx < 0) continue;
+        for (int region = 0; region < 8; region++) {
+            std::string texName = displayInfoDbc->getString(
+                static_cast<uint32_t>(recIdx), texRegionFields[region]);
+            if (texName.empty()) continue;
+            std::string base = "Item\\TextureComponents\\" +
+                std::string(componentDirs[region]) + "\\" + texName;
+            std::string genderPath = base + (isFemale ? "_F.blp" : "_M.blp");
+            std::string unisexPath = base + "_U.blp";
+            if (assetManager->fileExists(genderPath)) paths.push_back(genderPath);
+            else if (assetManager->fileExists(unisexPath)) paths.push_back(unisexPath);
+            else paths.push_back(base + ".blp");
+        }
+    }
+    return paths;
+}
+
+void Application::processAsyncEquipmentResults() {
+    for (auto it = asyncEquipmentLoads_.begin(); it != asyncEquipmentLoads_.end(); ) {
+        if (!it->future.valid() ||
+            it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
+            ++it;
+            continue;
+        }
+        auto result = it->future.get();
+        it = asyncEquipmentLoads_.erase(it);
+
+        auto* charRenderer = renderer ? renderer->getCharacterRenderer() : nullptr;
+        if (!charRenderer) continue;
+
+        // Set pre-decoded cache so compositeWithRegions skips synchronous BLP decode
+        charRenderer->setPredecodedBLPCache(&result.predecodedTextures);
+        setOnlinePlayerEquipment(result.guid, result.displayInfoIds, result.inventoryTypes);
+        charRenderer->setPredecodedBLPCache(nullptr);
+    }
+}
+
 void Application::processDeferredEquipmentQueue() {
+    // First, finalize any completed async pre-decodes
+    processAsyncEquipmentResults();
+
     if (deferredEquipmentQueue_.empty()) return;
-    // Process at most 1 per frame — compositeWithRegions is expensive
+    // Limit in-flight async equipment loads
+    if (asyncEquipmentLoads_.size() >= 2) return;
+
     auto [guid, equipData] = deferredEquipmentQueue_.front();
     deferredEquipmentQueue_.erase(deferredEquipmentQueue_.begin());
-    setOnlinePlayerEquipment(guid, equipData.first, equipData.second);
+
+    // Resolve all texture paths that compositeWithRegions will need
+    auto texturePaths = resolveEquipmentTexturePaths(guid, equipData.first, equipData.second);
+
+    if (texturePaths.empty()) {
+        // No textures to pre-decode — just apply directly (fast path)
+        setOnlinePlayerEquipment(guid, equipData.first, equipData.second);
+        return;
+    }
+
+    // Launch background BLP pre-decode
+    auto* am = assetManager.get();
+    auto displayInfoIds = equipData.first;
+    auto inventoryTypes = equipData.second;
+    AsyncEquipmentLoad load;
+    load.future = std::async(std::launch::async,
+        [am, guid, displayInfoIds, inventoryTypes, paths = std::move(texturePaths)]() -> PreparedEquipmentUpdate {
+            PreparedEquipmentUpdate result;
+            result.guid = guid;
+            result.displayInfoIds = displayInfoIds;
+            result.inventoryTypes = inventoryTypes;
+            for (const auto& path : paths) {
+                std::string key = path;
+                std::replace(key.begin(), key.end(), '/', '\\');
+                std::transform(key.begin(), key.end(), key.begin(),
+                               [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                if (result.predecodedTextures.count(key)) continue;
+                auto blp = am->loadTexture(key);
+                if (blp.isValid()) {
+                    result.predecodedTextures[key] = std::move(blp);
+                }
+            }
+            return result;
+        });
+    asyncEquipmentLoads_.push_back(std::move(load));
 }
 
 void Application::processAsyncGameObjectResults() {
diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp
index 040a301d..2031a7b4 100644
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@@ -836,7 +836,19 @@ VkTexture* CharacterRenderer::compositeTextures(const std::vector<std::string>&
     }
 
     // Load base layer
-    auto base = assetManager->loadTexture(layerPaths[0]);
+    pipeline::BLPImage base;
+    if (predecodedBLPCache_) {
+        std::string key = layerPaths[0];
+        std::replace(key.begin(), key.end(), '/', '\\');
+        std::transform(key.begin(), key.end(), key.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        auto pit = predecodedBLPCache_->find(key);
+        if (pit != predecodedBLPCache_->end()) {
+            base = std::move(pit->second);
+            predecodedBLPCache_->erase(pit);
+        }
+    }
+    if (!base.isValid()) base = assetManager->loadTexture(layerPaths[0]);
     if (!base.isValid()) {
         core::Logger::getInstance().warning("Composite: failed to load base layer: ", layerPaths[0]);
         return whiteTexture_.get();
@@ -877,7 +889,19 @@ VkTexture* CharacterRenderer::compositeTextures(const std::vector<std::string>&
     for (size_t layer = 1; layer < layerPaths.size(); layer++) {
         if (layerPaths[layer].empty()) continue;
 
-        auto overlay = assetManager->loadTexture(layerPaths[layer]);
+        pipeline::BLPImage overlay;
+        if (predecodedBLPCache_) {
+            std::string key = layerPaths[layer];
+            std::replace(key.begin(), key.end(), '/', '\\');
+            std::transform(key.begin(), key.end(), key.begin(),
+                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+            auto pit = predecodedBLPCache_->find(key);
+            if (pit != predecodedBLPCache_->end()) {
+                overlay = std::move(pit->second);
+                predecodedBLPCache_->erase(pit);
+            }
+        }
+        if (!overlay.isValid()) overlay = assetManager->loadTexture(layerPaths[layer]);
         if (!overlay.isValid()) {
             core::Logger::getInstance().warning("Composite: FAILED to load overlay: ", layerPaths[layer]);
             continue;
@@ -1054,7 +1078,19 @@ VkTexture* CharacterRenderer::compositeWithRegions(const std::string& basePath,
         return whiteTexture_.get();
     }
 
-    auto base = assetManager->loadTexture(basePath);
+    pipeline::BLPImage base;
+    if (predecodedBLPCache_) {
+        std::string key = basePath;
+        std::replace(key.begin(), key.end(), '/', '\\');
+        std::transform(key.begin(), key.end(), key.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        auto pit = predecodedBLPCache_->find(key);
+        if (pit != predecodedBLPCache_->end()) {
+            base = std::move(pit->second);
+            predecodedBLPCache_->erase(pit);
+        }
+    }
+    if (!base.isValid()) base = assetManager->loadTexture(basePath);
     if (!base.isValid()) {
         return whiteTexture_.get();
     }
@@ -1093,7 +1129,19 @@ VkTexture* CharacterRenderer::compositeWithRegions(const std::string& basePath,
     bool upscaled = (base.width == 256 && base.height == 256 && width == 512);
     for (const auto& ul : baseLayers) {
         if (ul.empty()) continue;
-        auto overlay = assetManager->loadTexture(ul);
+        pipeline::BLPImage overlay;
+        if (predecodedBLPCache_) {
+            std::string key = ul;
+            std::replace(key.begin(), key.end(), '/', '\\');
+            std::transform(key.begin(), key.end(), key.begin(),
+                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+            auto pit = predecodedBLPCache_->find(key);
+            if (pit != predecodedBLPCache_->end()) {
+                overlay = std::move(pit->second);
+                predecodedBLPCache_->erase(pit);
+            }
+        }
+        if (!overlay.isValid()) overlay = assetManager->loadTexture(ul);
         if (!overlay.isValid()) continue;
 
         if (overlay.width == width && overlay.height == height) {
@@ -1171,7 +1219,19 @@ VkTexture* CharacterRenderer::compositeWithRegions(const std::string& basePath,
         int regionIdx = rl.first;
         if (regionIdx < 0 || regionIdx >= 8) continue;
 
-        auto overlay = assetManager->loadTexture(rl.second);
+        pipeline::BLPImage overlay;
+        if (predecodedBLPCache_) {
+            std::string key = rl.second;
+            std::replace(key.begin(), key.end(), '/', '\\');
+            std::transform(key.begin(), key.end(), key.begin(),
+                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+            auto pit = predecodedBLPCache_->find(key);
+            if (pit != predecodedBLPCache_->end()) {
+                overlay = std::move(pit->second);
+                predecodedBLPCache_->erase(pit);
+            }
+        }
+        if (!overlay.isValid()) overlay = assetManager->loadTexture(rl.second);
         if (!overlay.isValid()) {
             core::Logger::getInstance().warning("compositeWithRegions: failed to load ", rl.second);
             continue;

From 24f2ec75ec126c5598a545046f6a711ec68e765c Mon Sep 17 00:00:00 2001
From: Kelsi <kelsihates2fa@gmail.com>
Date: Sat, 7 Mar 2026 17:16:38 -0800
Subject: [PATCH 9/9] Defer normal map generation to reduce GPU model upload
 stalls by ~50%

Each loadTexture call was generating a normal/height map inline (3 full-image
passes: luminance + blur + Sobel). For models with 15-20 textures this added
30-40ms to the 70ms model upload. Now deferred to a per-frame budget (2/frame
in-game, 10/frame during load screen). Models render without POM until their
normal maps are ready.
---
 include/rendering/character_renderer.hpp | 13 +++++++
 src/core/application.cpp                 | 13 +++++--
 src/rendering/character_renderer.cpp     | 45 ++++++++++++++++++++----
 3 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/include/rendering/character_renderer.hpp b/include/rendering/character_renderer.hpp
index c7cae0d7..83cb3e7f 100644
--- a/include/rendering/character_renderer.hpp
+++ b/include/rendering/character_renderer.hpp
@@ -12,6 +12,7 @@
 #include <string>
 #include <utility>
 #include <future>
+#include <deque>
 
 namespace wowee {
 namespace pipeline { class AssetManager; }
@@ -278,6 +279,7 @@ private:
         uint64_t lastUse = 0;
         bool hasAlpha = false;
         bool colorKeyBlack = false;
+        bool normalMapPending = false;  // deferred normal map generation
     };
     std::unordered_map<std::string, TextureCacheEntry> textureCache;
     std::unordered_map<VkTexture*, bool> textureHasAlphaByPtr_;
@@ -302,6 +304,17 @@ private:
     std::unique_ptr<VkTexture> generateNormalHeightMap(
         const uint8_t* pixels, uint32_t width, uint32_t height, float& outVariance);
 
+    // Deferred normal map generation — avoids stalling loadModel
+    struct PendingNormalMap {
+        std::string cacheKey;
+        std::vector<uint8_t> pixels;  // RGBA pixel data
+        uint32_t width, height;
+    };
+    std::deque<PendingNormalMap> pendingNormalMaps_;
+public:
+    void processPendingNormalMaps(int budget = 2);
+private:
+
     // Normal mapping / POM settings
     bool normalMappingEnabled_ = true;
     float normalMapStrength_ = 0.8f;
diff --git a/src/core/application.cpp b/src/core/application.cpp
index b003af53..1a239d8a 100644
--- a/src/core/application.cpp
+++ b/src/core/application.cpp
@@ -922,14 +922,20 @@ void Application::update(float deltaTime) {
                 auto t3 = std::chrono::steady_clock::now();
                 processDeferredEquipmentQueue();
                 auto t4 = std::chrono::steady_clock::now();
+                // Process deferred normal maps (2 per frame to spread CPU cost)
+                if (auto* cr = renderer ? renderer->getCharacterRenderer() : nullptr) {
+                    cr->processPendingNormalMaps(2);
+                }
+                auto t5 = std::chrono::steady_clock::now();
                 float pMs = std::chrono::duration<float, std::milli>(t1 - t0).count();
                 float cMs = std::chrono::duration<float, std::milli>(t2 - t1).count();
                 float nMs = std::chrono::duration<float, std::milli>(t3 - t2).count();
                 float eMs = std::chrono::duration<float, std::milli>(t4 - t3).count();
-                float total = pMs + cMs + nMs + eMs;
+                float nmMs = std::chrono::duration<float, std::milli>(t5 - t4).count();
+                float total = pMs + cMs + nMs + eMs + nmMs;
                 if (total > 4.0f) {
                     LOG_WARNING("spawn/equip breakdown: player=", pMs, "ms creature=", cMs,
-                                "ms npcComposite=", nMs, "ms equip=", eMs, "ms");
+                                "ms npcComposite=", nMs, "ms equip=", eMs, "ms normalMaps=", nmMs, "ms");
                 }
             });
             // Self-heal missing creature visuals: if a nearby UNIT exists in
@@ -4250,6 +4256,9 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
             processCreatureSpawnQueue();
             processAsyncNpcCompositeResults();
             processDeferredEquipmentQueue();
+            if (auto* cr = renderer ? renderer->getCharacterRenderer() : nullptr) {
+                cr->processPendingNormalMaps(10);  // higher budget during load screen
+            }
 
             // Process ALL pending game object spawns (no 1-per-frame cap during load screen).
             while (!pendingGameObjectSpawns_.empty()) {
diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp
index 2031a7b4..baaaf3e6 100644
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@@ -687,13 +687,16 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
     e.hasAlpha = hasAlpha;
     e.colorKeyBlack = colorKeyBlackHint;
 
-    // Generate normal/height map from diffuse texture
-    float nhVariance = 0.0f;
-    auto nhMap = generateNormalHeightMap(blpImage.data.data(), blpImage.width, blpImage.height, nhVariance);
-    if (nhMap) {
-        e.heightMapVariance = nhVariance;
-        e.approxBytes += approxTextureBytesWithMips(blpImage.width, blpImage.height);
-        e.normalHeightMap = std::move(nhMap);
+    // Defer normal/height map generation to avoid stalling loadModel.
+    // Normal maps are generated in processPendingNormalMaps() at a per-frame budget.
+    if (blpImage.width >= 32 && blpImage.height >= 32) {
+        PendingNormalMap pending;
+        pending.cacheKey = key;
+        pending.pixels.assign(blpImage.data.begin(), blpImage.data.end());
+        pending.width = blpImage.width;
+        pending.height = blpImage.height;
+        pendingNormalMaps_.push_back(std::move(pending));
+        e.normalMapPending = true;
     }
 
     textureCacheBytes_ += e.approxBytes;
@@ -705,6 +708,34 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
     return texPtr;
 }
 
+void CharacterRenderer::processPendingNormalMaps(int budget) {
+    if (pendingNormalMaps_.empty() || !vkCtx_) return;
+
+    int processed = 0;
+    while (!pendingNormalMaps_.empty() && processed < budget) {
+        auto pending = std::move(pendingNormalMaps_.front());
+        pendingNormalMaps_.pop_front();
+
+        auto it = textureCache.find(pending.cacheKey);
+        if (it == textureCache.end()) continue;  // texture was evicted
+
+        float nhVariance = 0.0f;
+        vkCtx_->beginUploadBatch();
+        auto nhMap = generateNormalHeightMap(pending.pixels.data(),
+            pending.width, pending.height, nhVariance);
+        vkCtx_->endUploadBatch();
+
+        if (nhMap) {
+            it->second.heightMapVariance = nhVariance;
+            it->second.approxBytes += approxTextureBytesWithMips(pending.width, pending.height);
+            textureCacheBytes_ += approxTextureBytesWithMips(pending.width, pending.height);
+            it->second.normalHeightMap = std::move(nhMap);
+        }
+        it->second.normalMapPending = false;
+        processed++;
+    }
+}
+
 // Alpha-blend overlay onto composite at (dstX, dstY)
 static void blitOverlay(std::vector<uint8_t>& composite, int compW, int compH,
                          const pipeline::BLPImage& overlay, int dstX, int dstY) {