diff --git a/include/core/application.hpp b/include/core/application.hpp
index 92e96e8e..84b89f32 100644
--- a/include/core/application.hpp
+++ b/include/core/application.hpp
@@ -3,13 +3,19 @@
 #include "core/window.hpp"
 #include "core/input.hpp"
 #include "game/character.hpp"
+#include "pipeline/blp_loader.hpp"
 #include <memory>
 #include <string>
 #include <vector>
+#include <deque>
 #include <unordered_map>
 #include <unordered_set>
 #include <array>
 #include <optional>
+#include <future>
+#include <mutex>
+#include <thread>
+#include <atomic>
 
 namespace wowee {
 
@@ -18,7 +24,7 @@ namespace rendering { class Renderer; }
 namespace ui { class UIManager; }
 namespace auth { class AuthHandler; }
 namespace game { class GameHandler; class World; class ExpansionRegistry; }
-namespace pipeline { class AssetManager; class DBCLayout; }
+namespace pipeline { class AssetManager; class DBCLayout; struct M2Model; struct WMOModel; }
 namespace audio { enum class VoiceType; }
 
 namespace core {
@@ -90,6 +96,7 @@ private:
     static const char* mapIdToName(uint32_t mapId);
     void loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float z);
     void buildFactionHostilityMap(uint8_t playerRace);
+    pipeline::M2Model loadCreatureM2Sync(const std::string& m2Path);
     void spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x, float y, float z, float orientation);
     void despawnOnlineCreature(uint64_t guid);
     bool tryAttachCreatureVirtualWeapons(uint64_t guid, uint32_t instanceId);
@@ -181,8 +188,39 @@ private:
     std::unordered_map<uint64_t, glm::vec3> creatureRenderPosCache_; // guid -> last synced render position
     std::unordered_set<uint64_t> creatureWeaponsAttached_;       // guid set when NPC virtual weapons attached
     std::unordered_map<uint64_t, uint8_t> creatureWeaponAttachAttempts_; // guid -> attach attempts
+    std::unordered_map<uint32_t, bool> modelIdIsWolfLike_;     // modelId → cached wolf/worg check
+    static constexpr int MAX_WEAPON_ATTACHES_PER_TICK = 2;     // limit weapon attach work per 1s tick
+
+    // CharSections.dbc lookup cache to avoid O(N) DBC scan per NPC spawn.
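+    // Key: (race<<24)|(sex<<16)|(section<<12)|(variation<<8)|color → texture path. NOTE(review): as written the fields overlap for 8-bit inputs (variation<<8 reaches bits 12-15, which section<<12 also uses) and texIndex is not part of the key — confirm against buildCharSectionsCache().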
+    std::unordered_map<uint64_t, std::string> charSectionsCache_;
+    bool charSectionsCacheBuilt_ = false;
+    void buildCharSectionsCache();
+    std::string lookupCharSection(uint8_t race, uint8_t sex, uint8_t section,
+                                  uint8_t variation, uint8_t color, int texIndex = 0) const;
+
+    // Async creature model loading: file I/O + M2 parsing on background thread,
+    // GPU upload + instance creation on main thread.
+    struct PreparedCreatureModel { // Result handed back by one async creature load; every scalar is value-initialized so a default-constructed (failed) result never carries indeterminate data
+        uint64_t guid = 0;
+        uint32_t displayId = 0;
+        uint32_t modelId = 0;
+        float x = 0.0f, y = 0.0f, z = 0.0f, orientation = 0.0f; // presumably the spawn placement passed to spawnOnlineCreature() — confirm
+        std::shared_ptr<pipeline::M2Model> model; // parsed on background thread
+        std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures; // decoded on bg thread
+        bool valid = false;             // stays false unless the producer sets it
+        bool permanent_failure = false; // stays false unless the producer sets it
+    };
+    struct AsyncCreatureLoad { // One pending creature load; wraps only the future that yields its PreparedCreatureModel
+        std::future<PreparedCreatureModel> future;
+    };
+    std::vector<AsyncCreatureLoad> asyncCreatureLoads_;
+    void processAsyncCreatureResults();
+    static constexpr int MAX_ASYNC_CREATURE_LOADS = 4; // concurrent background loads
     std::unordered_set<uint64_t> deadCreatureGuids_;            // GUIDs that should spawn in corpse/death pose
     std::unordered_map<uint32_t, uint32_t> displayIdModelCache_; // displayId → modelId (model caching)
+    std::unordered_set<uint32_t> displayIdTexturesApplied_;    // displayIds with per-model textures applied
+    std::unordered_map<uint32_t, std::unordered_map<std::string, pipeline::BLPImage>> displayIdPredecodedTextures_; // displayId → pre-decoded skin textures
     mutable std::unordered_set<uint32_t> warnedMissingDisplayDataIds_; // displayIds already warned
     mutable std::unordered_set<uint32_t> warnedMissingModelPathIds_;   // modelIds/displayIds already warned
     uint32_t nextCreatureModelId_ = 5000;  // Model IDs for online creatures
@@ -250,7 +288,7 @@ private:
         uint32_t displayId;
         float x, y, z, orientation;
     };
-    std::vector<PendingCreatureSpawn> pendingCreatureSpawns_;
+    std::deque<PendingCreatureSpawn> pendingCreatureSpawns_;
     static constexpr int MAX_SPAWNS_PER_FRAME = 3;
     static constexpr int MAX_NEW_CREATURE_MODELS_PER_FRAME = 1;
     static constexpr uint16_t MAX_CREATURE_SPAWN_RETRIES = 300;
@@ -275,6 +313,49 @@ private:
     // Deferred equipment compositing queue — processes max 1 per frame to avoid stutter
     std::vector<std::pair<uint64_t, std::pair<std::array<uint32_t, 19>, std::array<uint8_t, 19>>>> deferredEquipmentQueue_;
     void processDeferredEquipmentQueue();
+    // Async equipment texture pre-decode: BLP decode on background thread, composite on main thread
+    struct PreparedEquipmentUpdate { // Result of one async equipment pre-decode; members are value-initialized so a default-constructed result is well-defined
+        uint64_t guid = 0;
+        std::array<uint32_t, 19> displayInfoIds{};  // zero-filled until populated
+        std::array<uint8_t, 19> inventoryTypes{};   // zero-filled until populated
+        std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures; // string-keyed images (presumably keyed by texture path — confirm)
+    };
+    struct AsyncEquipmentLoad { // One pending equipment pre-decode; wraps only the future that yields its PreparedEquipmentUpdate
+        std::future<PreparedEquipmentUpdate> future;
+    };
+    std::vector<AsyncEquipmentLoad> asyncEquipmentLoads_;
+    void processAsyncEquipmentResults();
+    std::vector<std::string> resolveEquipmentTexturePaths(uint64_t guid,
+        const std::array<uint32_t, 19>& displayInfoIds,
+        const std::array<uint8_t, 19>& inventoryTypes) const;
+    // Deferred NPC texture setup — async DBC lookups + BLP pre-decode to avoid main-thread stalls
+    struct DeferredNpcComposite { // Texture-setup description for one NPC model; ids default to 0 like every other scalar here instead of being left indeterminate
+        uint32_t modelId = 0;
+        uint32_t displayId = 0;
+        // Skin compositing (type-1 slots)
+        std::string basePath;                     // CharSections skin base texture
+        std::vector<std::string> overlayPaths;    // face + underwear overlays
+        std::vector<std::pair<int, std::string>> regionLayers;  // equipment region overlays
+        std::vector<uint32_t> skinTextureSlots;   // model texture slots needing skin composite
+        bool hasComposite = false;                // needs compositing (overlays or equipment regions)
+        bool hasSimpleSkin = false;               // just base skin, no compositing needed
+        // Baked skin (type-1 slots)
+        std::string bakedSkinPath;                // baked texture path (if available)
+        bool hasBakedSkin = false;                // baked skin resolved successfully
+        // Hair (type-6 slots)
+        std::vector<uint32_t> hairTextureSlots;   // model texture slots needing hair texture
+        std::string hairTexturePath;              // resolved hair texture path
+        bool useBakedForHair = false;             // bald NPC: use baked skin for type-6
+    };
+    struct PreparedNpcComposite { // Pairs a DeferredNpcComposite description with the images decoded for it
+        DeferredNpcComposite info;
+        std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures; // string-keyed images (presumably keyed by texture path — confirm)
+    };
+    struct AsyncNpcCompositeLoad { // One pending NPC texture preparation; wraps only the future that yields its PreparedNpcComposite
+        std::future<PreparedNpcComposite> future;
+    };
+    std::vector<AsyncNpcCompositeLoad> asyncNpcCompositeLoads_;
+    void processAsyncNpcCompositeResults();
     // Cache base player model geometry by (raceId, genderId)
     std::unordered_map<uint32_t, uint32_t> playerModelCache_; // key=(race<<8)|gender → modelId
     struct PlayerTextureSlots { int skin = -1; int hair = -1; int underwear = -1; };
@@ -302,6 +383,24 @@ private:
     };
     std::vector<PendingGameObjectSpawn> pendingGameObjectSpawns_;
     void processGameObjectSpawnQueue();
+
+    // Async WMO loading for game objects (file I/O + parse on background thread)
+    struct PreparedGameObjectWMO { // Result of one async game-object load; every scalar is value-initialized so a default-constructed (failed) result never carries indeterminate data
+        uint64_t guid = 0;
+        uint32_t entry = 0;
+        uint32_t displayId = 0;
+        float x = 0.0f, y = 0.0f, z = 0.0f, orientation = 0.0f; // NOTE(review): presumably the pending spawn's placement — confirm against the producer
+        std::shared_ptr<pipeline::WMOModel> wmoModel;
+        std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures; // decoded on bg thread
+        bool valid = false;   // stays false unless the producer sets it
+        bool isWmo = false;   // stays false unless the producer sets it
+        std::string modelPath;
+    };
+    struct AsyncGameObjectLoad { // One pending game-object load; wraps only the future that yields its PreparedGameObjectWMO
+        std::future<PreparedGameObjectWMO> future;
+    };
+    std::vector<AsyncGameObjectLoad> asyncGameObjectLoads_;
+    void processAsyncGameObjectResults();
     struct PendingTransportDoodadBatch {
         uint64_t guid = 0;
         uint32_t modelId = 0;
@@ -321,6 +420,23 @@ private:
     // Quest marker billboard sprites (above NPCs)
     void loadQuestMarkerModels();  // Now loads BLP textures
     void updateQuestMarkers();     // Updates billboard positions
+
+    // Background world preloader — warms AssetManager file cache for the
+    // expected world before the user clicks Enter World.
+    struct WorldPreload { // State owned by one background preload run (held through worldPreload_)
+        uint32_t mapId = 0;
+        std::string mapName;
+        int centerTileX = 0;
+        int centerTileY = 0;
+        std::atomic<bool> cancel{false};  // starts false; presumably raised by cancelWorldPreload() to stop the workers — confirm
+        std::vector<std::thread> workers; // NOTE(review): destroying a joinable std::thread calls std::terminate — confirm cancelWorldPreload() joins every worker before this object is released
+    };
+    std::unique_ptr<WorldPreload> worldPreload_;
+    void startWorldPreload(uint32_t mapId, const std::string& mapName, float serverX, float serverY);
+    void cancelWorldPreload();
+    void saveLastWorldInfo(uint32_t mapId, const std::string& mapName, float serverX, float serverY);
+    struct LastWorldInfo { uint32_t mapId = 0; std::string mapName; float x = 0, y = 0; bool valid = false; }; // all fields defaulted; valid starts false (initialize() only preloads when it is true)
+    LastWorldInfo loadLastWorldInfo() const;
 };
 
 } // namespace core
diff --git a/include/rendering/character_renderer.hpp b/include/rendering/character_renderer.hpp
index c6f63451..83cb3e7f 100644
--- a/include/rendering/character_renderer.hpp
+++ b/include/rendering/character_renderer.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "pipeline/m2_loader.hpp"
+#include "pipeline/blp_loader.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <glm/glm.hpp>
@@ -11,6 +12,7 @@
 #include <string>
 #include <utility>
 #include <future>
+#include <deque>
 
 namespace wowee {
 namespace pipeline { class AssetManager; }
@@ -114,7 +116,11 @@ public:
     void setShadowMap(VkTexture*, const glm::mat4&) {}
     void clearShadowMap() {}
 
+    // Pre-decoded BLP cache: set before calling loadModel() to skip main-thread BLP decode. Only the raw pointer is stored (non-owning), so the map must stay alive while it is set; NOTE(review): presumably callers reset it to nullptr afterwards — confirm.
+    void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }
+
 private:
+    std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
     // GPU representation of M2 model
     struct M2ModelGPU {
         VkBuffer vertexBuffer = VK_NULL_HANDLE;
@@ -180,6 +186,7 @@ private:
 
         // Bone update throttling (skip frames for distant characters)
         uint32_t boneUpdateCounter = 0;
+        const M2ModelGPU* cachedModel = nullptr;  // Avoid per-frame hash lookups
 
         // Per-instance bone SSBO (double-buffered per frame)
         VkBuffer boneBuffer[2] = {};
@@ -254,7 +261,14 @@ private:
     VkDescriptorPool materialDescPools_[2] = {VK_NULL_HANDLE, VK_NULL_HANDLE};
     VkDescriptorPool boneDescPool_ = VK_NULL_HANDLE;
     uint32_t lastMaterialPoolResetFrame_ = 0xFFFFFFFFu;
-    std::vector<std::pair<VkBuffer, VmaAllocation>> transientMaterialUbos_[2];
+
+    // Material UBO ring buffer — pre-allocated per frame slot, sub-allocated each draw
+    VkBuffer materialRingBuffer_[2] = {VK_NULL_HANDLE, VK_NULL_HANDLE};
+    VmaAllocation materialRingAlloc_[2] = {VK_NULL_HANDLE, VK_NULL_HANDLE};
+    void* materialRingMapped_[2] = {nullptr, nullptr};
+    uint32_t materialRingOffset_[2] = {0, 0};
+    uint32_t materialUboAlignment_ = 256;  // minUniformBufferOffsetAlignment
+    static constexpr uint32_t MATERIAL_RING_CAPACITY = 4096;
 
     // Texture cache
     struct TextureCacheEntry {
@@ -265,6 +279,7 @@ private:
         uint64_t lastUse = 0;
         bool hasAlpha = false;
         bool colorKeyBlack = false;
+        bool normalMapPending = false;  // deferred normal map generation
     };
     std::unordered_map<std::string, TextureCacheEntry> textureCache;
     std::unordered_map<VkTexture*, bool> textureHasAlphaByPtr_;
@@ -289,6 +304,17 @@ private:
     std::unique_ptr<VkTexture> generateNormalHeightMap(
         const uint8_t* pixels, uint32_t width, uint32_t height, float& outVariance);
 
+    // Deferred normal map generation — avoids stalling loadModel
+    struct PendingNormalMap { // One queued normal-map generation job (stored in pendingNormalMaps_)
+        std::string cacheKey;         // presumably the textureCache key this job belongs to — confirm
+        std::vector<uint8_t> pixels;  // RGBA pixel data
+        uint32_t width = 0, height = 0;  // zero until filled in, so a default-constructed job never holds indeterminate dimensions
+    };
+    std::deque<PendingNormalMap> pendingNormalMaps_;
+public:
+    void processPendingNormalMaps(int budget = 2);
+private:
+
     // Normal mapping / POM settings
     bool normalMappingEnabled_ = true;
     float normalMapStrength_ = 0.8f;
diff --git a/include/rendering/m2_renderer.hpp b/include/rendering/m2_renderer.hpp
index f53fb4bf..1c35e34b 100644
--- a/include/rendering/m2_renderer.hpp
+++ b/include/rendering/m2_renderer.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "pipeline/m2_loader.hpp"
+#include "pipeline/blp_loader.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <glm/glm.hpp>
@@ -188,6 +189,7 @@ struct M2Instance {
     bool skipCollision = false;    // WMO interior doodads — skip player wall collision
     float cachedBoundRadius = 0.0f;
     float portalSpinAngle = 0.0f;  // Accumulated spin angle for portal rotation
+    const M2ModelGPU* cachedModel = nullptr;  // Avoid per-frame hash lookups
 
     // Frame-skip optimization (update distant animations less frequently)
     uint8_t frameSkipCounter = 0;
@@ -328,6 +330,10 @@ public:
 
     std::vector<glm::vec3> getWaterVegetationPositions(const glm::vec3& camPos, float maxDist) const;
 
+    // Pre-decoded BLP cache: set by terrain manager before calling loadModel()
+    // so loadTexture() can skip the expensive assetManager->loadTexture() call. Only the raw pointer is stored (non-owning), so the map must stay alive while it is set.
+    void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }
+
 private:
     bool initialized_ = false;
     bool insideInterior = false;
@@ -389,12 +395,33 @@ private:
     std::unordered_map<uint32_t, M2ModelGPU> models;
     std::vector<M2Instance> instances;
 
+    // O(1) dedup: key = (modelId, quantized x, quantized y, quantized z) → instanceId
+    struct DedupKey { // Lookup key for instanceDedupMap_: model id plus quantized position
+        uint32_t modelId;
+        int32_t qx, qy, qz; // position quantized to 0.1 units
+        bool operator==(const DedupKey& o) const { // keys are equal only when all four fields match
+            return modelId == o.modelId && qx == o.qx && qy == o.qy && qz == o.qz;
+        }
+    };
+    struct DedupHash { // Hash functor for DedupKey, used by instanceDedupMap_
+        size_t operator()(const DedupKey& k) const {
+            size_t h = std::hash<uint32_t>()(k.modelId);       // seed with the model id hash
+            h ^= std::hash<int32_t>()(k.qx) * 2654435761u;     // each coordinate hash is multiplied by a distinct constant before being XORed in
+            h ^= std::hash<int32_t>()(k.qy) * 40503u;
+            h ^= std::hash<int32_t>()(k.qz) * 12289u;
+            return h;
+        }
+    };
+    std::unordered_map<DedupKey, uint32_t, DedupHash> instanceDedupMap_;
+
     uint32_t nextInstanceId = 1;
     uint32_t lastDrawCallCount = 0;
     size_t modelCacheLimit_ = 6000;
     uint32_t modelLimitRejectWarnings_ = 0;
 
     VkTexture* loadTexture(const std::string& path, uint32_t texFlags = 0);
+    std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
+
     struct TextureCacheEntry {
         std::unique_ptr<VkTexture> texture;
         size_t approxBytes = 0;
diff --git a/include/rendering/terrain_manager.hpp b/include/rendering/terrain_manager.hpp
index 0090edc4..6f732721 100644
--- a/include/rendering/terrain_manager.hpp
+++ b/include/rendering/terrain_manager.hpp
@@ -121,6 +121,12 @@ struct PendingTile {
     // Pre-loaded terrain texture BLP data (loaded on background thread to avoid
     // blocking file I/O on the main thread during finalizeTile)
     std::unordered_map<std::string, pipeline::BLPImage> preloadedTextures;
+
+    // Pre-decoded M2 model textures (decoded on background thread)
+    std::unordered_map<std::string, pipeline::BLPImage> preloadedM2Textures;
+
+    // Pre-decoded WMO textures (decoded on background thread)
+    std::unordered_map<std::string, pipeline::BLPImage> preloadedWMOTextures;
 };
 
 /**
@@ -150,6 +156,11 @@ struct FinalizingTile {
     size_t wmoModelIndex = 0;      // Next WMO model to upload
     size_t wmoDoodadIndex = 0;     // Next WMO doodad to upload
 
+    // Incremental terrain upload state (splits TERRAIN phase across frames)
+    bool terrainPreloaded = false;  // True after preloaded textures uploaded
+    int terrainChunkNext = 0;       // Next chunk index to upload (0-255, row-major)
+    bool terrainMeshDone = false;   // True when all chunks uploaded
+
     // Accumulated results (built up across phases)
     std::vector<uint32_t> m2InstanceIds;
     std::vector<uint32_t> wmoInstanceIds;
@@ -376,6 +387,11 @@ private:
     std::unordered_set<std::string> missingAdtWarnings_;
     std::mutex missingAdtWarningsMutex_;
 
+    // Thread-safe set of M2 model IDs already uploaded to GPU
+    // (checked by workers to skip redundant file I/O + parsing)
+    std::unordered_set<uint32_t> uploadedM2Ids_;
+    std::mutex uploadedM2IdsMutex_;
+
     // Dedup set for doodad placements across tile boundaries
     std::unordered_set<uint32_t> placedDoodadIds;
 
diff --git a/include/rendering/terrain_renderer.hpp b/include/rendering/terrain_renderer.hpp
index 91279e9c..77af9a64 100644
--- a/include/rendering/terrain_renderer.hpp
+++ b/include/rendering/terrain_renderer.hpp
@@ -86,6 +86,13 @@ public:
                      const std::vector<std::string>& texturePaths,
                      int tileX = -1, int tileY = -1);
 
+    /// Upload a batch of terrain chunks incrementally. Returns true when all chunks done.
+    /// chunkIndex is updated to the next chunk to process (0-255 row-major).
+    bool loadTerrainIncremental(const pipeline::TerrainMesh& mesh,
+                                const std::vector<std::string>& texturePaths,
+                                int tileX, int tileY,
+                                int& chunkIndex, int maxChunksPerCall = 16);
+
     void removeTile(int tileX, int tileY);
 
     void uploadPreloadedTextures(const std::unordered_map<std::string, pipeline::BLPImage>& textures);
@@ -120,6 +127,7 @@ public:
     int getRenderedChunkCount() const { return renderedChunks; }
     int getCulledChunkCount() const { return culledChunks; }
     int getTriangleCount() const;
+    VkContext* getVkContext() const { return vkCtx; }  // returns the stored vkCtx pointer unchanged
 
 private:
     TerrainChunkGPU uploadChunk(const pipeline::ChunkMesh& chunk);
diff --git a/include/rendering/vk_context.hpp b/include/rendering/vk_context.hpp
index 3a242940..907e21bf 100644
--- a/include/rendering/vk_context.hpp
+++ b/include/rendering/vk_context.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "rendering/vk_utils.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <VkBootstrap.h>
@@ -46,6 +47,16 @@ public:
     // Immediate submit for one-off GPU work (descriptor pool creation, etc.)
     void immediateSubmit(std::function<void(VkCommandBuffer cmd)>&& function);
 
+    // Batch upload mode: records multiple upload commands into a single
+    // command buffer, then submits with ONE fence wait instead of one per upload.
+    void beginUploadBatch();
+    void endUploadBatch();       // Async: submits but does NOT wait for fence
+    void endUploadBatchSync();   // Sync: submits and waits (for load screens)
+    bool isInUploadBatch() const { return inUploadBatch_; }  // reports the inUploadBatch_ flag (tracked separately from uploadBatchDepth_)
+    void deferStagingCleanup(AllocatedBuffer staging);
+    void pollUploadBatches();    // Check completed async uploads, free staging buffers
+    void waitAllUploads();       // Block until all in-flight uploads complete
+
     // Accessors
     VkInstance getInstance() const { return instance; }
     VkPhysicalDevice getPhysicalDevice() const { return physicalDevice; }
@@ -143,6 +154,20 @@ private:
     VkCommandPool immCommandPool = VK_NULL_HANDLE;
     VkFence immFence = VK_NULL_HANDLE;
 
+    // Batch upload state (nesting-safe via depth counter)
+    int uploadBatchDepth_ = 0;
+    bool inUploadBatch_ = false;
+    VkCommandBuffer batchCmd_ = VK_NULL_HANDLE;
+    std::vector<AllocatedBuffer> batchStagingBuffers_;
+
+    // Async upload: in-flight batches awaiting GPU completion
+    struct InFlightBatch { // One submitted upload batch tracked in inFlightBatches_
+        VkFence fence = VK_NULL_HANDLE;        // null until assigned
+        VkCommandBuffer cmd = VK_NULL_HANDLE;  // null until assigned
+        std::vector<AllocatedBuffer> stagingBuffers; // staging buffers attached to this batch; presumably released by pollUploadBatches()/waitAllUploads() — confirm
+    };
+    std::vector<InFlightBatch> inFlightBatches_;
+
     // Depth buffer (shared across all framebuffers)
     VkImage depthImage = VK_NULL_HANDLE;
     VkImageView depthImageView = VK_NULL_HANDLE;
diff --git a/include/rendering/wmo_renderer.hpp b/include/rendering/wmo_renderer.hpp
index 095a354d..f0d3b36f 100644
--- a/include/rendering/wmo_renderer.hpp
+++ b/include/rendering/wmo_renderer.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "pipeline/blp_loader.hpp"
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include <glm/glm.hpp>
@@ -325,6 +326,12 @@ public:
     // Pre-compute floor cache for all loaded WMO instances
     void precomputeFloorCache();
 
+    // Pre-decoded BLP cache: set before calling loadModel() to skip main-thread BLP decode. Only the raw pointer is stored (non-owning), so the map must stay alive while it is set.
+    void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }
+
+    // Defer normal/height map generation during streaming to avoid CPU stalls. This setter only stores the flag in deferNormalMaps_ (default false).
+    void setDeferNormalMaps(bool defer) { deferNormalMaps_ = defer; }
+
 private:
     // WMO material UBO — matches WMOMaterial in wmo.frag.glsl
     struct WMOMaterialUBO {
@@ -558,6 +565,7 @@ private:
      * Load a texture from path
      */
     VkTexture* loadTexture(const std::string& path);
+    std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
 
     /**
      * Generate normal+height map from diffuse RGBA8 pixels
@@ -670,6 +678,7 @@ private:
 
     // Normal mapping / POM settings
     bool normalMappingEnabled_ = true;   // on by default
+    bool deferNormalMaps_ = false;       // skip normal map gen during streaming
     float normalMapStrength_ = 0.8f;     // 0.0 = flat, 1.0 = full, 2.0 = exaggerated
     bool pomEnabled_ = true;             // on by default
     int pomQuality_ = 1;                 // 0=Low(16), 1=Medium(32), 2=High(64)
diff --git a/src/core/application.cpp b/src/core/application.cpp
index 2a8ef041..1a239d8a 100644
--- a/src/core/application.cpp
+++ b/src/core/application.cpp
@@ -56,6 +56,7 @@
 #include <sstream>
 #include <set>
 #include <filesystem>
+#include <fstream>
 
 #include <thread>
 #ifdef __linux__
@@ -314,6 +315,15 @@ bool Application::initialize() {
             gameHandler->getTransportManager()->loadTaxiPathNodeDBC(assetManager.get());
         }
 
+        // Start background preload for last-played character's world.
+        // Warms the file cache so terrain tile loading is faster at Enter World.
+        {
+            auto lastWorld = loadLastWorldInfo();
+            if (lastWorld.valid) {
+                startWorldPreload(lastWorld.mapId, lastWorld.mapName, lastWorld.x, lastWorld.y);
+            }
+        }
+
     } else {
         LOG_WARNING("Failed to initialize asset manager - asset loading will be unavailable");
         LOG_WARNING("Set WOW_DATA_PATH environment variable to your WoW Data directory");
@@ -521,6 +531,9 @@ void Application::run() {
 void Application::shutdown() {
     LOG_WARNING("Shutting down application...");
 
+    // Stop background world preloader before destroying AssetManager
+    cancelWorldPreload();
+
     // Save floor cache before renderer is destroyed
     if (renderer && renderer->getWMORenderer()) {
         size_t cacheSize = renderer->getWMORenderer()->getFloorCacheSize();
@@ -734,6 +747,16 @@ void Application::logoutToLogin() {
     deadCreatureGuids_.clear();
     nonRenderableCreatureDisplayIds_.clear();
     creaturePermanentFailureGuids_.clear();
+    modelIdIsWolfLike_.clear();
+    displayIdTexturesApplied_.clear();
+    charSectionsCache_.clear();
+    charSectionsCacheBuilt_ = false;
+
+    // Wait for any in-flight async creature loads before clearing state
+    for (auto& load : asyncCreatureLoads_) {
+        if (load.future.valid()) load.future.wait();
+    }
+    asyncCreatureLoads_.clear();
 
     // --- Creature spawn queues ---
     pendingCreatureSpawns_.clear();
@@ -833,6 +856,7 @@ void Application::update(float deltaTime) {
             const char* inGameStep = "begin";
             try {
             auto runInGameStage = [&](const char* stageName, auto&& fn) {
+                auto stageStart = std::chrono::steady_clock::now();
                 try {
                     fn();
                 } catch (const std::bad_alloc& e) {
@@ -842,6 +866,11 @@ void Application::update(float deltaTime) {
                     LOG_ERROR("Exception during IN_GAME update stage '", stageName, "': ", e.what());
                     throw;
                 }
+                auto stageEnd = std::chrono::steady_clock::now();
+                float stageMs = std::chrono::duration<float, std::milli>(stageEnd - stageStart).count();
+                if (stageMs > 3.0f) {
+                    LOG_WARNING("SLOW update stage '", stageName, "': ", stageMs, "ms");
+                }
             };
             inGameStep = "gameHandler update";
             updateCheckpoint = "in_game: gameHandler update";
@@ -884,11 +913,30 @@ void Application::update(float deltaTime) {
             inGameStep = "spawn/equipment queues";
             updateCheckpoint = "in_game: spawn/equipment queues";
             runInGameStage("spawn/equipment queues", [&] {
+                auto t0 = std::chrono::steady_clock::now();
                 processPlayerSpawnQueue();
-                // Process deferred online creature spawns (throttled)
+                auto t1 = std::chrono::steady_clock::now();
                 processCreatureSpawnQueue();
-                // Process deferred equipment compositing (max 1 per frame to avoid stutter)
+                auto t2 = std::chrono::steady_clock::now();
+                processAsyncNpcCompositeResults();
+                auto t3 = std::chrono::steady_clock::now();
                 processDeferredEquipmentQueue();
+                auto t4 = std::chrono::steady_clock::now();
+                // Process deferred normal maps (2 per frame to spread CPU cost)
+                if (auto* cr = renderer ? renderer->getCharacterRenderer() : nullptr) {
+                    cr->processPendingNormalMaps(2);
+                }
+                auto t5 = std::chrono::steady_clock::now();
+                float pMs = std::chrono::duration<float, std::milli>(t1 - t0).count();
+                float cMs = std::chrono::duration<float, std::milli>(t2 - t1).count();
+                float nMs = std::chrono::duration<float, std::milli>(t3 - t2).count();
+                float eMs = std::chrono::duration<float, std::milli>(t4 - t3).count();
+                float nmMs = std::chrono::duration<float, std::milli>(t5 - t4).count();
+                float total = pMs + cMs + nMs + eMs + nmMs;
+                if (total > 4.0f) {
+                    LOG_WARNING("spawn/equip breakdown: player=", pMs, "ms creature=", cMs,
+                                "ms npcComposite=", nMs, "ms equip=", eMs, "ms normalMaps=", nmMs, "ms");
+                }
             });
             // Self-heal missing creature visuals: if a nearby UNIT exists in
             // entity state but has no render instance, queue a spawn retry.
@@ -1279,12 +1327,14 @@ void Application::update(float deltaTime) {
             // creature models remain at stale spawn positions.
             inGameStep = "creature render sync";
             updateCheckpoint = "in_game: creature render sync";
+            auto creatureSyncStart = std::chrono::steady_clock::now();
             if (renderer && gameHandler && renderer->getCharacterRenderer()) {
                 auto* charRenderer = renderer->getCharacterRenderer();
                 static float npcWeaponRetryTimer = 0.0f;
                 npcWeaponRetryTimer += deltaTime;
                 const bool npcWeaponRetryTick = (npcWeaponRetryTimer >= 1.0f);
                 if (npcWeaponRetryTick) npcWeaponRetryTimer = 0.0f;
+                int weaponAttachesThisTick = 0;
                 glm::vec3 playerPos(0.0f);
                 glm::vec3 playerRenderPos(0.0f);
                 bool havePlayerPos = false;
@@ -1304,11 +1354,14 @@ void Application::update(float deltaTime) {
                     auto entity = gameHandler->getEntityManager().getEntity(guid);
                     if (!entity || entity->getType() != game::ObjectType::UNIT) continue;
 
-                    if (npcWeaponRetryTick && !creatureWeaponsAttached_.count(guid)) {
+                    if (npcWeaponRetryTick &&
+                        weaponAttachesThisTick < MAX_WEAPON_ATTACHES_PER_TICK &&
+                        !creatureWeaponsAttached_.count(guid)) {
                         uint8_t attempts = 0;
                         auto itAttempts = creatureWeaponAttachAttempts_.find(guid);
                         if (itAttempts != creatureWeaponAttachAttempts_.end()) attempts = itAttempts->second;
                         if (attempts < 30) {
+                            weaponAttachesThisTick++;
                             if (tryAttachCreatureVirtualWeapons(guid, instanceId)) {
                                 creatureWeaponsAttached_.insert(guid);
                                 creatureWeaponAttachAttempts_.erase(guid);
@@ -1319,24 +1372,31 @@ void Application::update(float deltaTime) {
                     }
 
                     glm::vec3 canonical(entity->getX(), entity->getY(), entity->getZ());
+                    float canonDistSq = 0.0f;
                     if (havePlayerPos) {
                         glm::vec3 d = canonical - playerPos;
-                        if (glm::dot(d, d) > syncRadiusSq) continue;
+                        canonDistSq = glm::dot(d, d);
+                        if (canonDistSq > syncRadiusSq) continue;
                     }
 
                     glm::vec3 renderPos = core::coords::canonicalToRender(canonical);
 
                     // Visual collision guard: keep hostile melee units from rendering inside the
                     // player's model while attacking. This is client-side only (no server position change).
-                    auto unit = std::static_pointer_cast<game::Unit>(entity);
-                    const uint64_t currentTargetGuid = gameHandler->hasTarget() ? gameHandler->getTargetGuid() : 0;
-                    const uint64_t autoAttackGuid = gameHandler->getAutoAttackTargetGuid();
-                    const bool isCombatTarget = (guid == currentTargetGuid || guid == autoAttackGuid);
-                    bool clipGuardEligible = havePlayerPos &&
-                                             unit->getHealth() > 0 &&
-                                             (unit->isHostile() ||
-                                              gameHandler->isAggressiveTowardPlayer(guid) ||
-                                              isCombatTarget);
+                    // Only check for creatures within 8 units (melee range) — saves expensive
+                    // getRenderBoundsForGuid/getModelData calls for distant creatures.
+                    bool clipGuardEligible = false;
+                    bool isCombatTarget = false;
+                    if (havePlayerPos && canonDistSq < 64.0f) { // 8² = melee range
+                        auto unit = std::static_pointer_cast<game::Unit>(entity);
+                        const uint64_t currentTargetGuid = gameHandler->hasTarget() ? gameHandler->getTargetGuid() : 0;
+                        const uint64_t autoAttackGuid = gameHandler->getAutoAttackTargetGuid();
+                        isCombatTarget = (guid == currentTargetGuid || guid == autoAttackGuid);
+                        clipGuardEligible = unit->getHealth() > 0 &&
+                                            (unit->isHostile() ||
+                                             gameHandler->isAggressiveTowardPlayer(guid) ||
+                                             isCombatTarget);
+                    }
                     if (clipGuardEligible) {
                         float creatureCollisionRadius = 0.8f;
                         glm::vec3 cc;
@@ -1355,14 +1415,21 @@ void Application::update(float deltaTime) {
                         // often put head/torso inside the player capsule).
                         auto mit = creatureModelIds_.find(guid);
                         if (mit != creatureModelIds_.end()) {
-                            if (const auto* md = charRenderer->getModelData(mit->second)) {
-                                std::string modelName = md->name;
-                                std::transform(modelName.begin(), modelName.end(), modelName.begin(),
-                                               [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
-                                if (modelName.find("wolf") != std::string::npos ||
-                                    modelName.find("worg") != std::string::npos) {
-                                    minSep = std::max(minSep, 2.45f);
+                            uint32_t mid = mit->second;
+                            auto wolfIt = modelIdIsWolfLike_.find(mid);
+                            if (wolfIt == modelIdIsWolfLike_.end()) {
+                                bool isWolf = false;
+                                if (const auto* md = charRenderer->getModelData(mid)) {
+                                    std::string modelName = md->name;
+                                    std::transform(modelName.begin(), modelName.end(), modelName.begin(),
+                                                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                                    isWolf = (modelName.find("wolf") != std::string::npos ||
+                                              modelName.find("worg") != std::string::npos);
                                 }
+                                wolfIt = modelIdIsWolfLike_.emplace(mid, isWolf).first;
+                            }
+                            if (wolfIt->second) {
+                                minSep = std::max(minSep, 2.45f);
                             }
                         }
 
@@ -1389,7 +1456,8 @@ void Application::update(float deltaTime) {
                         float planarDist = glm::length(delta2);
                         float dz = std::abs(renderPos.z - prevPos.z);
 
-                        const bool deadOrCorpse = unit->getHealth() == 0;
+                        auto unitPtr = std::static_pointer_cast<game::Unit>(entity);
+                        const bool deadOrCorpse = unitPtr->getHealth() == 0;
                         const bool largeCorrection = (planarDist > 6.0f) || (dz > 3.0f);
                         if (deadOrCorpse || largeCorrection) {
                             charRenderer->setInstancePosition(instanceId, renderPos);
@@ -1404,6 +1472,14 @@ void Application::update(float deltaTime) {
                     charRenderer->setInstanceRotation(instanceId, glm::vec3(0.0f, 0.0f, renderYaw));
                 }
             }
+            {
+                float csMs = std::chrono::duration<float, std::milli>(
+                    std::chrono::steady_clock::now() - creatureSyncStart).count();
+                if (csMs > 5.0f) {
+                    LOG_WARNING("SLOW update stage 'creature render sync': ", csMs, "ms (",
+                                creatureInstances_.size(), " creatures)");
+                }
+            }
 
             // Movement heartbeat is sent from GameHandler::update() to avoid
             // duplicate packets from multiple update loops.
@@ -1426,6 +1502,7 @@ void Application::update(float deltaTime) {
     // Update renderer (camera, etc.) only when in-game
     updateCheckpoint = "renderer update";
     if (renderer && state == AppState::IN_GAME) {
+        auto rendererUpdateStart = std::chrono::steady_clock::now();
         try {
             renderer->update(deltaTime);
         } catch (const std::bad_alloc& e) {
@@ -1435,6 +1512,11 @@ void Application::update(float deltaTime) {
             LOG_ERROR("Exception during Application::update stage 'renderer->update': ", e.what());
             throw;
         }
+        float ruMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - rendererUpdateStart).count();
+        if (ruMs > 5.0f) {
+            LOG_WARNING("SLOW update stage 'renderer->update': ", ruMs, "ms");
+        }
     }
     // Update UI
     updateCheckpoint = "ui update";
@@ -3465,6 +3547,14 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
         deadCreatureGuids_.clear();
         nonRenderableCreatureDisplayIds_.clear();
         creaturePermanentFailureGuids_.clear();
+        modelIdIsWolfLike_.clear();
+        displayIdTexturesApplied_.clear();
+        charSectionsCache_.clear();
+        charSectionsCacheBuilt_ = false;
+        for (auto& load : asyncCreatureLoads_) {
+            if (load.future.valid()) load.future.wait();
+        }
+        asyncCreatureLoads_.clear();
 
         playerInstances_.clear();
         onlinePlayerAppearance_.clear();
@@ -3508,6 +3598,21 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
     }
     LOG_INFO("Loading online world terrain for map '", mapName, "' (ID ", mapId, ")");
 
+    // Cancel any stale preload (if it was for a different map, the file cache
+    // still retains whatever was loaded — it doesn't hurt).
+    if (worldPreload_) {
+        if (worldPreload_->mapId == mapId) {
+            LOG_INFO("World preload: cache-warm hit for map '", mapName, "'");
+        } else {
+            LOG_INFO("World preload: map mismatch (preloaded ", worldPreload_->mapName,
+                     ", entering ", mapName, ")");
+        }
+    }
+    cancelWorldPreload();
+
+    // Save this world info for next session's early preload
+    saveLastWorldInfo(mapId, mapName, x, y);
+
     // Convert server coordinates to canonical WoW coordinates
     // Server sends: X=West (canonical.Y), Y=North (canonical.X), Z=Up
     glm::vec3 spawnCanonical = core::coords::serverToCanonical(glm::vec3(x, y, z));
@@ -3896,6 +4001,13 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
             auto* terrainMgr = renderer->getTerrainManager();
             auto* camera = renderer->getCamera();
 
+            // Use a small radius for the initial load (just immediate tiles),
+            // then restore the full radius once this initial streaming pass completes.
+            // This matches WoW's behavior: load quickly, stream the rest in-game.
+            const int savedLoadRadius = 4;
+            terrainMgr->setLoadRadius(1);
+            terrainMgr->setUnloadRadius(7);
+
             // Trigger tile streaming for surrounding area
             terrainMgr->update(*camera, 1.0f);
 
@@ -3931,8 +4043,11 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
                 // Trigger new streaming — enqueue tiles for background workers
                 terrainMgr->update(*camera, 0.016f);
 
-                // Process ONE tile per iteration so loading screen updates after each
-                terrainMgr->processOneReadyTile();
+                // Process ALL available ready tiles per iteration — batches GPU
+                // uploads into a single command buffer + fence wait instead of
+                // one fence per tile.  Loading screen still updates between
+                // iterations while workers parse more tiles.
+                terrainMgr->processAllReadyTiles();
 
                 int remaining = terrainMgr->getRemainingTileCount();
                 int loaded = terrainMgr->getLoadedTileCount();
@@ -3987,6 +4102,9 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
 
             LOG_INFO("Online terrain streaming complete: ", terrainMgr->getLoadedTileCount(), " tiles loaded");
 
+            // Restore full load radius — remaining tiles stream in-game
+            terrainMgr->setLoadRadius(savedLoadRadius);
+
             // Load/precompute collision cache
             if (renderer->getWMORenderer()) {
                 showProgress("Building collision cache...", 0.88f);
@@ -4087,9 +4205,68 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
 
             if (world) world->update(1.0f / 60.0f);
             processPlayerSpawnQueue();
+
+            // During load screen warmup: lift per-frame budgets so GPU uploads
+            // happen in bulk while the loading screen is still visible.
+            // Process ALL async creature model uploads (no 3-per-frame cap).
+            {
+                for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) {
+                    if (!it->future.valid() ||
+                        it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
+                        ++it;
+                        continue;
+                    }
+                    auto result = it->future.get();
+                    it = asyncCreatureLoads_.erase(it);
+                    if (result.permanent_failure) {
+                        nonRenderableCreatureDisplayIds_.insert(result.displayId);
+                        creaturePermanentFailureGuids_.insert(result.guid);
+                        pendingCreatureSpawnGuids_.erase(result.guid);
+                        creatureSpawnRetryCounts_.erase(result.guid);
+                        continue;
+                    }
+                    if (!result.valid || !result.model) {
+                        pendingCreatureSpawnGuids_.erase(result.guid);
+                        creatureSpawnRetryCounts_.erase(result.guid);
+                        continue;
+                    }
+                    auto* charRenderer = renderer ? renderer->getCharacterRenderer() : nullptr;
+                    if (!charRenderer) { pendingCreatureSpawnGuids_.erase(result.guid); continue; }
+                    if (!charRenderer->loadModel(*result.model, result.modelId)) {
+                        nonRenderableCreatureDisplayIds_.insert(result.displayId);
+                        creaturePermanentFailureGuids_.insert(result.guid);
+                        pendingCreatureSpawnGuids_.erase(result.guid);
+                        creatureSpawnRetryCounts_.erase(result.guid);
+                        continue;
+                    }
+                    displayIdModelCache_[result.displayId] = result.modelId;
+                    pendingCreatureSpawnGuids_.erase(result.guid);
+                    creatureSpawnRetryCounts_.erase(result.guid);
+                    if (!creatureInstances_.count(result.guid) &&
+                        !creaturePermanentFailureGuids_.count(result.guid)) {
+                        PendingCreatureSpawn s{};
+                        s.guid = result.guid; s.displayId = result.displayId;
+                        s.x = result.x; s.y = result.y; s.z = result.z;
+                        s.orientation = result.orientation;
+                        pendingCreatureSpawns_.push_back(s);
+                        pendingCreatureSpawnGuids_.insert(result.guid);
+                    }
+                }
+            }
             processCreatureSpawnQueue();
+            processAsyncNpcCompositeResults();
             processDeferredEquipmentQueue();
-            processGameObjectSpawnQueue();
+            if (auto* cr = renderer ? renderer->getCharacterRenderer() : nullptr) {
+                cr->processPendingNormalMaps(10);  // higher budget during load screen
+            }
+
+            // Process ALL pending game object spawns (no 1-per-frame cap during load screen).
+            while (!pendingGameObjectSpawns_.empty()) {
+                auto& s = pendingGameObjectSpawns_.front();
+                spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
+                pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
+            }
+
             processPendingTransportDoodads();
             processPendingMount();
             updateQuestMarkers();
@@ -4140,6 +4317,55 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
     setState(AppState::IN_GAME);
 }
 
+// Populates charSectionsCache_ from CharSections.dbc (packed appearance key -> texture name).
+// No-op once built, or while the asset manager / DBC is unavailable.
+void Application::buildCharSectionsCache() {
+    if (charSectionsCacheBuilt_) return;
+    if (!assetManager || !assetManager->isInitialized()) return;
+    auto sectionsDbc = assetManager->loadDBC("CharSections.dbc");
+    if (!sectionsDbc) return;
+    // Column indices: taken from the active DBC layout when one exists, else fixed defaults.
+    const auto* layout = pipeline::getActiveDBCLayout()
+        ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
+    const uint32_t raceCol = layout ? (*layout)["RaceID"] : 1;
+    const uint32_t sexCol = layout ? (*layout)["SexID"] : 2;
+    const uint32_t sectionCol = layout ? (*layout)["BaseSection"] : 3;
+    const uint32_t variationCol = layout ? (*layout)["VariationIndex"] : 4;
+    const uint32_t colorCol = layout ? (*layout)["ColorIndex"] : 5;
+    const uint32_t textureCol = layout ? (*layout)["Texture1"] : 6;
+    for (uint32_t rec = 0; rec < sectionsDbc->getRecordCount(); ++rec) {
+        const uint32_t race = sectionsDbc->getUInt32(rec, raceCol);
+        const uint32_t sex = sectionsDbc->getUInt32(rec, sexCol);
+        const uint32_t section = sectionsDbc->getUInt32(rec, sectionCol);
+        const uint32_t variation = sectionsDbc->getUInt32(rec, variationCol);
+        const uint32_t color = sectionsDbc->getUInt32(rec, colorCol);
+        // We only cache sections 0 (skin), 1 (face), 3 (hair), 4 (underwear)
+        if (!(section == 0 || section == 1 || section == 3 || section == 4)) continue;
+        // Key bits shared by all texture slots: race<<26 | sex(4)<<22 | section(4)<<18 | variation(8)<<10 | color(8)<<2
+        const uint64_t keyBase = (static_cast<uint64_t>(race) << 26) | (static_cast<uint64_t>(sex & 0xF) << 22) |
+                                 (static_cast<uint64_t>(section & 0xF) << 18) | (static_cast<uint64_t>(variation & 0xFF) << 10) |
+                                 (static_cast<uint64_t>(color & 0xFF) << 2);
+        for (int slot = 0; slot < 3; ++slot) {  // slot index fills the low two key bits
+            const std::string texName = sectionsDbc->getString(rec, textureCol + slot);
+            if (!texName.empty()) charSectionsCache_.emplace(keyBase | static_cast<uint64_t>(slot), texName);
+        }
+    }
+    charSectionsCacheBuilt_ = true;
+    LOG_INFO("CharSections cache built: ", charSectionsCache_.size(), " entries");
+}
+
+// Returns the texture cached by buildCharSectionsCache() for this appearance, or "" if none.
+std::string Application::lookupCharSection(uint8_t race, uint8_t sex, uint8_t section,
+                                           uint8_t variation, uint8_t color, int texIndex) const {
+    // Only slots 0..2 are ever cached; other values could spill into neighbouring key bits.
+    if (texIndex < 0 || texIndex > 2) return std::string();
+    const uint64_t key = (static_cast<uint64_t>(race) << 26) | (static_cast<uint64_t>(sex & 0xF) << 22) |
+                         (static_cast<uint64_t>(section & 0xF) << 18) | (static_cast<uint64_t>(variation) << 10) |
+                         (static_cast<uint64_t>(color) << 2) | static_cast<uint64_t>(texIndex);
+    auto it = charSectionsCache_.find(key);
+    return (it != charSectionsCache_.end()) ? it->second : std::string();
+}
+
 void Application::buildCreatureDisplayLookups() {
     if (creatureLookupsBuilt_ || !assetManager || !assetManager->isInitialized()) return;
 
@@ -4479,6 +4705,47 @@ bool Application::getRenderFootZForGuid(uint64_t guid, float& outFootZ) const {
     return renderer->getCharacterRenderer()->getInstanceFootZ(instanceId, outFootZ);
 }
 
+// Synchronously reads m2Path through assetManager, parses it, then merges the optional
+// "00.skin" and per-sequence ".anim" companion files. Returns a default-constructed M2Model on failure.
+pipeline::M2Model Application::loadCreatureM2Sync(const std::string& m2Path) {
+    auto m2Data = assetManager->readFile(m2Path);
+    if (m2Data.empty()) {
+        LOG_WARNING("Failed to read creature M2: ", m2Path);
+        return {};
+    }
+    pipeline::M2Model model = pipeline::M2Loader::load(m2Data);
+    if (model.vertices.empty()) {
+        LOG_WARNING("Failed to parse creature M2: ", m2Path);
+        return {};
+    }
+
+    // m2Path minus its last three characters (presumably the ".m2" suffix); prefix of skin/anim names.
+    const std::string basePath = m2Path.substr(0, m2Path.size() - 3);
+    // Load skin file (only for WotLK M2s - vanilla has embedded skin)
+    if (model.version >= 264) {
+        const std::string skinPath = basePath + "00.skin";
+        auto skinData = assetManager->readFile(skinPath);
+        if (!skinData.empty()) {
+            pipeline::M2Loader::loadSkin(skinData, model);
+        } else {
+            LOG_WARNING("Missing skin file for WotLK creature M2: ", skinPath);
+        }
+    }
+
+    // Load external .anim files for sequences without flag 0x20
+    for (uint32_t si = 0; si < model.sequences.size(); si++) {
+        if (model.sequences[si].flags & 0x20) continue;
+        char animFileName[256];
+        // Cast so the varargs match %u; skip names that do not fit rather than probing a truncated path.
+        const int written = snprintf(animFileName, sizeof(animFileName), "%s%04u-%02u.anim", basePath.c_str(),
+            static_cast<unsigned>(model.sequences[si].id), static_cast<unsigned>(model.sequences[si].variationIndex));
+        if (written < 0 || static_cast<size_t>(written) >= sizeof(animFileName)) continue;
+        auto animData = assetManager->readFileOptional(animFileName);
+        if (!animData.empty()) pipeline::M2Loader::loadAnimFile(m2Data, animData, si, model);
+    }
+    return model;
+}
+
 void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x, float y, float z, float orientation) {
     if (!renderer || !renderer->getCharacterRenderer() || !assetManager) return;
 
@@ -4525,47 +4792,13 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
         // Load model from disk (only once per displayId)
         modelId = nextCreatureModelId_++;
 
-        auto m2Data = assetManager->readFile(m2Path);
-        if (m2Data.empty()) {
-            LOG_WARNING("Failed to read creature M2: ", m2Path);
+        pipeline::M2Model model = loadCreatureM2Sync(m2Path);
+        if (!model.isValid()) {
             nonRenderableCreatureDisplayIds_.insert(displayId);
             creaturePermanentFailureGuids_.insert(guid);
             return;
         }
 
-        pipeline::M2Model model = pipeline::M2Loader::load(m2Data);
-        if (model.vertices.empty()) {
-            LOG_WARNING("Failed to parse creature M2: ", m2Path);
-            nonRenderableCreatureDisplayIds_.insert(displayId);
-            creaturePermanentFailureGuids_.insert(guid);
-            return;
-        }
-
-        // Load skin file (only for WotLK M2s - vanilla has embedded skin)
-        if (model.version >= 264) {
-            std::string skinPath = m2Path.substr(0, m2Path.size() - 3) + "00.skin";
-            auto skinData = assetManager->readFile(skinPath);
-            if (!skinData.empty()) {
-                pipeline::M2Loader::loadSkin(skinData, model);
-            } else {
-                LOG_WARNING("Missing skin file for WotLK creature M2: ", skinPath);
-            }
-        }
-
-        // Load external .anim files for sequences without flag 0x20
-        std::string basePath = m2Path.substr(0, m2Path.size() - 3);
-        for (uint32_t si = 0; si < model.sequences.size(); si++) {
-            if (!(model.sequences[si].flags & 0x20)) {
-                char animFileName[256];
-                snprintf(animFileName, sizeof(animFileName), "%s%04u-%02u.anim",
-                    basePath.c_str(), model.sequences[si].id, model.sequences[si].variationIndex);
-                auto animData = assetManager->readFileOptional(animFileName);
-                if (!animData.empty()) {
-                    pipeline::M2Loader::loadAnimFile(m2Data, animData, si, model);
-                }
-            }
-        }
-
         if (!charRenderer->loadModel(model, modelId)) {
             LOG_WARNING("Failed to load creature model: ", m2Path);
             nonRenderableCreatureDisplayIds_.insert(displayId);
@@ -4576,11 +4809,23 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
         displayIdModelCache_[displayId] = modelId;
     }
 
-    // Apply skin textures from CreatureDisplayInfo.dbc (only for newly loaded models)
+    // Apply skin textures from CreatureDisplayInfo.dbc (only once per displayId model).
+    // Track separately from model cache because async loading may upload the model
+    // before textures are applied.
     auto itDisplayData = displayDataMap_.find(displayId);
-    if (!modelCached && itDisplayData != displayDataMap_.end()) {
+    bool needsTextures = (displayIdTexturesApplied_.find(displayId) == displayIdTexturesApplied_.end());
+    if (needsTextures && itDisplayData != displayDataMap_.end()) {
+        auto texStart = std::chrono::steady_clock::now();
+        displayIdTexturesApplied_.insert(displayId);
         const auto& dispData = itDisplayData->second;
 
+        // Use pre-decoded textures from async creature load (if available)
+        auto itPreDec = displayIdPredecodedTextures_.find(displayId);
+        bool hasPreDec = (itPreDec != displayIdPredecodedTextures_.end());
+        if (hasPreDec) {
+            charRenderer->setPredecodedBLPCache(&itPreDec->second);
+        }
+
         // Get model directory for texture path construction
         std::string modelDir;
         size_t lastSlash = m2Path.find_last_of("\\/");
@@ -4613,336 +4858,217 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
                 LOG_DEBUG("  Found humanoid extra: raceId=", (int)extra.raceId, " sexId=", (int)extra.sexId,
                           " hairStyle=", (int)extra.hairStyleId, " hairColor=", (int)extra.hairColorId,
                           " bakeName='", extra.bakeName, "'");
-                LOG_DEBUG("NPC equip: chest=", extra.equipDisplayId[3],
-                          " legs=", extra.equipDisplayId[5],
-                          " feet=", extra.equipDisplayId[6],
-                          " hands=", extra.equipDisplayId[8],
-                          " bake='", extra.bakeName, "'");
 
-                // Build equipment texture region layers from NPC equipment display IDs
-                // (texture-only compositing — no geoset changes to avoid invisibility bugs)
-                std::vector<std::pair<int, std::string>> npcRegionLayers;
-                std::string npcCapeTexturePath;
-                auto npcItemDisplayDbc = assetManager->loadDBC("ItemDisplayInfo.dbc");
-                    if (npcItemDisplayDbc) {
-                        static const char* npcComponentDirs[] = {
-                            "ArmUpperTexture", "ArmLowerTexture", "HandTexture",
-                            "TorsoUpperTexture", "TorsoLowerTexture",
-                            "LegUpperTexture", "LegLowerTexture", "FootTexture",
-                        };
-                        const auto* idiL = pipeline::getActiveDBCLayout()
-                            ? pipeline::getActiveDBCLayout()->getLayout("ItemDisplayInfo") : nullptr;
-                        // Texture component region fields (8 regions: ArmUpper..Foot)
-                        // Binary DBC (23 fields) has textures at 14+
-                        const uint32_t texRegionFields[8] = {
-                            idiL ? (*idiL)["TextureArmUpper"]  : 14u,
-                            idiL ? (*idiL)["TextureArmLower"]  : 15u,
-                            idiL ? (*idiL)["TextureHand"]      : 16u,
-                            idiL ? (*idiL)["TextureTorsoUpper"]: 17u,
-                            idiL ? (*idiL)["TextureTorsoLower"]: 18u,
-                            idiL ? (*idiL)["TextureLegUpper"]  : 19u,
-                            idiL ? (*idiL)["TextureLegLower"]  : 20u,
-                            idiL ? (*idiL)["TextureFoot"]      : 21u,
-                        };
-                        const bool npcIsFemale = (extra.sexId == 1);
-                        const bool npcHasArmArmor = (extra.equipDisplayId[7] != 0 || extra.equipDisplayId[8] != 0);
-
-                        auto regionAllowedForNpcSlot = [](int eqSlot, int region) -> bool {
-                            // Regions: 0 ArmUpper, 1 ArmLower, 2 Hand, 3 TorsoUpper, 4 TorsoLower,
-                            //          5 LegUpper, 6 LegLower, 7 Foot
-                            switch (eqSlot) {
-                                case 2: // shirt
-                                case 3: // chest
-                                    return region <= 4;
-                                case 4: // belt
-                                    // TODO(#npc-belt-region): belt torso-lower overlay can
-                                    // cut out male abdomen on some humanoid NPCs.
-                                    // Keep disabled until region compositing is fixed.
-                                    return false;
-                                case 5: // legs
-                                    return region == 5 || region == 6;
-                                case 6: // feet
-                                    return region == 7;
-                                case 7: // wrist
-                                    // Bracer overlays on NPCs often produce bad arm artifacts.
-                                    // Keep disabled until slot-accurate arm compositing is implemented.
-                                    return false;
-                                case 8: // hands
-                                    // Keep glove textures to hand region only; arm regions from glove
-                                    // items can produce furry/looping forearm artifacts on some NPCs.
-                                    return region == 2;
-                                case 9: // tabard
-                                    return region == 3 || region == 4;
-                                default:
-                                    return false;
-                            }
-                        };
-                        auto regionAllowedForNpcSlotCtx = [&](int eqSlot, int region) -> bool {
-                            // Shirt (slot 2) without arm armor: restrict to torso only
-                            // to avoid bare-skin shirt textures bleeding onto arms.
-                            // Chest (slot 3) always paints arms — plate/mail chest armor
-                            // must cover the full upper body even without separate gloves.
-                            if (eqSlot == 2 && !npcHasArmArmor) {
-                                return (region == 3 || region == 4);
-                            }
-                            return regionAllowedForNpcSlot(eqSlot, region);
-                        };
-
-                        // Iterate all 11 NPC equipment slots; use slot-aware region filtering
-                        for (int eqSlot = 0; eqSlot < 11; eqSlot++) {
-                            uint32_t did = extra.equipDisplayId[eqSlot];
-                            if (did == 0) continue;
-                            int32_t recIdx = npcItemDisplayDbc->findRecordById(did);
-                            if (recIdx < 0) continue;
-
-                            for (int region = 0; region < 8; region++) {
-                                if (!regionAllowedForNpcSlotCtx(eqSlot, region)) continue;
-                                std::string texName = npcItemDisplayDbc->getString(
-                                    static_cast<uint32_t>(recIdx), texRegionFields[region]);
-                                if (texName.empty()) continue;
-
-                                std::string base = "Item\\TextureComponents\\" +
-                                    std::string(npcComponentDirs[region]) + "\\" + texName;
-                                std::string genderPath = base + (npcIsFemale ? "_F.blp" : "_M.blp");
-                                std::string unisexPath = base + "_U.blp";
-                                std::string basePath = base + ".blp";
-                                std::string fullPath;
-                                if (assetManager->fileExists(genderPath)) fullPath = genderPath;
-                                else if (assetManager->fileExists(unisexPath)) fullPath = unisexPath;
-                                else if (assetManager->fileExists(basePath)) fullPath = basePath;
-                                else continue;
-
-                                npcRegionLayers.emplace_back(region, fullPath);
-                            }
-                        }
-
-                        // Cloak/cape texture is separate from the body atlas.
-                        // Read equipped cape displayId (slot 10) and resolve the best cape texture path.
-                        uint32_t capeDisplayId = extra.equipDisplayId[10];
-                        if (capeDisplayId != 0) {
-                            int32_t capeRecIdx = npcItemDisplayDbc->findRecordById(capeDisplayId);
-                            if (capeRecIdx >= 0) {
-                                const uint32_t leftTexField = idiL ? (*idiL)["LeftModelTexture"] : 3u;
-                                const uint32_t rightTexField = leftTexField + 1u; // modelTexture_2 in 3.3.5a
-
-                                std::vector<std::string> capeNames;
-                                auto addName = [&](const std::string& n) {
-                                    if (!n.empty() && std::find(capeNames.begin(), capeNames.end(), n) == capeNames.end()) {
-                                        capeNames.push_back(n);
-                                    }
-                                };
-                                std::string leftName = npcItemDisplayDbc->getString(
-                                    static_cast<uint32_t>(capeRecIdx), leftTexField);
-                                std::string rightName = npcItemDisplayDbc->getString(
-                                    static_cast<uint32_t>(capeRecIdx), rightTexField);
-                                // Female models often prefer modelTexture_2.
-                                if (npcIsFemale) {
-                                    addName(rightName);
-                                    addName(leftName);
-                                } else {
-                                    addName(leftName);
-                                    addName(rightName);
-                                }
-
-                                auto hasBlpExt = [](const std::string& p) {
-                                    if (p.size() < 4) return false;
-                                    std::string ext = p.substr(p.size() - 4);
-                                    std::transform(ext.begin(), ext.end(), ext.begin(),
-                                                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
-                                    return ext == ".blp";
-                                };
-
-                                std::vector<std::string> capeCandidates;
-                                auto addCapeCandidate = [&](const std::string& p) {
-                                    if (p.empty()) return;
-                                    if (std::find(capeCandidates.begin(), capeCandidates.end(), p) == capeCandidates.end()) {
-                                        capeCandidates.push_back(p);
-                                    }
-                                };
-
-                                for (const auto& nameRaw : capeNames) {
-                                    std::string name = nameRaw;
-                                    std::replace(name.begin(), name.end(), '/', '\\');
-                                    bool hasDir = (name.find('\\') != std::string::npos);
-                                    bool hasExt = hasBlpExt(name);
-                                    if (hasDir) {
-                                        addCapeCandidate(name);
-                                        if (!hasExt) addCapeCandidate(name + ".blp");
-                                    } else {
-                                        std::string base = "Item\\ObjectComponents\\Cape\\" + name;
-                                        addCapeCandidate(base);
-                                        if (!hasExt) addCapeCandidate(base + ".blp");
-                                        // Some data sets use gender/unisex suffix variants.
-                                        addCapeCandidate(base + (npcIsFemale ? "_F.blp" : "_M.blp"));
-                                        addCapeCandidate(base + "_U.blp");
-                                    }
-                                }
-
-                                for (const auto& candidate : capeCandidates) {
-                                    if (assetManager->fileExists(candidate)) {
-                                        npcCapeTexturePath = candidate;
-                                        break;
-                                    }
-                                }
-                            }
-                        }
-                    }
-
-                // Use baked texture for body skin (types 1, 2)
-                // Type 6 (hair) needs its own texture from CharSections.dbc
-                const bool allowNpcRegionComposite = true;
-                rendering::VkTexture* bakedSkinTex = nullptr;
-                if (!extra.bakeName.empty()) {
-                    std::string bakePath = "Textures\\BakedNpcTextures\\" + extra.bakeName;
-                    rendering::VkTexture* finalTex = charRenderer->loadTexture(bakePath);
-                    bakedSkinTex = finalTex;
-                    if (finalTex && modelData) {
-                        for (size_t ti = 0; ti < modelData->textures.size(); ti++) {
-                            uint32_t texType = modelData->textures[ti].type;
-                            if (texType == 1) {
-                                charRenderer->setModelTexture(modelId, static_cast<uint32_t>(ti), finalTex);
-                                hasHumanoidTexture = true;
-                                LOG_DEBUG("NPC baked type1 slot=", ti, " modelId=", modelId,
-                                            " tex=", bakePath);
-                            }
-                        }
-                    }
-                }
-                // Fallback: if baked texture failed or bakeName was empty, build from CharSections
-                if (!hasHumanoidTexture) {
-                    LOG_DEBUG("  Trying CharSections fallback for NPC skin");
-
-                    // Build skin texture from CharSections.dbc (same as player character)
-                    auto csFallbackDbc = assetManager->loadDBC("CharSections.dbc");
-                    if (csFallbackDbc) {
-                        const auto* csFL = pipeline::getActiveDBCLayout()
-                            ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
-                        uint32_t npcRace = static_cast<uint32_t>(extra.raceId);
-                        uint32_t npcSex = static_cast<uint32_t>(extra.sexId);
-                        uint32_t npcSkin = static_cast<uint32_t>(extra.skinId);
-                        uint32_t npcFace = static_cast<uint32_t>(extra.faceId);
-                        std::string npcSkinPath, npcFaceLower, npcFaceUpper;
-                        std::vector<std::string> npcUnderwear;
-
-                        for (uint32_t r = 0; r < csFallbackDbc->getRecordCount(); r++) {
-                            uint32_t rId = csFallbackDbc->getUInt32(r, csFL ? (*csFL)["RaceID"] : 1);
-                            uint32_t sId = csFallbackDbc->getUInt32(r, csFL ? (*csFL)["SexID"] : 2);
-                            if (rId != npcRace || sId != npcSex) continue;
-
-                            uint32_t section = csFallbackDbc->getUInt32(r, csFL ? (*csFL)["BaseSection"] : 3);
-                            uint32_t variation = csFallbackDbc->getUInt32(r, csFL ? (*csFL)["VariationIndex"] : 8);
-                            uint32_t color = csFallbackDbc->getUInt32(r, csFL ? (*csFL)["ColorIndex"] : 9);
-                            uint32_t tex1F = csFL ? (*csFL)["Texture1"] : 4;
-
-                            // Section 0 = skin: match colorIndex = skinId
-                            if (section == 0 && npcSkinPath.empty() && color == npcSkin) {
-                                npcSkinPath = csFallbackDbc->getString(r, tex1F);
-                            }
-                            // Section 1 = face: match variation=faceId, color=skinId
-                            else if (section == 1 && npcFaceLower.empty() &&
-                                     variation == npcFace && color == npcSkin) {
-                                npcFaceLower = csFallbackDbc->getString(r, tex1F);
-                                npcFaceUpper = csFallbackDbc->getString(r, tex1F + 1);
-                            }
-                            // Section 4 = underwear: match color=skinId
-                            else if (section == 4 && npcUnderwear.empty() && color == npcSkin) {
-                                for (uint32_t f = tex1F; f <= tex1F + 2; f++) {
-                                    std::string tex = csFallbackDbc->getString(r, f);
-                                    if (!tex.empty()) npcUnderwear.push_back(tex);
-                                }
-                            }
-                        }
-
-                        LOG_DEBUG("NPC CharSections lookup: race=", npcRace, " sex=", npcSex,
-                                    " skin=", npcSkin, " face=", npcFace,
-                                    " skinPath='", npcSkinPath, "' faceLower='", npcFaceLower, "'");
-                        if (!npcSkinPath.empty()) {
-                            // Composite skin + face + underwear
-                            std::vector<std::string> skinLayers;
-                            skinLayers.push_back(npcSkinPath);
-                            if (!npcFaceLower.empty()) skinLayers.push_back(npcFaceLower);
-                            if (!npcFaceUpper.empty()) skinLayers.push_back(npcFaceUpper);
-                            for (const auto& uw : npcUnderwear) skinLayers.push_back(uw);
-
-                            rendering::VkTexture* npcSkinTex = nullptr;
-                            if (allowNpcRegionComposite && !npcRegionLayers.empty()) {
-                                npcSkinTex = charRenderer->compositeWithRegions(npcSkinPath,
-                                    std::vector<std::string>(skinLayers.begin() + 1, skinLayers.end()),
-                                    npcRegionLayers);
-                            } else if (skinLayers.size() > 1) {
-                                npcSkinTex = charRenderer->compositeTextures(skinLayers);
-                            } else {
-                                npcSkinTex = charRenderer->loadTexture(npcSkinPath);
-                            }
-
-                            if (npcSkinTex && modelData) {
-                                int slotsSet = 0;
-                                for (size_t ti = 0; ti < modelData->textures.size(); ti++) {
-                                    uint32_t texType = modelData->textures[ti].type;
-                                    if (texType == 1 || texType == 11 || texType == 12 || texType == 13) {
-                                        charRenderer->setModelTexture(modelId, static_cast<uint32_t>(ti), npcSkinTex);
-                                        hasHumanoidTexture = true;
-                                        slotsSet++;
-                                    }
-                                }
-                                LOG_DEBUG("NPC CharSections: skin='", npcSkinPath, "' regions=",
-                                            npcRegionLayers.size(), " applied=", hasHumanoidTexture,
-                                            " slots=", slotsSet,
-                                            " modelId=", modelId, " texCount=", modelData->textures.size());
-                            }
-                        }
+                // Collect model texture slot indices (types 1/11/12/13 = skin, type 6 = hair)
+                std::vector<uint32_t> skinSlots, hairSlots;
+                if (modelData) {
+                    for (size_t ti = 0; ti < modelData->textures.size(); ti++) {
+                        uint32_t texType = modelData->textures[ti].type;
+                        if (texType == 1 || texType == 11 || texType == 12 || texType == 13)
+                            skinSlots.push_back(static_cast<uint32_t>(ti));
+                        if (texType == 6)
+                            hairSlots.push_back(static_cast<uint32_t>(ti));
                     }
                 }
 
-                // Load hair texture from CharSections.dbc (section 3)
-                auto charSectionsDbc = assetManager->loadDBC("CharSections.dbc");
-                if (charSectionsDbc) {
-                    const auto* csL2 = pipeline::getActiveDBCLayout() ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
-                    uint32_t targetRace = static_cast<uint32_t>(extra.raceId);
-                    uint32_t targetSex = static_cast<uint32_t>(extra.sexId);
-                    std::string hairTexPath;
+                // Copy extra data for the async task (avoid dangling reference)
+                HumanoidDisplayExtra extraCopy = extra;
 
-                    for (uint32_t r = 0; r < charSectionsDbc->getRecordCount(); r++) {
-                        uint32_t raceId = charSectionsDbc->getUInt32(r, csL2 ? (*csL2)["RaceID"] : 1);
-                        uint32_t sexId = charSectionsDbc->getUInt32(r, csL2 ? (*csL2)["SexID"] : 2);
-                        uint32_t section = charSectionsDbc->getUInt32(r, csL2 ? (*csL2)["BaseSection"] : 3);
-                        uint32_t variation = charSectionsDbc->getUInt32(r, csL2 ? (*csL2)["VariationIndex"] : 4);
-                        uint32_t colorIdx = charSectionsDbc->getUInt32(r, csL2 ? (*csL2)["ColorIndex"] : 5);
+                // Launch async task: ALL DBC lookups, path resolution, and BLP pre-decode
+                // happen on a background thread. Only GPU texture upload runs on main thread
+                // (in processAsyncNpcCompositeResults).
+                auto* am = assetManager.get();
+                AsyncNpcCompositeLoad load;
+                load.future = std::async(std::launch::async,
+                    [am, extraCopy, skinSlots = std::move(skinSlots),
+                     hairSlots = std::move(hairSlots), modelId, displayId]() mutable -> PreparedNpcComposite {
+                        PreparedNpcComposite result;
+                        DeferredNpcComposite& def = result.info;
+                        def.modelId = modelId;
+                        def.displayId = displayId;
+                        def.skinTextureSlots = std::move(skinSlots);
+                        def.hairTextureSlots = std::move(hairSlots);
 
-                        if (raceId != targetRace || sexId != targetSex) continue;
-                        if (section != 3) continue;  // Section 3 = hair
-                        if (variation != static_cast<uint32_t>(extra.hairStyleId)) continue;
-                        if (colorIdx != static_cast<uint32_t>(extra.hairColorId)) continue;
+                        std::vector<std::string> allPaths;  // paths to pre-decode
 
-                        hairTexPath = charSectionsDbc->getString(r, csL2 ? (*csL2)["Texture1"] : 6);
-                        break;
-                    }
+                        // --- Baked skin texture ---
+                        if (!extraCopy.bakeName.empty()) {
+                            def.bakedSkinPath = "Textures\\BakedNpcTextures\\" + extraCopy.bakeName;
+                            def.hasBakedSkin = true;
+                            allPaths.push_back(def.bakedSkinPath);
+                        }
 
-                    if (!hairTexPath.empty()) {
-                        rendering::VkTexture* hairTex = charRenderer->loadTexture(hairTexPath);
-                        rendering::VkTexture* whTex = charRenderer->loadTexture("");
-                        if (hairTex && hairTex != whTex && modelData) {
-                            for (size_t ti = 0; ti < modelData->textures.size(); ti++) {
-                                if (modelData->textures[ti].type == 6) {
-                                    charRenderer->setModelTexture(modelId, static_cast<uint32_t>(ti), hairTex);
+                        // --- CharSections fallback (skin/face/underwear) ---
+                        if (!def.hasBakedSkin) {
+                            auto csDbc = am->loadDBC("CharSections.dbc");
+                            if (csDbc) {
+                                const auto* csL = pipeline::getActiveDBCLayout()
+                                    ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
+                                uint32_t npcRace = static_cast<uint32_t>(extraCopy.raceId);
+                                uint32_t npcSex = static_cast<uint32_t>(extraCopy.sexId);
+                                uint32_t npcSkin = static_cast<uint32_t>(extraCopy.skinId);
+                                uint32_t npcFace = static_cast<uint32_t>(extraCopy.faceId);
+                                std::string npcFaceLower, npcFaceUpper;
+                                std::vector<std::string> npcUnderwear;
+
+                                for (uint32_t r = 0; r < csDbc->getRecordCount(); r++) {
+                                    uint32_t rId = csDbc->getUInt32(r, csL ? (*csL)["RaceID"] : 1);
+                                    uint32_t sId = csDbc->getUInt32(r, csL ? (*csL)["SexID"] : 2);
+                                    if (rId != npcRace || sId != npcSex) continue;
+
+                                    uint32_t section = csDbc->getUInt32(r, csL ? (*csL)["BaseSection"] : 3);
+                                    uint32_t variation = csDbc->getUInt32(r, csL ? (*csL)["VariationIndex"] : 4);
+                                    uint32_t color = csDbc->getUInt32(r, csL ? (*csL)["ColorIndex"] : 5);
+                                    uint32_t tex1F = csL ? (*csL)["Texture1"] : 6;
+
+                                    if (section == 0 && def.basePath.empty() && color == npcSkin) {
+                                        def.basePath = csDbc->getString(r, tex1F);
+                                    } else if (section == 1 && npcFaceLower.empty() &&
+                                               variation == npcFace && color == npcSkin) {
+                                        npcFaceLower = csDbc->getString(r, tex1F);
+                                        npcFaceUpper = csDbc->getString(r, tex1F + 1);
+                                    } else if (section == 4 && npcUnderwear.empty() && color == npcSkin) {
+                                        for (uint32_t f = tex1F; f <= tex1F + 2; f++) {
+                                            std::string tex = csDbc->getString(r, f);
+                                            if (!tex.empty()) npcUnderwear.push_back(tex);
+                                        }
+                                    }
+                                }
+
+                                if (!def.basePath.empty()) {
+                                    allPaths.push_back(def.basePath);
+                                    if (!npcFaceLower.empty()) { def.overlayPaths.push_back(npcFaceLower); allPaths.push_back(npcFaceLower); }
+                                    if (!npcFaceUpper.empty()) { def.overlayPaths.push_back(npcFaceUpper); allPaths.push_back(npcFaceUpper); }
+                                    for (const auto& uw : npcUnderwear) { def.overlayPaths.push_back(uw); allPaths.push_back(uw); }
                                 }
                             }
                         }
-                    }
-                    // Bald NPCs (hairStyle=0 or no CharSections match): set type-6 to
-                    // the skin/baked texture so the scalp cap renders with skin color.
-                    if (hairTexPath.empty() && bakedSkinTex && modelData) {
-                        for (size_t ti = 0; ti < modelData->textures.size(); ti++) {
-                            if (modelData->textures[ti].type == 6) {
-                                charRenderer->setModelTexture(modelId, static_cast<uint32_t>(ti), bakedSkinTex);
+
+                        // --- Equipment region layers (ItemDisplayInfo DBC) ---
+                        auto idiDbc = am->loadDBC("ItemDisplayInfo.dbc");
+                        if (idiDbc) {
+                            static const char* componentDirs[] = {
+                                "ArmUpperTexture", "ArmLowerTexture", "HandTexture",
+                                "TorsoUpperTexture", "TorsoLowerTexture",
+                                "LegUpperTexture", "LegLowerTexture", "FootTexture",
+                            };
+                            const auto* idiL = pipeline::getActiveDBCLayout()
+                                ? pipeline::getActiveDBCLayout()->getLayout("ItemDisplayInfo") : nullptr;
+                            const uint32_t texRegionFields[8] = {
+                                idiL ? (*idiL)["TextureArmUpper"]  : 14u,
+                                idiL ? (*idiL)["TextureArmLower"]  : 15u,
+                                idiL ? (*idiL)["TextureHand"]      : 16u,
+                                idiL ? (*idiL)["TextureTorsoUpper"]: 17u,
+                                idiL ? (*idiL)["TextureTorsoLower"]: 18u,
+                                idiL ? (*idiL)["TextureLegUpper"]  : 19u,
+                                idiL ? (*idiL)["TextureLegLower"]  : 20u,
+                                idiL ? (*idiL)["TextureFoot"]      : 21u,
+                            };
+                            const bool npcIsFemale = (extraCopy.sexId == 1);
+                            const bool npcHasArmArmor = (extraCopy.equipDisplayId[7] != 0 || extraCopy.equipDisplayId[8] != 0);
+
+                            auto regionAllowedForNpcSlot = [](int eqSlot, int region) -> bool {
+                                switch (eqSlot) {
+                                    case 2: case 3: return region <= 4;
+                                    case 4: return false;
+                                    case 5: return region == 5 || region == 6;
+                                    case 6: return region == 7;
+                                    case 7: return false;
+                                    case 8: return region == 2;
+                                    case 9: return region == 3 || region == 4;
+                                    default: return false;
+                                }
+                            };
+
+                            for (int eqSlot = 0; eqSlot < 11; eqSlot++) {
+                                uint32_t did = extraCopy.equipDisplayId[eqSlot];
+                                if (did == 0) continue;
+                                int32_t recIdx = idiDbc->findRecordById(did);
+                                if (recIdx < 0) continue;
+
+                                for (int region = 0; region < 8; region++) {
+                                    if (!regionAllowedForNpcSlot(eqSlot, region)) continue;
+                                    if (eqSlot == 2 && !npcHasArmArmor && !(region == 3 || region == 4)) continue;
+                                    std::string texName = idiDbc->getString(
+                                        static_cast<uint32_t>(recIdx), texRegionFields[region]);
+                                    if (texName.empty()) continue;
+
+                                    std::string base = "Item\\TextureComponents\\" +
+                                        std::string(componentDirs[region]) + "\\" + texName;
+                                    std::string genderPath = base + (npcIsFemale ? "_F.blp" : "_M.blp");
+                                    std::string unisexPath = base + "_U.blp";
+                                    std::string basePath = base + ".blp";
+                                    std::string fullPath;
+                                    if (am->fileExists(genderPath)) fullPath = genderPath;
+                                    else if (am->fileExists(unisexPath)) fullPath = unisexPath;
+                                    else if (am->fileExists(basePath)) fullPath = basePath;
+                                    else continue;
+
+                                    def.regionLayers.emplace_back(region, fullPath);
+                                    allPaths.push_back(fullPath);
+                                }
                             }
                         }
-                    }
-                }
 
-                // Do not apply cape textures at model scope here. Type-2 texture slots are
-                // shared per model and this can leak cape textures/white fallbacks onto
-                // unrelated humanoid NPCs that use the same modelId.
+                        // Determine compositing mode
+                        if (!def.basePath.empty()) {
+                            bool needsComposite = !def.overlayPaths.empty() || !def.regionLayers.empty();
+                            if (needsComposite && !def.skinTextureSlots.empty()) {
+                                def.hasComposite = true;
+                            } else if (!def.skinTextureSlots.empty()) {
+                                def.hasSimpleSkin = true;
+                            }
+                        }
+
+                        // --- Hair texture from CharSections (section 3) ---
+                        {
+                            auto csDbc = am->loadDBC("CharSections.dbc");
+                            if (csDbc) {
+                                const auto* csL = pipeline::getActiveDBCLayout()
+                                    ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
+                                uint32_t targetRace = static_cast<uint32_t>(extraCopy.raceId);
+                                uint32_t targetSex = static_cast<uint32_t>(extraCopy.sexId);
+
+                                for (uint32_t r = 0; r < csDbc->getRecordCount(); r++) {
+                                    uint32_t raceId = csDbc->getUInt32(r, csL ? (*csL)["RaceID"] : 1);
+                                    uint32_t sexId = csDbc->getUInt32(r, csL ? (*csL)["SexID"] : 2);
+                                    if (raceId != targetRace || sexId != targetSex) continue;
+                                    uint32_t section = csDbc->getUInt32(r, csL ? (*csL)["BaseSection"] : 3);
+                                    if (section != 3) continue;
+                                    uint32_t variation = csDbc->getUInt32(r, csL ? (*csL)["VariationIndex"] : 4);
+                                    uint32_t colorIdx = csDbc->getUInt32(r, csL ? (*csL)["ColorIndex"] : 5);
+                                    if (variation != static_cast<uint32_t>(extraCopy.hairStyleId)) continue;
+                                    if (colorIdx != static_cast<uint32_t>(extraCopy.hairColorId)) continue;
+                                    def.hairTexturePath = csDbc->getString(r, csL ? (*csL)["Texture1"] : 6);
+                                    break;
+                                }
+
+                                if (!def.hairTexturePath.empty()) {
+                                    allPaths.push_back(def.hairTexturePath);
+                                } else if (def.hasBakedSkin && !def.hairTextureSlots.empty()) {
+                                    def.useBakedForHair = true;
+                                    // bakedSkinPath already in allPaths
+                                }
+                            }
+                        }
+
+                        // --- Pre-decode all BLP textures on this background thread ---
+                        for (const auto& path : allPaths) {
+                            std::string key = path;
+                            std::replace(key.begin(), key.end(), '/', '\\');
+                            std::transform(key.begin(), key.end(), key.begin(),
+                                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                            if (result.predecodedTextures.count(key)) continue;
+                            auto blp = am->loadTexture(key);
+                            if (blp.isValid()) {
+                                result.predecodedTextures[key] = std::move(blp);
+                            }
+                        }
+
+                        return result;
+                    });
+                asyncNpcCompositeLoads_.push_back(std::move(load));
+                hasHumanoidTexture = true;  // skip non-humanoid skin block
             } else {
                 LOG_WARNING("  extraDisplayId ", dispData.extraDisplayId, " not found in humanoidExtraMap");
             }
@@ -5021,6 +5147,18 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
                 }
             }
         }
+
+        // Clear pre-decoded cache after applying all display textures
+        charRenderer->setPredecodedBLPCache(nullptr);
+        displayIdPredecodedTextures_.erase(displayId);
+        {
+            auto texEnd = std::chrono::steady_clock::now();
+            float texMs = std::chrono::duration<float, std::milli>(texEnd - texStart).count();
+            if (texMs > 3.0f) {
+                LOG_WARNING("spawnCreature texture setup took ", texMs, "ms displayId=", displayId,
+                            " hasPreDec=", hasPreDec, " extra=", dispData.extraDisplayId);
+            }
+        }
     }
 
     // Use the entity's latest server-authoritative position rather than the stale spawn
@@ -5058,7 +5196,9 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
 
     // Per-instance hair/skin texture overrides — runs for ALL NPCs (including cached models)
     // so that each NPC gets its own hair/skin color regardless of model sharing.
+    // Uses pre-built CharSections cache (O(1) lookup instead of O(N) DBC scan).
     {
+        if (!charSectionsCacheBuilt_) buildCharSectionsCache();
         auto itDD = displayDataMap_.find(displayId);
         if (itDD != displayDataMap_.end() && itDD->second.extraDisplayId != 0) {
             auto itExtra2 = humanoidExtraMap_.find(itDD->second.extraDisplayId);
@@ -5066,37 +5206,19 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
                 const auto& extra = itExtra2->second;
                 const auto* md = charRenderer->getModelData(modelId);
                 if (md) {
-                    auto charSectionsDbc2 = assetManager->loadDBC("CharSections.dbc");
-                    if (charSectionsDbc2) {
-                        const auto* csL = pipeline::getActiveDBCLayout()
-                            ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
-                        uint32_t tgtRace = static_cast<uint32_t>(extra.raceId);
-                        uint32_t tgtSex = static_cast<uint32_t>(extra.sexId);
-
-                        // Look up hair texture (section 3)
+                        // Look up hair texture (section 3) via cache
                         rendering::VkTexture* whiteTex = charRenderer->loadTexture("");
-                        for (uint32_t r = 0; r < charSectionsDbc2->getRecordCount(); r++) {
-                            uint32_t rId = charSectionsDbc2->getUInt32(r, csL ? (*csL)["RaceID"] : 1);
-                            uint32_t sId = charSectionsDbc2->getUInt32(r, csL ? (*csL)["SexID"] : 2);
-                            if (rId != tgtRace || sId != tgtSex) continue;
-                            uint32_t sec = charSectionsDbc2->getUInt32(r, csL ? (*csL)["BaseSection"] : 3);
-                            if (sec != 3) continue;
-                            uint32_t var = charSectionsDbc2->getUInt32(r, csL ? (*csL)["VariationIndex"] : 4);
-                            uint32_t col = charSectionsDbc2->getUInt32(r, csL ? (*csL)["ColorIndex"] : 5);
-                            if (var != static_cast<uint32_t>(extra.hairStyleId)) continue;
-                            if (col != static_cast<uint32_t>(extra.hairColorId)) continue;
-                            std::string hairPath = charSectionsDbc2->getString(r, csL ? (*csL)["Texture1"] : 6);
-                            if (!hairPath.empty()) {
-                                rendering::VkTexture* hairTex = charRenderer->loadTexture(hairPath);
-                                if (hairTex && hairTex != whiteTex) {
-                                    for (size_t ti = 0; ti < md->textures.size(); ti++) {
-                                        if (md->textures[ti].type == 6) {
-                                            charRenderer->setTextureSlotOverride(instanceId, static_cast<uint16_t>(ti), hairTex);
-                                        }
+                        std::string hairPath = lookupCharSection(
+                            extra.raceId, extra.sexId, 3, extra.hairStyleId, extra.hairColorId, 0);
+                        if (!hairPath.empty()) {
+                            rendering::VkTexture* hairTex = charRenderer->loadTexture(hairPath);
+                            if (hairTex && hairTex != whiteTex) {
+                                for (size_t ti = 0; ti < md->textures.size(); ti++) {
+                                    if (md->textures[ti].type == 6) {
+                                        charRenderer->setTextureSlotOverride(instanceId, static_cast<uint16_t>(ti), hairTex);
                                     }
                                 }
                             }
-                            break;
                         }
 
                         // Look up skin texture (section 0) for per-instance skin color.
@@ -5108,30 +5230,20 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
                                 if (extra.equipDisplayId[s] != 0) hasEquipOrBake = true;
                         }
                         if (!hasEquipOrBake) {
-                            for (uint32_t r = 0; r < charSectionsDbc2->getRecordCount(); r++) {
-                                uint32_t rId = charSectionsDbc2->getUInt32(r, csL ? (*csL)["RaceID"] : 1);
-                                uint32_t sId = charSectionsDbc2->getUInt32(r, csL ? (*csL)["SexID"] : 2);
-                                if (rId != tgtRace || sId != tgtSex) continue;
-                                uint32_t sec = charSectionsDbc2->getUInt32(r, csL ? (*csL)["BaseSection"] : 3);
-                                if (sec != 0) continue;
-                                uint32_t col = charSectionsDbc2->getUInt32(r, csL ? (*csL)["ColorIndex"] : 5);
-                                if (col != static_cast<uint32_t>(extra.skinId)) continue;
-                                std::string skinPath = charSectionsDbc2->getString(r, csL ? (*csL)["Texture1"] : 6);
-                                if (!skinPath.empty()) {
-                                    rendering::VkTexture* skinTex = charRenderer->loadTexture(skinPath);
-                                    if (skinTex) {
-                                        for (size_t ti = 0; ti < md->textures.size(); ti++) {
-                                            uint32_t tt = md->textures[ti].type;
-                                            if (tt == 1 || tt == 11) {
-                                                charRenderer->setTextureSlotOverride(instanceId, static_cast<uint16_t>(ti), skinTex);
-                                            }
+                            std::string skinPath = lookupCharSection(
+                                extra.raceId, extra.sexId, 0, 0, extra.skinId, 0);
+                            if (!skinPath.empty()) {
+                                rendering::VkTexture* skinTex = charRenderer->loadTexture(skinPath);
+                                if (skinTex) {
+                                    for (size_t ti = 0; ti < md->textures.size(); ti++) {
+                                        uint32_t tt = md->textures[ti].type;
+                                        if (tt == 1 || tt == 11) {
+                                            charRenderer->setTextureSlotOverride(instanceId, static_cast<uint16_t>(ti), skinTex);
                                         }
                                     }
                                 }
-                                break;
                             }
                         }
-                    }
                 }
             }
         }
@@ -6692,7 +6804,184 @@ void Application::spawnOnlineGameObject(uint64_t guid, uint32_t entry, uint32_t
              " displayId=", displayId, " at (", x, ", ", y, ", ", z, ")");
 }
 
+void Application::processAsyncCreatureResults() {
+    // Finalize async creature model loads whose futures are ready: hand the parsed model to the character
+    // renderer, record it in displayIdModelCache_, and re-queue the creature on the regular spawn queue.
+    static constexpr int kMaxModelUploadsPerFrame = 1;  // successful loadModel calls allowed per invocation
+    int modelUploads = 0;
+
+    for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) {
+        if (!it->future.valid() ||
+            it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
+            ++it;
+            continue;
+        }
+
+        // Upload budget for this call is spent: stop at the first further ready future and leave it
+        // (and everything after it) unconsumed so a later call picks it up.
+        if (modelUploads >= kMaxModelUploadsPerFrame) {
+            // The displayId and failure flags are only readable after future.get(), so failed results
+            // cannot be told apart from uploads here — they wait behind the budget too.
+            break;
+        }
+
+        auto result = it->future.get();
+        it = asyncCreatureLoads_.erase(it);
+
+        if (result.permanent_failure) {  // loader gave up: blacklist both the displayId and the guid
+            nonRenderableCreatureDisplayIds_.insert(result.displayId);
+            creaturePermanentFailureGuids_.insert(result.guid);
+            pendingCreatureSpawnGuids_.erase(result.guid);
+            creatureSpawnRetryCounts_.erase(result.guid);
+            continue;
+        }
+        if (!result.valid || !result.model) {  // no usable model, not flagged permanent: only clear pending state
+            pendingCreatureSpawnGuids_.erase(result.guid);
+            creatureSpawnRetryCounts_.erase(result.guid);
+            continue;
+        }
+
+        // A parsed model is available; the character renderer is required from here on.
+        auto* charRenderer = renderer ? renderer->getCharacterRenderer() : nullptr;
+        if (!charRenderer) {
+            pendingCreatureSpawnGuids_.erase(result.guid);  // NOTE(review): retry count is not cleared on this path — confirm intended
+            continue;
+        }
+
+        // Hand the model to the renderer (noted by the author as a main-thread-only GPU upload) and time it.
+        // The pre-decoded BLP map is installed just for loadModel and cleared again on both outcomes.
+        auto uploadStart = std::chrono::steady_clock::now();
+        charRenderer->setPredecodedBLPCache(&result.predecodedTextures);
+        if (!charRenderer->loadModel(*result.model, result.modelId)) {
+            charRenderer->setPredecodedBLPCache(nullptr);
+            nonRenderableCreatureDisplayIds_.insert(result.displayId);
+            creaturePermanentFailureGuids_.insert(result.guid);
+            pendingCreatureSpawnGuids_.erase(result.guid);
+            creatureSpawnRetryCounts_.erase(result.guid);
+            continue;
+        }
+        charRenderer->setPredecodedBLPCache(nullptr);
+        {
+            auto uploadEnd = std::chrono::steady_clock::now();
+            float uploadMs = std::chrono::duration<float, std::milli>(uploadEnd - uploadStart).count();
+            if (uploadMs > 3.0f) {  // only uploads slower than 3 ms are logged
+                LOG_WARNING("charRenderer->loadModel took ", uploadMs, "ms displayId=", result.displayId,
+                            " preDecoded=", result.predecodedTextures.size());
+            }
+        }
+        // Stash whatever is in the pre-decoded map under this displayId (intended for spawnOnlineCreature)
+        if (!result.predecodedTextures.empty()) {
+            displayIdPredecodedTextures_[result.displayId] = std::move(result.predecodedTextures);
+        }
+        displayIdModelCache_[result.displayId] = result.modelId;
+        modelUploads++;  // only successful uploads count against the budget
+
+        pendingCreatureSpawnGuids_.erase(result.guid);
+        creatureSpawnRetryCounts_.erase(result.guid);
+
+        // displayIdModelCache_ now knows this displayId, so push the creature back onto the regular
+        // spawn queue unless it already has an instance or was marked as a permanent failure.
+        if (!creatureInstances_.count(result.guid) &&
+            !creaturePermanentFailureGuids_.count(result.guid)) {
+            PendingCreatureSpawn s{};
+            s.guid = result.guid;
+            s.displayId = result.displayId;
+            s.x = result.x;
+            s.y = result.y;
+            s.z = result.z;
+            s.orientation = result.orientation;
+            pendingCreatureSpawns_.push_back(s);
+            pendingCreatureSpawnGuids_.insert(result.guid);
+        }
+    }
+}
+
+void Application::processAsyncNpcCompositeResults() {  // Applies finished async NPC skin/hair texture results to their models.
+    auto* charRenderer = renderer ? renderer->getCharacterRenderer() : nullptr;
+    if (!charRenderer) return;  // nothing is consumed without a renderer; ready futures stay queued
+
+    for (auto it = asyncNpcCompositeLoads_.begin(); it != asyncNpcCompositeLoads_.end(); ) {
+        if (!it->future.valid() ||
+            it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
+            ++it;
+            continue;
+        }
+        auto result = it->future.get();
+        it = asyncNpcCompositeLoads_.erase(it);
+
+        const auto& info = result.info;
+
+        // Install this result's pre-decoded BLP map for all texture calls below (cleared at the end of the iteration)
+        charRenderer->setPredecodedBLPCache(&result.predecodedTextures);
+
+        // --- Skin texture: resolved below, then applied to info.skinTextureSlots ---
+        rendering::VkTexture* skinTex = nullptr;
+
+        if (info.hasBakedSkin) {
+            // Baked skin is the starting point; a composite or simple skin below replaces it if one loads
+            skinTex = charRenderer->loadTexture(info.bakedSkinPath);
+        }
+
+        if (info.hasComposite) {
+            // Composite: region layers take precedence; otherwise base + overlays are combined in order
+            rendering::VkTexture* compositeTex = nullptr;
+            if (!info.regionLayers.empty()) {
+                compositeTex = charRenderer->compositeWithRegions(info.basePath,
+                    info.overlayPaths, info.regionLayers);
+            } else if (!info.overlayPaths.empty()) {
+                std::vector<std::string> skinLayers;
+                skinLayers.push_back(info.basePath);
+                for (const auto& op : info.overlayPaths) skinLayers.push_back(op);
+                compositeTex = charRenderer->compositeTextures(skinLayers);
+            }
+            if (compositeTex) skinTex = compositeTex;  // a null composite keeps the baked skin, if any
+        } else if (info.hasSimpleSkin) {
+            // Simple skin: the base texture alone, only considered when no composite was requested
+            auto* baseTex = charRenderer->loadTexture(info.basePath);
+            if (baseTex) skinTex = baseTex;
+        }
+
+        if (skinTex) {
+            for (uint32_t slot : info.skinTextureSlots) {
+                charRenderer->setModelTexture(info.modelId, slot, skinTex);
+            }
+        }
+
+        // --- Hair texture: applied to info.hairTextureSlots ---
+        if (!info.hairTexturePath.empty()) {
+            rendering::VkTexture* hairTex = charRenderer->loadTexture(info.hairTexturePath);
+            rendering::VkTexture* whTex = charRenderer->loadTexture("");  // NOTE(review): presumably the renderer's fallback texture for an empty path — confirm
+            if (hairTex && hairTex != whTex) {  // skip when the hair path resolved to the same texture as the empty path
+                for (uint32_t slot : info.hairTextureSlots) {
+                    charRenderer->setModelTexture(info.modelId, slot, hairTex);
+                }
+            }
+        } else if (info.useBakedForHair && skinTex) {
+            // No hair texture path: reuse the resolved skin texture on the hair slots (author's note: bald NPC scalp cap)
+            for (uint32_t slot : info.hairTextureSlots) {
+                charRenderer->setModelTexture(info.modelId, slot, skinTex);
+            }
+        }
+
+        charRenderer->setPredecodedBLPCache(nullptr);  // result (and its map) is destroyed at the end of this iteration
+    }
+}
+
 void Application::processCreatureSpawnQueue() {
+    auto startTime = std::chrono::steady_clock::now();
+    // Budget: max 2ms per frame for creature spawning to prevent stutter.
+    static constexpr float kSpawnBudgetMs = 2.0f;
+
+    // First, finalize any async model loads that completed on background threads.
+    processAsyncCreatureResults();
+    {
+        auto now = std::chrono::steady_clock::now();
+        float asyncMs = std::chrono::duration<float, std::milli>(now - startTime).count();
+        if (asyncMs > 3.0f) {
+            LOG_WARNING("processAsyncCreatureResults took ", asyncMs, "ms");
+        }
+    }
+
     if (pendingCreatureSpawns_.empty()) return;
     if (!creatureLookupsBuilt_) {
         buildCreatureDisplayLookups();
@@ -6700,13 +6989,21 @@ void Application::processCreatureSpawnQueue() {
     }
 
     int processed = 0;
-    int newModelLoads = 0;
+    int asyncLaunched = 0;
     size_t rotationsLeft = pendingCreatureSpawns_.size();
     while (!pendingCreatureSpawns_.empty() &&
            processed < MAX_SPAWNS_PER_FRAME &&
            rotationsLeft > 0) {
+        // Check time budget every iteration (including first — async results may
+        // have already consumed the budget via GPU model uploads).
+        {
+            auto now = std::chrono::steady_clock::now();
+            float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
+            if (elapsedMs >= kSpawnBudgetMs) break;
+        }
+
         PendingCreatureSpawn s = pendingCreatureSpawns_.front();
-        pendingCreatureSpawns_.erase(pendingCreatureSpawns_.begin());
+        pendingCreatureSpawns_.pop_front();
 
         if (nonRenderableCreatureDisplayIds_.count(s.displayId)) {
             pendingCreatureSpawnGuids_.erase(s.guid);
@@ -6717,15 +7014,269 @@ void Application::processCreatureSpawnQueue() {
         }
 
         const bool needsNewModel = (displayIdModelCache_.find(s.displayId) == displayIdModelCache_.end());
-        if (needsNewModel && newModelLoads >= MAX_NEW_CREATURE_MODELS_PER_FRAME) {
-            // Defer additional first-time model/texture loads to later frames so
-            // movement stays responsive in dense areas.
-            pendingCreatureSpawns_.push_back(s);
-            rotationsLeft--;
+
+        // For new models: launch async load on background thread instead of blocking.
+        if (needsNewModel) {
+            if (static_cast<int>(asyncCreatureLoads_.size()) + asyncLaunched >= MAX_ASYNC_CREATURE_LOADS) {
+                // Too many in-flight — defer to next frame
+                pendingCreatureSpawns_.push_back(s);
+                rotationsLeft--;
+                continue;
+            }
+
+            std::string m2Path = getModelPathForDisplayId(s.displayId);
+            if (m2Path.empty()) {
+                nonRenderableCreatureDisplayIds_.insert(s.displayId);
+                creaturePermanentFailureGuids_.insert(s.guid);
+                pendingCreatureSpawnGuids_.erase(s.guid);
+                creatureSpawnRetryCounts_.erase(s.guid);
+                processed++;
+                rotationsLeft = pendingCreatureSpawns_.size();
+                continue;
+            }
+
+            // Check for invisible stalkers
+            {
+                std::string lowerPath = m2Path;
+                std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(),
+                               [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                if (lowerPath.find("invisiblestalker") != std::string::npos ||
+                    lowerPath.find("invisible_stalker") != std::string::npos) {
+                    nonRenderableCreatureDisplayIds_.insert(s.displayId);
+                    creaturePermanentFailureGuids_.insert(s.guid);
+                    pendingCreatureSpawnGuids_.erase(s.guid);
+                    processed++;
+                    rotationsLeft = pendingCreatureSpawns_.size();
+                    continue;
+                }
+            }
+
+            // Launch async M2 load — file I/O and parsing happen off the main thread.
+            uint32_t modelId = nextCreatureModelId_++;
+            auto* am = assetManager.get();
+
+            // Collect display skin texture paths for background pre-decode
+            std::vector<std::string> displaySkinPaths;
+            {
+                auto itDD = displayDataMap_.find(s.displayId);
+                if (itDD != displayDataMap_.end()) {
+                    std::string modelDir;
+                    size_t lastSlash = m2Path.find_last_of("\\/");
+                    if (lastSlash != std::string::npos) modelDir = m2Path.substr(0, lastSlash + 1);
+
+                    auto resolveForAsync = [&](const std::string& skinField) {
+                        if (skinField.empty()) return;
+                        std::string raw = skinField;
+                        std::replace(raw.begin(), raw.end(), '/', '\\');
+                        while (!raw.empty() && std::isspace(static_cast<unsigned char>(raw.front()))) raw.erase(raw.begin());
+                        while (!raw.empty() && std::isspace(static_cast<unsigned char>(raw.back()))) raw.pop_back();
+                        if (raw.empty()) return;
+                        bool hasExt = raw.size() >= 4 && raw.substr(raw.size()-4) == ".blp";
+                        bool hasDir = raw.find('\\') != std::string::npos;
+                        std::vector<std::string> candidates;
+                        if (hasDir) {
+                            candidates.push_back(raw);
+                            if (!hasExt) candidates.push_back(raw + ".blp");
+                        } else {
+                            candidates.push_back(modelDir + raw);
+                            if (!hasExt) candidates.push_back(modelDir + raw + ".blp");
+                            candidates.push_back(raw);
+                            if (!hasExt) candidates.push_back(raw + ".blp");
+                        }
+                        for (const auto& c : candidates) {
+                            if (am->fileExists(c)) { displaySkinPaths.push_back(c); return; }
+                        }
+                    };
+                    resolveForAsync(itDD->second.skin1);
+                    resolveForAsync(itDD->second.skin2);
+                    resolveForAsync(itDD->second.skin3);
+
+                    // Pre-decode humanoid NPC textures (bake, skin, face, underwear, hair, equipment)
+                    if (itDD->second.extraDisplayId != 0) {
+                        auto itHE = humanoidExtraMap_.find(itDD->second.extraDisplayId);
+                        if (itHE != humanoidExtraMap_.end()) {
+                            const auto& he = itHE->second;
+                            // Baked texture
+                            if (!he.bakeName.empty()) {
+                                displaySkinPaths.push_back("Textures\\BakedNpcTextures\\" + he.bakeName);
+                            }
+                            // CharSections: skin, face, underwear
+                            auto csDbc = am->loadDBC("CharSections.dbc");
+                            if (csDbc) {
+                                const auto* csL = pipeline::getActiveDBCLayout()
+                                    ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr;
+                                uint32_t nRace = static_cast<uint32_t>(he.raceId);
+                                uint32_t nSex = static_cast<uint32_t>(he.sexId);
+                                uint32_t nSkin = static_cast<uint32_t>(he.skinId);
+                                uint32_t nFace = static_cast<uint32_t>(he.faceId);
+                                for (uint32_t r = 0; r < csDbc->getRecordCount(); r++) {
+                                    uint32_t rId = csDbc->getUInt32(r, csL ? (*csL)["RaceID"] : 1);
+                                    uint32_t sId = csDbc->getUInt32(r, csL ? (*csL)["SexID"] : 2);
+                                    if (rId != nRace || sId != nSex) continue;
+                                    uint32_t section = csDbc->getUInt32(r, csL ? (*csL)["BaseSection"] : 3);
+                                    uint32_t variation = csDbc->getUInt32(r, csL ? (*csL)["VariationIndex"] : 4);
+                                    uint32_t color = csDbc->getUInt32(r, csL ? (*csL)["ColorIndex"] : 5);
+                                    uint32_t tex1F = csL ? (*csL)["Texture1"] : 6;
+                                    if (section == 0 && color == nSkin) {
+                                        std::string t = csDbc->getString(r, tex1F);
+                                        if (!t.empty()) displaySkinPaths.push_back(t);
+                                    } else if (section == 1 && variation == nFace && color == nSkin) {
+                                        std::string t1 = csDbc->getString(r, tex1F);
+                                        std::string t2 = csDbc->getString(r, tex1F + 1);
+                                        if (!t1.empty()) displaySkinPaths.push_back(t1);
+                                        if (!t2.empty()) displaySkinPaths.push_back(t2);
+                                    } else if (section == 3 && variation == static_cast<uint32_t>(he.hairStyleId)
+                                               && color == static_cast<uint32_t>(he.hairColorId)) {
+                                        std::string t = csDbc->getString(r, tex1F);
+                                        if (!t.empty()) displaySkinPaths.push_back(t);
+                                    } else if (section == 4 && color == nSkin) {
+                                        for (uint32_t f = tex1F; f <= tex1F + 2; f++) {
+                                            std::string t = csDbc->getString(r, f);
+                                            if (!t.empty()) displaySkinPaths.push_back(t);
+                                        }
+                                    }
+                                }
+                            }
+                            // Equipment region textures
+                            auto idiDbc = am->loadDBC("ItemDisplayInfo.dbc");
+                            if (idiDbc) {
+                                static const char* compDirs[] = {
+                                    "ArmUpperTexture", "ArmLowerTexture", "HandTexture",
+                                    "TorsoUpperTexture", "TorsoLowerTexture",
+                                    "LegUpperTexture", "LegLowerTexture", "FootTexture",
+                                };
+                                const auto* idiL = pipeline::getActiveDBCLayout()
+                                    ? pipeline::getActiveDBCLayout()->getLayout("ItemDisplayInfo") : nullptr;
+                                const uint32_t trf[8] = {
+                                    idiL ? (*idiL)["TextureArmUpper"]  : 14u,
+                                    idiL ? (*idiL)["TextureArmLower"]  : 15u,
+                                    idiL ? (*idiL)["TextureHand"]      : 16u,
+                                    idiL ? (*idiL)["TextureTorsoUpper"]: 17u,
+                                    idiL ? (*idiL)["TextureTorsoLower"]: 18u,
+                                    idiL ? (*idiL)["TextureLegUpper"]  : 19u,
+                                    idiL ? (*idiL)["TextureLegLower"]  : 20u,
+                                    idiL ? (*idiL)["TextureFoot"]      : 21u,
+                                };
+                                const bool isFem = (he.sexId == 1);
+                                for (int eq = 0; eq < 11; eq++) {
+                                    uint32_t did = he.equipDisplayId[eq];
+                                    if (did == 0) continue;
+                                    int32_t recIdx = idiDbc->findRecordById(did);
+                                    if (recIdx < 0) continue;
+                                    for (int region = 0; region < 8; region++) {
+                                        std::string texName = idiDbc->getString(static_cast<uint32_t>(recIdx), trf[region]);
+                                        if (texName.empty()) continue;
+                                        std::string base = "Item\\TextureComponents\\" +
+                                            std::string(compDirs[region]) + "\\" + texName;
+                                        std::string gp = base + (isFem ? "_F.blp" : "_M.blp");
+                                        std::string up = base + "_U.blp";
+                                        if (am->fileExists(gp)) displaySkinPaths.push_back(gp);
+                                        else if (am->fileExists(up)) displaySkinPaths.push_back(up);
+                                        else displaySkinPaths.push_back(base + ".blp");
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            AsyncCreatureLoad load;
+            load.future = std::async(std::launch::async,
+                [am, m2Path, modelId, s, skinPaths = std::move(displaySkinPaths)]() -> PreparedCreatureModel {
+                    PreparedCreatureModel result;
+                    result.guid = s.guid;
+                    result.displayId = s.displayId;
+                    result.modelId = modelId;
+                    result.x = s.x;
+                    result.y = s.y;
+                    result.z = s.z;
+                    result.orientation = s.orientation;
+
+                    auto m2Data = am->readFile(m2Path);
+                    if (m2Data.empty()) {
+                        result.permanent_failure = true;
+                        return result;
+                    }
+
+                    auto model = std::make_shared<pipeline::M2Model>(pipeline::M2Loader::load(m2Data));
+                    if (model->vertices.empty()) {
+                        result.permanent_failure = true;
+                        return result;
+                    }
+
+                    // Load skin file
+                    if (model->version >= 264) {
+                        std::string skinPath = m2Path.substr(0, m2Path.size() - 3) + "00.skin";
+                        auto skinData = am->readFile(skinPath);
+                        if (!skinData.empty()) {
+                            pipeline::M2Loader::loadSkin(skinData, *model);
+                        }
+                    }
+
+                    // Load external .anim files
+                    std::string basePath = m2Path.substr(0, m2Path.size() - 3);
+                    for (uint32_t si = 0; si < model->sequences.size(); si++) {
+                        if (!(model->sequences[si].flags & 0x20)) {
+                            char animFileName[256];
+                            snprintf(animFileName, sizeof(animFileName), "%s%04u-%02u.anim",
+                                basePath.c_str(), model->sequences[si].id, model->sequences[si].variationIndex);
+                            auto animData = am->readFileOptional(animFileName);
+                            if (!animData.empty()) {
+                                pipeline::M2Loader::loadAnimFile(m2Data, animData, si, *model);
+                            }
+                        }
+                    }
+
+                    // Pre-decode model textures on background thread
+                    for (const auto& tex : model->textures) {
+                        if (tex.filename.empty()) continue;
+                        std::string texKey = tex.filename;
+                        std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                        std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                        if (result.predecodedTextures.find(texKey) != result.predecodedTextures.end()) continue;
+                        auto blp = am->loadTexture(texKey);
+                        if (blp.isValid()) {
+                            result.predecodedTextures[texKey] = std::move(blp);
+                        }
+                    }
+
+                    // Pre-decode the paths collected before launch (display skins plus humanoid NPC bake/CharSections/equipment textures)
+                    for (const auto& sp : skinPaths) {
+                        std::string key = sp;
+                        std::replace(key.begin(), key.end(), '/', '\\');
+                        std::transform(key.begin(), key.end(), key.begin(),
+                                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                        if (result.predecodedTextures.count(key)) continue;
+                        auto blp = am->loadTexture(key);
+                        if (blp.isValid()) {
+                            result.predecodedTextures[key] = std::move(blp);
+                        }
+                    }
+
+                    result.model = std::move(model);
+                    result.valid = true;
+                    return result;
+                });
+            asyncCreatureLoads_.push_back(std::move(load));
+            asyncLaunched++;
+            // Don't erase from pendingCreatureSpawnGuids_ — the async result handler will do it
+            rotationsLeft = pendingCreatureSpawns_.size();
+            processed++;
             continue;
         }
 
-        spawnOnlineCreature(s.guid, s.displayId, s.x, s.y, s.z, s.orientation);
+        // Cached model — spawn is fast (no file I/O, just instance creation + texture setup)
+        {
+            auto spawnStart = std::chrono::steady_clock::now();
+            spawnOnlineCreature(s.guid, s.displayId, s.x, s.y, s.z, s.orientation);
+            auto spawnEnd = std::chrono::steady_clock::now();
+            float spawnMs = std::chrono::duration<float, std::milli>(spawnEnd - spawnStart).count();
+            if (spawnMs > 3.0f) {
+                LOG_WARNING("spawnOnlineCreature took ", spawnMs, "ms displayId=", s.displayId);
+            }
+        }
         pendingCreatureSpawnGuids_.erase(s.guid);
 
         // If spawn still failed, retry for a limited number of frames.
@@ -6752,9 +7303,6 @@ void Application::processCreatureSpawnQueue() {
         } else {
             creatureSpawnRetryCounts_.erase(s.guid);
         }
-        if (needsNewModel) {
-            newModelLoads++;
-        }
         rotationsLeft = pendingCreatureSpawns_.size();
         processed++;
     }
@@ -6787,23 +7335,327 @@ void Application::processPlayerSpawnQueue() {
     }
 }
 
+std::vector<std::string> Application::resolveEquipmentTexturePaths(uint64_t guid,
+    const std::array<uint32_t, 19>& displayInfoIds,
+    const std::array<uint8_t, 19>& /*inventoryTypes*/) const {  // Lists texture paths to pre-decode for this player's equipment; inventoryTypes is unused.
+    std::vector<std::string> paths;
+
+    auto it = onlinePlayerAppearance_.find(guid);
+    if (it == onlinePlayerAppearance_.end()) return paths;  // unknown guid: empty result
+    const OnlinePlayerAppearanceState& st = it->second;
+
+    // Start with the cached body skin and any non-empty underwear paths for this player
+    if (!st.bodySkinPath.empty()) paths.push_back(st.bodySkinPath);
+    for (const auto& up : st.underwearPaths) {
+        if (!up.empty()) paths.push_back(up);
+    }
+
+    // Then add one path per equipped item and body region (author's note: same logic as setOnlinePlayerEquipment)
+    auto displayInfoDbc = assetManager->loadDBC("ItemDisplayInfo.dbc");
+    if (!displayInfoDbc) return paths;  // DBC unavailable: only the skin/underwear paths are returned
+    const auto* idiL = pipeline::getActiveDBCLayout()
+        ? pipeline::getActiveDBCLayout()->getLayout("ItemDisplayInfo") : nullptr;
+
+    static const char* componentDirs[] = {  // directory per region, index-aligned with texRegionFields
+        "ArmUpperTexture", "ArmLowerTexture", "HandTexture",
+        "TorsoUpperTexture", "TorsoLowerTexture",
+        "LegUpperTexture", "LegLowerTexture", "FootTexture",
+    };
+    const uint32_t texRegionFields[8] = {  // DBC field per region; 14..21 are used when no layout is available
+        idiL ? (*idiL)["TextureArmUpper"]  : 14u,
+        idiL ? (*idiL)["TextureArmLower"]  : 15u,
+        idiL ? (*idiL)["TextureHand"]      : 16u,
+        idiL ? (*idiL)["TextureTorsoUpper"]: 17u,
+        idiL ? (*idiL)["TextureTorsoLower"]: 18u,
+        idiL ? (*idiL)["TextureLegUpper"]  : 19u,
+        idiL ? (*idiL)["TextureLegLower"]  : 20u,
+        idiL ? (*idiL)["TextureFoot"]      : 21u,
+    };
+    const bool isFemale = (st.genderId == 1);  // selects the "_F" vs "_M" suffix below
+
+    for (int s = 0; s < 19; s++) {
+        uint32_t did = displayInfoIds[s];
+        if (did == 0) continue;  // slot has no display id
+        int32_t recIdx = displayInfoDbc->findRecordById(did);
+        if (recIdx < 0) continue;  // display id not present in the DBC
+        for (int region = 0; region < 8; region++) {
+            std::string texName = displayInfoDbc->getString(
+                static_cast<uint32_t>(recIdx), texRegionFields[region]);
+            if (texName.empty()) continue;
+            std::string base = "Item\\TextureComponents\\" +
+                std::string(componentDirs[region]) + "\\" + texName;
+            std::string genderPath = base + (isFemale ? "_F.blp" : "_M.blp");
+            std::string unisexPath = base + "_U.blp";
+            if (assetManager->fileExists(genderPath)) paths.push_back(genderPath);  // gendered variant preferred
+            else if (assetManager->fileExists(unisexPath)) paths.push_back(unisexPath);
+            else paths.push_back(base + ".blp");  // last resort is pushed without an existence check
+        }
+    }
+    return paths;
+}
+
+void Application::processAsyncEquipmentResults() {  // Applies equipment updates whose background texture pre-decode has finished.
+    for (auto it = asyncEquipmentLoads_.begin(); it != asyncEquipmentLoads_.end(); ) {
+        if (!it->future.valid() ||
+            it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
+            ++it;  // invalid or still running: leave it queued
+            continue;
+        }
+        auto result = it->future.get();
+        it = asyncEquipmentLoads_.erase(it);
+
+        auto* charRenderer = renderer ? renderer->getCharacterRenderer() : nullptr;
+        if (!charRenderer) continue;  // NOTE(review): the result is already consumed, so this update is dropped — confirm intended
+
+        // Install the pre-decoded BLP map only around setOnlinePlayerEquipment (author's note: lets compositeWithRegions skip the synchronous decode)
+        charRenderer->setPredecodedBLPCache(&result.predecodedTextures);
+        setOnlinePlayerEquipment(result.guid, result.displayInfoIds, result.inventoryTypes);
+        charRenderer->setPredecodedBLPCache(nullptr);
+    }
+}
+
 void Application::processDeferredEquipmentQueue() {
+    // Apply finished async pre-decodes first; this runs even when the deferred queue is empty
+    processAsyncEquipmentResults();
+
     if (deferredEquipmentQueue_.empty()) return;
-    // Process at most 1 per frame — compositeWithRegions is expensive
+    // At most 2 pre-decodes in flight; at most one queue entry is dequeued per call
+    if (asyncEquipmentLoads_.size() >= 2) return;
+
     auto [guid, equipData] = deferredEquipmentQueue_.front();
     deferredEquipmentQueue_.erase(deferredEquipmentQueue_.begin());
-    setOnlinePlayerEquipment(guid, equipData.first, equipData.second);
+
+    // Collect the texture paths to pre-decode (author's note: the ones compositeWithRegions will need)
+    auto texturePaths = resolveEquipmentTexturePaths(guid, equipData.first, equipData.second);
+
+    if (texturePaths.empty()) {
+        // Nothing to pre-decode: apply synchronously right away and skip the async path
+        setOnlinePlayerEquipment(guid, equipData.first, equipData.second);
+        return;
+    }
+
+    // Launch the BLP pre-decode via std::async; the lambda owns copies of everything except the raw AssetManager pointer
+    auto* am = assetManager.get();
+    auto displayInfoIds = equipData.first;
+    auto inventoryTypes = equipData.second;
+    AsyncEquipmentLoad load;
+    load.future = std::async(std::launch::async,
+        [am, guid, displayInfoIds, inventoryTypes, paths = std::move(texturePaths)]() -> PreparedEquipmentUpdate {
+            PreparedEquipmentUpdate result;
+            result.guid = guid;
+            result.displayInfoIds = displayInfoIds;
+            result.inventoryTypes = inventoryTypes;
+            for (const auto& path : paths) {
+                std::string key = path;  // map key: backslash-separated, lower-cased path
+                std::replace(key.begin(), key.end(), '/', '\\');
+                std::transform(key.begin(), key.end(), key.begin(),
+                               [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                if (result.predecodedTextures.count(key)) continue;  // already decoded under the same key
+                auto blp = am->loadTexture(key);
+                if (blp.isValid()) {  // invalid loads are simply left out of the map
+                    result.predecodedTextures[key] = std::move(blp);
+                }
+            }
+            return result;
+        });
+    asyncEquipmentLoads_.push_back(std::move(load));
+}
+
+void Application::processAsyncGameObjectResults() {  // Consumes finished asyncGameObjectLoads_ futures and turns each result into a spawned game object
+    for (auto it = asyncGameObjectLoads_.begin(); it != asyncGameObjectLoads_.end(); ) {
+        if (!it->future.valid() ||
+            it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {  // zero timeout: poll only, never block
+            ++it;
+            continue;
+        }
+
+        auto result = it->future.get();
+        it = asyncGameObjectLoads_.erase(it);  // entry is removed before handling, so every `continue` below still advances the loop
+
+        if (!result.valid || !result.isWmo || !result.wmoModel) {
+            // Fallback: spawn via sync path (likely an M2 or failed WMO)
+            spawnOnlineGameObject(result.guid, result.entry, result.displayId,
+                                 result.x, result.y, result.z, result.orientation);
+            continue;
+        }
+
+        // WMO parsed on background thread — do GPU upload + instance creation on main thread
+        auto* wmoRenderer = renderer ? renderer->getWMORenderer() : nullptr;
+        if (!wmoRenderer) continue;  // NOTE(review): the result is discarded here and the spawn is not re-queued
+
+        uint32_t modelId = 0;
+        auto itCache = gameObjectDisplayIdWmoCache_.find(result.displayId);
+        if (itCache != gameObjectDisplayIdWmoCache_.end()) {
+            modelId = itCache->second;  // this displayId already has a loaded model; reuse its id
+        } else {
+            modelId = nextGameObjectWmoModelId_++;  // NOTE(review): the id stays consumed even if loadModel() fails below
+            wmoRenderer->setPredecodedBLPCache(&result.predecodedTextures);  // points into `result`; reset to nullptr on both paths below
+            if (!wmoRenderer->loadModel(*result.wmoModel, modelId)) {
+                wmoRenderer->setPredecodedBLPCache(nullptr);
+                LOG_WARNING("Failed to load async gameobject WMO: ", result.modelPath);
+                continue;
+            }
+            wmoRenderer->setPredecodedBLPCache(nullptr);
+            gameObjectDisplayIdWmoCache_[result.displayId] = modelId;  // later spawns with this displayId take the cached branch
+        }
+
+        glm::vec3 renderPos = core::coords::canonicalToRender(
+            glm::vec3(result.x, result.y, result.z));
+        uint32_t instanceId = wmoRenderer->createInstance(
+            modelId, renderPos, glm::vec3(0.0f, 0.0f, result.orientation), 1.0f);
+        if (instanceId == 0) continue;  // 0 is treated as "instance creation failed"
+
+        gameObjectInstances_[result.guid] = {modelId, instanceId, true};  // third field is presumably isWmo — TODO confirm against the struct
+
+        // Queue transport doodad loading if applicable
+        std::string lowerPath = result.modelPath;
+        std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        if (lowerPath.find("transport") != std::string::npos) {  // case-insensitive substring match on the model path
+            const auto* doodadTemplates = wmoRenderer->getDoodadTemplates(modelId);
+            if (doodadTemplates && !doodadTemplates->empty()) {
+                PendingTransportDoodadBatch batch;
+                batch.guid = result.guid;
+                batch.modelId = modelId;
+                batch.instanceId = instanceId;
+                batch.x = result.x;
+                batch.y = result.y;
+                batch.z = result.z;
+                batch.orientation = result.orientation;
+                batch.doodadBudget = doodadTemplates->size();  // budget covers every template of this model
+                pendingTransportDoodadBatches_.push_back(batch);
+            }
+        }
+    }
 }
 
 void Application::processGameObjectSpawnQueue() {
+    // Finalize any completed async WMO loads first
+    processAsyncGameObjectResults();
+
     if (pendingGameObjectSpawns_.empty()) return;
 
-    int spawned = 0;
-    while (!pendingGameObjectSpawns_.empty() && spawned < MAX_SPAWNS_PER_FRAME) {
+    // Drain the queue within a time budget: cached WMOs and M2s spawn inline (cheap), uncached WMOs are parsed on a background task
+    auto startTime = std::chrono::steady_clock::now();
+    static constexpr float kBudgetMs = 2.0f;  // wall-clock budget per call; only checked between spawns
+    static constexpr int kMaxAsyncLoads = 2;  // cap on background WMO loads in flight at once
+
+    while (!pendingGameObjectSpawns_.empty()) {
+        float elapsedMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - startTime).count();
+        if (elapsedMs >= kBudgetMs) break;
+
         auto& s = pendingGameObjectSpawns_.front();
+
+        // Check if this is an uncached WMO that needs async loading
+        std::string modelPath;
+        if (gameObjectLookupsBuilt_) {  // otherwise modelPath stays empty and the spawn takes the synchronous path
+            // Check transport overrides first
+            bool isTransport = gameHandler && gameHandler->isTransportGuid(s.guid);
+            if (isTransport) {
+                if (s.entry == 20808 || s.entry == 176231 || s.entry == 176310)
+                    modelPath = "World\\wmo\\transports\\transport_ship\\transportship.wmo";
+                else if (s.displayId == 807 || s.displayId == 808 || s.displayId == 175080 || s.displayId == 176495 || s.displayId == 164871)
+                    modelPath = "World\\wmo\\transports\\transport_zeppelin\\transport_zeppelin.wmo";
+                else if (s.displayId == 1587)
+                    modelPath = "World\\wmo\\transports\\transport_horde_zeppelin\\Transport_Horde_Zeppelin.wmo";
+                else if (s.displayId == 2454 || s.displayId == 181688 || s.displayId == 190536)
+                    modelPath = "World\\wmo\\transports\\icebreaker\\Transport_Icebreaker_ship.wmo";
+            }
+            if (modelPath.empty())
+                modelPath = getGameObjectModelPathForDisplayId(s.displayId);
+        }
+
+        std::string lowerPath = modelPath;
+        std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        bool isWmo = lowerPath.size() >= 4 && lowerPath.substr(lowerPath.size() - 4) == ".wmo";
+        bool isCached = isWmo && gameObjectDisplayIdWmoCache_.count(s.displayId);
+
+        if (isWmo && !isCached) {  // isWmo already implies a non-empty modelPath
+            if (static_cast<int>(asyncGameObjectLoads_.size()) >= kMaxAsyncLoads) break;  // slots busy: keep it queued for a later call instead of parsing an uncached WMO inline
+            // Launch async WMO load — file I/O + parse on background thread
+            auto* am = assetManager.get();
+            PendingGameObjectSpawn capture = s;  // copied: `s` refers to the queue front, which is erased below
+            std::string capturePath = modelPath;
+            AsyncGameObjectLoad load;
+            load.future = std::async(std::launch::async,
+                [am, capture, capturePath]() -> PreparedGameObjectWMO {
+                    PreparedGameObjectWMO result;
+                    result.guid = capture.guid;
+                    result.entry = capture.entry;
+                    result.displayId = capture.displayId;
+                    result.x = capture.x;
+                    result.y = capture.y;
+                    result.z = capture.z;
+                    result.orientation = capture.orientation;
+                    result.modelPath = capturePath;
+                    result.isWmo = true;
+
+                    auto wmoData = am->readFile(capturePath);
+                    if (wmoData.empty()) return result;  // result.valid is not set here (assumed to default to false), so the finalizer falls back to the sync spawn
+
+                    auto wmo = std::make_shared<pipeline::WMOModel>(
+                        pipeline::WMOLoader::load(wmoData));
+
+                    // Load groups
+                    if (wmo->nGroups > 0) {
+                        std::string basePath = capturePath;
+                        std::string ext;
+                        if (basePath.size() > 4) {
+                            ext = basePath.substr(basePath.size() - 4);
+                            basePath = basePath.substr(0, basePath.size() - 4);
+                        }
+                        for (uint32_t gi = 0; gi < wmo->nGroups; gi++) {
+                            char suffix[16];  // "_" + up to 10 digits + 4-char extension + NUL
+                            snprintf(suffix, sizeof(suffix), "_%03u%s", gi, ext.c_str());
+                            auto groupData = am->readFile(basePath + suffix);
+                            if (groupData.empty()) {
+                                snprintf(suffix, sizeof(suffix), "_%03u.wmo", gi);  // retry with a literal lowercase extension
+                                groupData = am->readFile(basePath + suffix);
+                            }
+                            if (!groupData.empty()) {
+                                pipeline::WMOLoader::loadGroup(groupData, *wmo, gi);
+                            }
+                        }
+                    }
+
+                    // Pre-decode WMO textures on background thread
+                    for (const auto& texPath : wmo->textures) {
+                        if (texPath.empty()) continue;
+                        std::string texKey = texPath;
+                        size_t nul = texKey.find('\0');
+                        if (nul != std::string::npos) texKey.resize(nul);  // cut at the first embedded NUL
+                        std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                        std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                        if (texKey.empty()) continue;
+                        // Convert to .blp extension
+                        if (texKey.size() >= 4) {
+                            std::string ext = texKey.substr(texKey.size() - 4);
+                            if (ext == ".tga" || ext == ".dds") {
+                                texKey = texKey.substr(0, texKey.size() - 4) + ".blp";
+                            }
+                        }
+                        if (result.predecodedTextures.find(texKey) != result.predecodedTextures.end()) continue;
+                        auto blp = am->loadTexture(texKey);
+                        if (blp.isValid()) {
+                            result.predecodedTextures[texKey] = std::move(blp);
+                        }
+                    }
+
+                    result.wmoModel = wmo;
+                    result.valid = true;
+                    return result;
+                });
+            asyncGameObjectLoads_.push_back(std::move(load));
+            pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
+            continue;
+        }
+
+        // Cached WMO, M2, or unresolved model path — spawn synchronously (cheap)
         spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
         pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
-        spawned++;
     }
 }
 
@@ -6815,9 +7667,16 @@ void Application::processPendingTransportDoodads() {
     auto* m2Renderer = renderer->getM2Renderer();
     if (!wmoRenderer || !m2Renderer) return;
 
+    auto startTime = std::chrono::steady_clock::now();
+    static constexpr float kDoodadBudgetMs = 4.0f;
+
     size_t budgetLeft = MAX_TRANSPORT_DOODADS_PER_FRAME;
     for (auto it = pendingTransportDoodadBatches_.begin();
          it != pendingTransportDoodadBatches_.end() && budgetLeft > 0;) {
+        // Time budget check
+        float elapsedMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - startTime).count();
+        if (elapsedMs >= kDoodadBudgetMs) break;
         auto goIt = gameObjectInstances_.find(it->guid);
         if (goIt == gameObjectInstances_.end() || !goIt->second.isWmo ||
             goIt->second.instanceId != it->instanceId || goIt->second.modelId != it->modelId) {
@@ -6833,6 +7692,11 @@ void Application::processPendingTransportDoodads() {
 
         const size_t maxIndex = std::min(it->doodadBudget, doodadTemplates->size());
         while (it->nextIndex < maxIndex && budgetLeft > 0) {
+            // Per-doodad time budget (each does synchronous file I/O + parse + GPU upload)
+            float innerMs = std::chrono::duration<float, std::milli>(
+                std::chrono::steady_clock::now() - startTime).count();
+            if (innerMs >= kDoodadBudgetMs) { budgetLeft = 0; break; }
+
             const auto& doodadTemplate = (*doodadTemplates)[it->nextIndex];
             it->nextIndex++;
             budgetLeft--;
@@ -7492,5 +8356,121 @@ void Application::setupTestTransport() {
     LOG_INFO("========================================");
 }
 
+// ─── World Preloader ─────────────────────────────────────────────────────────
+// Pre-warms AssetManager file cache with ADT files (and their _obj0 variants)
+// for tiles around the expected spawn position.  Runs in background so that
+// when loadOnlineWorldTerrain eventually asks TerrainManager workers to parse
+// the same files, every readFile() is an instant cache hit instead of disk I/O.
+
+void Application::startWorldPreload(uint32_t mapId, const std::string& mapName,
+                                     float serverX, float serverY) {
+    cancelWorldPreload();  // stop and join the workers of any previous preload before worldPreload_ is replaced
+    if (!assetManager || !assetManager->isInitialized() || mapName.empty()) return;
+
+    glm::vec3 canonical = core::coords::serverToCanonical(glm::vec3(serverX, serverY, 0.0f));  // z is passed as 0; only x/y feed the tile lookup
+    auto [tileX, tileY] = core::coords::canonicalToTile(canonical.x, canonical.y);
+
+    worldPreload_ = std::make_unique<WorldPreload>();
+    worldPreload_->mapId = mapId;
+    worldPreload_->mapName = mapName;
+    worldPreload_->centerTileX = tileX;
+    worldPreload_->centerTileY = tileY;
+
+    LOG_INFO("World preload: starting for map '", mapName, "' tile [", tileX, ",", tileY, "]");
+
+    // Build list of tiles to preload (radius 1 = 3x3 = 9 tiles, matching load screen)
+    struct TileJob { int x, y; };
+    auto jobs = std::make_shared<std::vector<TileJob>>();  // shared_ptr: the list is shared by all workers and outlives this call
+    // Center tile first (most important); NOTE(review): unlike its neighbours it is not range-checked
+    jobs->push_back({tileX, tileY});
+    for (int dx = -1; dx <= 1; dx++) {
+        for (int dy = -1; dy <= 1; dy++) {
+            if (dx == 0 && dy == 0) continue;
+            int tx = tileX + dx, ty = tileY + dy;
+            if (tx < 0 || tx > 63 || ty < 0 || ty > 63) continue;  // skip neighbours outside the 0..63 tile range
+            jobs->push_back({tx, ty});
+        }
+    }
+
+    // Spawn up to 4 worker threads; each claims tile jobs from the shared atomic counter
+    auto cancelFlag = &worldPreload_->cancel;  // raw pointer into *worldPreload_; cancelWorldPreload() joins the workers before resetting it
+    auto* am = assetManager.get();
+    std::string mn = mapName;  // local copy so the workers capture the name by value
+
+    int numWorkers = std::min(static_cast<int>(jobs->size()), 4);
+    auto nextJob = std::make_shared<std::atomic<int>>(0);
+
+    for (int w = 0; w < numWorkers; w++) {
+        worldPreload_->workers.emplace_back([am, mn, jobs, nextJob, cancelFlag]() {
+            while (!cancelFlag->load(std::memory_order_relaxed)) {
+                int idx = nextJob->fetch_add(1, std::memory_order_relaxed);  // claim the next unprocessed tile
+                if (idx >= static_cast<int>(jobs->size())) break;
+
+                int tx = (*jobs)[idx].x;
+                int ty = (*jobs)[idx].y;
+
+                // Read ADT file (warms file cache)
+                std::string adtPath = "World\\Maps\\" + mn + "\\" + mn + "_" +
+                                      std::to_string(tx) + "_" + std::to_string(ty) + ".adt";
+                am->readFile(adtPath);  // return value unused
+                if (cancelFlag->load(std::memory_order_relaxed)) break;  // re-check between the two reads so cancellation is noticed sooner
+
+                // Read obj0 variant
+                std::string objPath = "World\\Maps\\" + mn + "\\" + mn + "_" +
+                                      std::to_string(tx) + "_" + std::to_string(ty) + "_obj0.adt";
+                am->readFile(objPath);  // return value unused
+            }
+            LOG_DEBUG("World preload worker finished");
+        });
+    }
+}
+
+// Asks the preload workers to stop, waits for them to exit, then drops the preload state; no-op when none exists.
+void Application::cancelWorldPreload() {
+    if (!worldPreload_) return;
+    auto& preload = *worldPreload_;
+    preload.cancel.store(true, std::memory_order_relaxed);
+    for (auto& worker : preload.workers) { if (worker.joinable()) worker.join(); }
+    LOG_INFO("World preload: cancelled (map=", preload.mapName,
+             " tile=[", preload.centerTileX, ",", preload.centerTileY, "])");
+    worldPreload_.reset();
+}
+
+// Best-effort: writes mapId, mapName, serverX, serverY (one per line) to last_world.cfg in the per-user config dir.
+void Application::saveLastWorldInfo(uint32_t mapId, const std::string& mapName,
+                                     float serverX, float serverY) {
+#ifdef _WIN32
+    const char* base = std::getenv("APPDATA");
+    std::string dir = base ? std::string(base) + "\\wowee" : ".";
+#else
+    const char* home = std::getenv("HOME");
+    std::string dir = home ? std::string(home) + "/.wowee" : ".";
+#endif
+    std::error_code ec;  // non-throwing overload: a directory error must not escape as filesystem_error
+    std::filesystem::create_directories(dir, ec);
+    std::ofstream f(dir + "/last_world.cfg");  // if the directory is unusable the open fails and nothing is written
+    if (f) f << mapId << "\n" << mapName << "\n" << serverX << "\n" << serverY << "\n";
+}
+
+Application::LastWorldInfo Application::loadLastWorldInfo() const {  // reads back the file written by saveLastWorldInfo()
+#ifdef _WIN32
+    const char* base = std::getenv("APPDATA");
+    std::string dir = base ? std::string(base) + "\\wowee" : ".";
+#else
+    const char* home = std::getenv("HOME");
+    std::string dir = home ? std::string(home) + "/.wowee" : ".";
+#endif
+    LastWorldInfo info;
+    std::ifstream f(dir + "/last_world.cfg");  // if the file cannot be opened every getline below fails and info keeps its defaults
+    std::string line;
+    try {  // std::stoul/std::stof throw on non-numeric or out-of-range text; a damaged file must yield "no info", not an exception
+        if (std::getline(f, line)) info.mapId = static_cast<uint32_t>(std::stoul(line));
+        if (std::getline(f, info.mapName) && std::getline(f, line)) info.x = std::stof(line);  // line 2 = map name, line 3 = x
+        if (std::getline(f, line)) info.y = std::stof(line);
+    } catch (const std::exception&) { return LastWorldInfo{}; }
+    info.valid = !info.mapName.empty();  // usable only when a map name was actually read
+    return info;
+}
+
 } // namespace core
 } // namespace wowee
diff --git a/src/game/game_handler.cpp b/src/game/game_handler.cpp
index e80e727f..9a7aed97 100644
--- a/src/game/game_handler.cpp
+++ b/src/game/game_handler.cpp
@@ -541,7 +541,13 @@ void GameHandler::update(float deltaTime) {
 
     // Update socket (processes incoming data and triggers callbacks)
     if (socket) {
+        auto socketStart = std::chrono::steady_clock::now();
         socket->update();
+        float socketMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - socketStart).count();
+        if (socketMs > 3.0f) {
+            LOG_WARNING("SLOW socket->update: ", socketMs, "ms");
+        }
     }
 
     // Detect server-side disconnect (socket closed during update)
diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp
index 2126e5e5..baaaf3e6 100644
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@@ -197,6 +197,29 @@ bool CharacterRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFram
         vkCreateDescriptorPool(device, &ci, nullptr, &boneDescPool_);
     }
 
+    // --- Material UBO ring buffers (one per frame slot) ---
+    {
+        VkPhysicalDeviceProperties props;
+        vkGetPhysicalDeviceProperties(ctx->getPhysicalDevice(), &props);
+        materialUboAlignment_ = static_cast<uint32_t>(props.limits.minUniformBufferOffsetAlignment);
+        if (materialUboAlignment_ < 1) materialUboAlignment_ = 1;
+        // Round up UBO size to alignment
+        uint32_t alignedUboSize = (sizeof(CharMaterialUBO) + materialUboAlignment_ - 1) & ~(materialUboAlignment_ - 1);
+        uint32_t ringSize = alignedUboSize * MATERIAL_RING_CAPACITY;
+        for (int i = 0; i < 2; i++) {
+            VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
+            bci.size = ringSize;
+            bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+            VmaAllocationCreateInfo aci{};
+            aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+            aci.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
+            VmaAllocationInfo allocInfo{};
+            vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
+                            &materialRingBuffer_[i], &materialRingAlloc_[i], &allocInfo);
+            materialRingMapped_[i] = allocInfo.pMappedData;
+        }
+    }
+
     // --- Pipeline layout ---
     // set 0 = perFrame, set 1 = material, set 2 = bones
     // Push constant: mat4 model = 64 bytes
@@ -352,14 +375,15 @@ void CharacterRenderer::shutdown() {
 
     if (pipelineLayout_) { vkDestroyPipelineLayout(device, pipelineLayout_, nullptr); pipelineLayout_ = VK_NULL_HANDLE; }
 
-    // Release any deferred transient material UBOs.
+    // Destroy material ring buffers
     for (int i = 0; i < 2; i++) {
-        for (const auto& b : transientMaterialUbos_[i]) {
-            if (b.first) {
-                vmaDestroyBuffer(alloc, b.first, b.second);
-            }
+        if (materialRingBuffer_[i]) {
+            vmaDestroyBuffer(alloc, materialRingBuffer_[i], materialRingAlloc_[i]);
+            materialRingBuffer_[i] = VK_NULL_HANDLE;
+            materialRingAlloc_[i] = VK_NULL_HANDLE;
+            materialRingMapped_[i] = nullptr;
         }
-        transientMaterialUbos_[i].clear();
+        materialRingOffset_[i] = 0;
     }
 
     // Destroy descriptor pools and layouts
@@ -391,7 +415,6 @@ void CharacterRenderer::clear() {
 
     vkDeviceWaitIdle(vkCtx_->getDevice());
     VkDevice device = vkCtx_->getDevice();
-    VmaAllocator alloc = vkCtx_->getAllocator();
 
     // Destroy GPU resources for all models
     for (auto& pair : models) {
@@ -441,14 +464,9 @@ void CharacterRenderer::clear() {
     models.clear();
     instances.clear();
 
-    // Release deferred transient material UBOs
+    // Reset material ring buffer offsets (buffers persist, just reset write position)
     for (int i = 0; i < 2; i++) {
-        for (const auto& b : transientMaterialUbos_[i]) {
-            if (b.first) {
-                vmaDestroyBuffer(alloc, b.first, b.second);
-            }
-        }
-        transientMaterialUbos_[i].clear();
+        materialRingOffset_[i] = 0;
     }
 
     // Reset descriptor pools (don't destroy — reuse for new allocations)
@@ -607,7 +625,18 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
         return whiteTexture_.get();
     }
 
-    auto blpImage = assetManager->loadTexture(key);
+    // Check pre-decoded BLP cache first (populated by background threads)
+    pipeline::BLPImage blpImage;
+    if (predecodedBLPCache_) {
+        auto pit = predecodedBLPCache_->find(key);
+        if (pit != predecodedBLPCache_->end()) {
+            blpImage = std::move(pit->second);
+            predecodedBLPCache_->erase(pit);
+        }
+    }
+    if (!blpImage.isValid()) {
+        blpImage = assetManager->loadTexture(key);
+    }
     if (!blpImage.isValid()) {
         // Return white fallback but don't cache the failure — allow retry
         // on next character load in case the asset becomes available.
@@ -658,13 +687,16 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
     e.hasAlpha = hasAlpha;
     e.colorKeyBlack = colorKeyBlackHint;
 
-    // Generate normal/height map from diffuse texture
-    float nhVariance = 0.0f;
-    auto nhMap = generateNormalHeightMap(blpImage.data.data(), blpImage.width, blpImage.height, nhVariance);
-    if (nhMap) {
-        e.heightMapVariance = nhVariance;
-        e.approxBytes += approxTextureBytesWithMips(blpImage.width, blpImage.height);
-        e.normalHeightMap = std::move(nhMap);
+    // Defer normal/height map generation to avoid stalling loadModel.
+    // Normal maps are generated in processPendingNormalMaps() at a per-frame budget.
+    if (blpImage.width >= 32 && blpImage.height >= 32) {
+        PendingNormalMap pending;
+        pending.cacheKey = key;
+        pending.pixels.assign(blpImage.data.begin(), blpImage.data.end());
+        pending.width = blpImage.width;
+        pending.height = blpImage.height;
+        pendingNormalMaps_.push_back(std::move(pending));
+        e.normalMapPending = true;
     }
 
     textureCacheBytes_ += e.approxBytes;
@@ -676,6 +708,34 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
     return texPtr;
 }
 
+// Builds at most `budget` queued normal/height maps per call and attaches each one to its texture-cache entry.
+void CharacterRenderer::processPendingNormalMaps(int budget) {
+    if (pendingNormalMaps_.empty() || !vkCtx_) return;
+
+    for (int done = 0; done < budget && !pendingNormalMaps_.empty(); ) {
+        auto job = std::move(pendingNormalMaps_.front());
+        pendingNormalMaps_.pop_front();
+
+        const auto entry = textureCache.find(job.cacheKey);
+        if (entry == textureCache.end()) continue;  // texture was evicted; skipped jobs do not count against the budget
+
+        float variance = 0.0f;
+        vkCtx_->beginUploadBatch();
+        auto normalHeight = generateNormalHeightMap(job.pixels.data(),
+            job.width, job.height, variance);
+        vkCtx_->endUploadBatch();
+
+        if (normalHeight) {
+            entry->second.heightMapVariance = variance;
+            entry->second.approxBytes += approxTextureBytesWithMips(job.width, job.height);
+            textureCacheBytes_ += approxTextureBytesWithMips(job.width, job.height);
+            entry->second.normalHeightMap = std::move(normalHeight);
+        }
+        entry->second.normalMapPending = false;  // cleared whether or not generation produced a map
+        ++done;
+    }
+}
+
 // Alpha-blend overlay onto composite at (dstX, dstY)
 static void blitOverlay(std::vector<uint8_t>& composite, int compW, int compH,
                          const pipeline::BLPImage& overlay, int dstX, int dstY) {
@@ -807,7 +867,19 @@ VkTexture* CharacterRenderer::compositeTextures(const std::vector<std::string>&
     }
 
     // Load base layer
-    auto base = assetManager->loadTexture(layerPaths[0]);
+    pipeline::BLPImage base;
+    if (predecodedBLPCache_) {
+        std::string key = layerPaths[0];
+        std::replace(key.begin(), key.end(), '/', '\\');
+        std::transform(key.begin(), key.end(), key.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        auto pit = predecodedBLPCache_->find(key);
+        if (pit != predecodedBLPCache_->end()) {
+            base = std::move(pit->second);
+            predecodedBLPCache_->erase(pit);
+        }
+    }
+    if (!base.isValid()) base = assetManager->loadTexture(layerPaths[0]);
     if (!base.isValid()) {
         core::Logger::getInstance().warning("Composite: failed to load base layer: ", layerPaths[0]);
         return whiteTexture_.get();
@@ -848,7 +920,19 @@ VkTexture* CharacterRenderer::compositeTextures(const std::vector<std::string>&
     for (size_t layer = 1; layer < layerPaths.size(); layer++) {
         if (layerPaths[layer].empty()) continue;
 
-        auto overlay = assetManager->loadTexture(layerPaths[layer]);
+        pipeline::BLPImage overlay;
+        if (predecodedBLPCache_) {
+            std::string key = layerPaths[layer];
+            std::replace(key.begin(), key.end(), '/', '\\');
+            std::transform(key.begin(), key.end(), key.begin(),
+                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+            auto pit = predecodedBLPCache_->find(key);
+            if (pit != predecodedBLPCache_->end()) {
+                overlay = std::move(pit->second);
+                predecodedBLPCache_->erase(pit);
+            }
+        }
+        if (!overlay.isValid()) overlay = assetManager->loadTexture(layerPaths[layer]);
         if (!overlay.isValid()) {
             core::Logger::getInstance().warning("Composite: FAILED to load overlay: ", layerPaths[layer]);
             continue;
@@ -1025,7 +1109,19 @@ VkTexture* CharacterRenderer::compositeWithRegions(const std::string& basePath,
         return whiteTexture_.get();
     }
 
-    auto base = assetManager->loadTexture(basePath);
+    pipeline::BLPImage base;
+    if (predecodedBLPCache_) {
+        std::string key = basePath;
+        std::replace(key.begin(), key.end(), '/', '\\');
+        std::transform(key.begin(), key.end(), key.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        auto pit = predecodedBLPCache_->find(key);
+        if (pit != predecodedBLPCache_->end()) {
+            base = std::move(pit->second);
+            predecodedBLPCache_->erase(pit);
+        }
+    }
+    if (!base.isValid()) base = assetManager->loadTexture(basePath);
     if (!base.isValid()) {
         return whiteTexture_.get();
     }
@@ -1064,7 +1160,19 @@ VkTexture* CharacterRenderer::compositeWithRegions(const std::string& basePath,
     bool upscaled = (base.width == 256 && base.height == 256 && width == 512);
     for (const auto& ul : baseLayers) {
         if (ul.empty()) continue;
-        auto overlay = assetManager->loadTexture(ul);
+        pipeline::BLPImage overlay;
+        if (predecodedBLPCache_) {
+            std::string key = ul;
+            std::replace(key.begin(), key.end(), '/', '\\');
+            std::transform(key.begin(), key.end(), key.begin(),
+                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+            auto pit = predecodedBLPCache_->find(key);
+            if (pit != predecodedBLPCache_->end()) {
+                overlay = std::move(pit->second);
+                predecodedBLPCache_->erase(pit);
+            }
+        }
+        if (!overlay.isValid()) overlay = assetManager->loadTexture(ul);
         if (!overlay.isValid()) continue;
 
         if (overlay.width == width && overlay.height == height) {
@@ -1142,7 +1250,19 @@ VkTexture* CharacterRenderer::compositeWithRegions(const std::string& basePath,
         int regionIdx = rl.first;
         if (regionIdx < 0 || regionIdx >= 8) continue;
 
-        auto overlay = assetManager->loadTexture(rl.second);
+        pipeline::BLPImage overlay;
+        if (predecodedBLPCache_) {
+            std::string key = rl.second;
+            std::replace(key.begin(), key.end(), '/', '\\');
+            std::transform(key.begin(), key.end(), key.begin(),
+                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+            auto pit = predecodedBLPCache_->find(key);
+            if (pit != predecodedBLPCache_->end()) {
+                overlay = std::move(pit->second);
+                predecodedBLPCache_->erase(pit);
+            }
+        }
+        if (!overlay.isValid()) overlay = assetManager->loadTexture(rl.second);
         if (!overlay.isValid()) {
             core::Logger::getInstance().warning("compositeWithRegions: failed to load ", rl.second);
             continue;
@@ -1247,6 +1367,10 @@ bool CharacterRenderer::loadModel(const pipeline::M2Model& model, uint32_t id) {
     M2ModelGPU gpuModel;
     gpuModel.data = model;
 
+    // Batch all GPU uploads (VB, IB, textures) into a single command buffer
+    // submission with one fence wait, instead of one fence wait per upload.
+    vkCtx_->beginUploadBatch();
+
     // Setup GPU buffers
     setupModelBuffers(gpuModel);
 
@@ -1259,6 +1383,8 @@ bool CharacterRenderer::loadModel(const pipeline::M2Model& model, uint32_t id) {
         gpuModel.textureIds.push_back(texPtr);
     }
 
+    vkCtx_->endUploadBatch();
+
     models[id] = std::move(gpuModel);
 
     core::Logger::getInstance().debug("Loaded M2 model ", id, " (", model.vertices.size(),
@@ -1388,8 +1514,9 @@ uint32_t CharacterRenderer::createInstance(uint32_t modelId, const glm::vec3& po
     instance.scale = scale;
 
     // Initialize bone matrices to identity
-    auto& model = models[modelId].data;
-    instance.boneMatrices.resize(std::max(static_cast<size_t>(1), model.bones.size()), glm::mat4(1.0f));
+    auto& gpuRef = models[modelId];
+    instance.boneMatrices.resize(std::max(static_cast<size_t>(1), gpuRef.data.bones.size()), glm::mat4(1.0f));
+    instance.cachedModel = &gpuRef;
 
     uint32_t id = instance.id;
     instances[id] = std::move(instance);
@@ -1448,8 +1575,14 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
     const float animUpdateRadius = static_cast<float>(envSizeOrDefault("WOWEE_CHAR_ANIM_RADIUS", 120));
     const float animUpdateRadiusSq = animUpdateRadius * animUpdateRadius;
 
-    // Update fade-in opacity
-    for (auto& [id, inst] : instances) {
+    // Single pass: fade-in, movement, and animation bone collection
+    std::vector<std::reference_wrapper<CharacterInstance>> toUpdate;
+    toUpdate.reserve(instances.size());
+
+    for (auto& pair : instances) {
+        auto& inst = pair.second;
+
+        // Update fade-in opacity
         if (inst.fadeInDuration > 0.0f && inst.opacity < 1.0f) {
             inst.fadeInTime += deltaTime;
             inst.opacity = std::min(1.0f, inst.fadeInTime / inst.fadeInDuration);
@@ -1457,10 +1590,8 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
                 inst.fadeInDuration = 0.0f;
             }
         }
-    }
 
-    // Interpolate creature movement
-    for (auto& [id, inst] : instances) {
+        // Interpolate creature movement
         if (inst.isMoving) {
             inst.moveElapsed += deltaTime;
             float t = inst.moveElapsed / inst.moveDuration;
@@ -1469,36 +1600,26 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
                 inst.isMoving = false;
                 // Return to idle when movement completes
                 if (inst.currentAnimationId == 4 || inst.currentAnimationId == 5) {
-                    playAnimation(id, 0, true);
+                    playAnimation(pair.first, 0, true);
                 }
             } else {
                 inst.position = glm::mix(inst.moveStart, inst.moveEnd, t);
             }
         }
-    }
 
-    // Only update animations for nearby characters (performance optimization)
-    // Collect instances that need bone recomputation, with distance-based throttling
-    std::vector<std::reference_wrapper<CharacterInstance>> toUpdate;
-    toUpdate.reserve(instances.size());
-
-    for (auto& pair : instances) {
-        auto& inst = pair.second;
-
-        // Skip weapon instances — their transforms are set by parent bones
+        // Skip weapon instances for animation — their transforms are set by parent bones
         if (inst.hasOverrideModelMatrix) continue;
 
         float distSq = glm::distance2(inst.position, cameraPos);
         if (distSq >= animUpdateRadiusSq) continue;
 
         // Always advance animation time (cheap)
-        auto modelIt = models.find(inst.modelId);
-        if (modelIt != models.end() && !modelIt->second.data.sequences.empty()) {
+        if (inst.cachedModel && !inst.cachedModel->data.sequences.empty()) {
             if (inst.currentSequenceIndex < 0) {
                 inst.currentSequenceIndex = 0;
-                inst.currentAnimationId = modelIt->second.data.sequences[0].id;
+                inst.currentAnimationId = inst.cachedModel->data.sequences[0].id;
             }
-            const auto& seq = modelIt->second.data.sequences[inst.currentSequenceIndex];
+            const auto& seq = inst.cachedModel->data.sequences[inst.currentSequenceIndex];
             inst.animationTime += deltaTime * 1000.0f;
             if (seq.duration > 0 && inst.animationTime >= static_cast<float>(seq.duration)) {
                 if (inst.animationLoop) {
@@ -1509,10 +1630,11 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
             }
         }
 
-        // Distance-tiered bone throttling: near=every frame, mid=every 3rd, far=every 6th
+        // Distance-tiered bone throttling: every frame within 10 units, then every 2nd (>10), 4th (>20), 8th (>40)
         uint32_t boneInterval = 1;
-        if (distSq > 60.0f * 60.0f) boneInterval = 6;
-        else if (distSq > 30.0f * 30.0f) boneInterval = 3;
+        if (distSq > 40.0f * 40.0f) boneInterval = 8;
+        else if (distSq > 20.0f * 20.0f) boneInterval = 4;
+        else if (distSq > 10.0f * 10.0f) boneInterval = 2;
 
         inst.boneUpdateCounter++;
         bool needsBones = (inst.boneUpdateCounter >= boneInterval) || inst.boneMatrices.empty();
@@ -1527,7 +1649,7 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
     // Thread bone matrix computation in chunks
     if (updatedCount >= 8 && numAnimThreads_ > 1) {
         static const size_t minAnimWorkPerThread = std::max<size_t>(
-            16, envSizeOrDefault("WOWEE_CHAR_ANIM_WORK_PER_THREAD", 64));
+            8, envSizeOrDefault("WOWEE_CHAR_ANIM_WORK_PER_THREAD", 16));
         const size_t maxUsefulThreads = std::max<size_t>(
             1, (updatedCount + minAnimWorkPerThread - 1) / minAnimWorkPerThread);
         const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), maxUsefulThreads);
@@ -1596,11 +1718,8 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
 }
 
 void CharacterRenderer::updateAnimation(CharacterInstance& instance, float deltaTime) {
-    auto modelIt = models.find(instance.modelId);
-    if (modelIt == models.end()) {
-        return;
-    }
-    const auto& model = modelIt->second.data;
+    if (!instance.cachedModel) return;
+    const auto& model = instance.cachedModel->data;
 
     if (model.sequences.empty()) {
         return;
@@ -1713,7 +1832,8 @@ glm::quat CharacterRenderer::interpolateQuat(const pipeline::M2AnimationTrack& t
 // --- Bone transform calculation ---
 
 void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
-    auto& model = models[instance.modelId].data;
+    if (!instance.cachedModel) return;
+    auto& model = instance.cachedModel->data;
 
     if (model.bones.empty()) {
         return;
@@ -1722,8 +1842,6 @@ void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
     size_t numBones = model.bones.size();
     instance.boneMatrices.resize(numBones);
 
-    static bool dumpedOnce = false;
-
     for (size_t i = 0; i < numBones; i++) {
         const auto& bone = model.bones[i];
 
@@ -1731,19 +1849,6 @@ void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
         // At rest this is identity, so no separate bind pose is needed
         glm::mat4 localTransform = getBoneTransform(bone, instance.animationTime, instance.currentSequenceIndex);
 
-        // Debug: dump first frame bone data
-        if (!dumpedOnce && i < 5) {
-            glm::vec3 t = interpolateVec3(bone.translation, instance.currentSequenceIndex, instance.animationTime, glm::vec3(0.0f));
-            glm::quat r = interpolateQuat(bone.rotation, instance.currentSequenceIndex, instance.animationTime);
-            glm::vec3 s = interpolateVec3(bone.scale, instance.currentSequenceIndex, instance.animationTime, glm::vec3(1.0f));
-            core::Logger::getInstance().info("Bone ", i, " parent=", bone.parentBone,
-                " pivot=(", bone.pivot.x, ",", bone.pivot.y, ",", bone.pivot.z, ")",
-                " t=(", t.x, ",", t.y, ",", t.z, ")",
-                " r=(", r.w, ",", r.x, ",", r.y, ",", r.z, ")",
-                " s=(", s.x, ",", s.y, ",", s.z, ")",
-                " seqIdx=", instance.currentSequenceIndex);
-        }
-
         // Compose with parent
         if (bone.parentBone >= 0 && static_cast<size_t>(bone.parentBone) < numBones) {
             instance.boneMatrices[i] = instance.boneMatrices[bone.parentBone] * localTransform;
@@ -1751,12 +1856,6 @@ void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
             instance.boneMatrices[i] = localTransform;
         }
     }
-    if (!dumpedOnce) {
-        dumpedOnce = true;
-        // Dump final matrix for bone 0
-        auto& m = instance.boneMatrices[0];
-        core::Logger::getInstance().info("Bone 0 final matrix row0=(", m[0][0], ",", m[1][0], ",", m[2][0], ",", m[3][0], ")");
-    }
 }
 
 glm::mat4 CharacterRenderer::getBoneTransform(const pipeline::M2Bone& bone, float time, int sequenceIndex) {
@@ -1791,22 +1890,19 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
     uint32_t frameIndex = vkCtx_->getCurrentFrame();
     uint32_t frameSlot = frameIndex % 2u;
 
-    // Reset transient material allocations once per frame slot.
-    // beginFrame() waits on this slot's fence before recording.
+    // Reset material ring buffer and descriptor pool once per frame slot.
     if (lastMaterialPoolResetFrame_ != frameIndex) {
-        VmaAllocator alloc = vkCtx_->getAllocator();
-        for (const auto& b : transientMaterialUbos_[frameSlot]) {
-            if (b.first) {
-                vmaDestroyBuffer(alloc, b.first, b.second);
-            }
-        }
-        transientMaterialUbos_[frameSlot].clear();
+        materialRingOffset_[frameSlot] = 0;
         if (materialDescPools_[frameSlot]) {
             vkResetDescriptorPool(vkCtx_->getDevice(), materialDescPools_[frameSlot], 0);
         }
         lastMaterialPoolResetFrame_ = frameIndex;
     }
 
+    // Pre-compute aligned UBO stride for ring buffer sub-allocation
+    const uint32_t uboStride = (sizeof(CharMaterialUBO) + materialUboAlignment_ - 1) & ~(materialUboAlignment_ - 1);
+    const uint32_t ringCapacityBytes = uboStride * MATERIAL_RING_CAPACITY;
+
     // Bind per-frame descriptor set (set 0) -- shared across all draws
     vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
                             pipelineLayout_, 0, 1, &perFrameSet, 0, nullptr);
@@ -1838,9 +1934,8 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
             }
         }
 
-        auto modelIt = models.find(instance.modelId);
-        if (modelIt == models.end()) continue;
-        const auto& gpuModel = modelIt->second;
+        if (!instance.cachedModel) continue;
+        const auto& gpuModel = *instance.cachedModel;
 
         // Skip models without GPU buffers
         if (!gpuModel.vertexBuffer) continue;
@@ -2176,27 +2271,18 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
                 matData.heightMapVariance = batchHeightVariance;
                 matData.normalMapStrength = normalMapStrength_;
 
-                // Create a small UBO for this batch's material
-                VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
-                bci.size = sizeof(CharMaterialUBO);
-                bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
-                VmaAllocationCreateInfo aci{};
-                aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
-                aci.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
-                VmaAllocationInfo allocInfo{};
-                ::VkBuffer matUBO = VK_NULL_HANDLE;
-                VmaAllocation matUBOAlloc = VK_NULL_HANDLE;
-                vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &matUBO, &matUBOAlloc, &allocInfo);
-                if (allocInfo.pMappedData) {
-                    memcpy(allocInfo.pMappedData, &matData, sizeof(CharMaterialUBO));
-                }
+                // Sub-allocate material UBO from ring buffer
+                uint32_t matOffset = materialRingOffset_[frameSlot];
+                if (matOffset + uboStride > ringCapacityBytes) continue; // ring exhausted
+                memcpy(static_cast<char*>(materialRingMapped_[frameSlot]) + matOffset, &matData, sizeof(CharMaterialUBO));
+                materialRingOffset_[frameSlot] = matOffset + uboStride;
 
                 // Write descriptor set: binding 0 = texture, binding 1 = material UBO, binding 2 = normal/height map
                 VkTexture* bindTex = (texPtr && texPtr->isValid()) ? texPtr : whiteTexture_.get();
                 VkDescriptorImageInfo imgInfo = bindTex->descriptorInfo();
                 VkDescriptorBufferInfo bufInfo{};
-                bufInfo.buffer = matUBO;
-                bufInfo.offset = 0;
+                bufInfo.buffer = materialRingBuffer_[frameSlot];
+                bufInfo.offset = matOffset;
                 bufInfo.range = sizeof(CharMaterialUBO);
                 VkDescriptorImageInfo nhImgInfo = normalMap->descriptorInfo();
 
@@ -2229,8 +2315,6 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
                                         pipelineLayout_, 1, 1, &materialSet, 0, nullptr);
 
                 vkCmdDrawIndexed(cmd, batch.indexCount, 1, batch.indexStart, 0, 0);
-
-                transientMaterialUbos_[frameSlot].emplace_back(matUBO, matUBOAlloc);
             }
         } else {
             // Draw entire model with first texture
@@ -2271,24 +2355,16 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
             matData.heightMapVariance = 0.0f;
             matData.normalMapStrength = normalMapStrength_;
 
-            VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
-            bci.size = sizeof(CharMaterialUBO);
-            bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
-            VmaAllocationCreateInfo aci{};
-            aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
-            aci.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
-            VmaAllocationInfo allocInfo{};
-            ::VkBuffer matUBO = VK_NULL_HANDLE;
-            VmaAllocation matUBOAlloc = VK_NULL_HANDLE;
-            vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &matUBO, &matUBOAlloc, &allocInfo);
-            if (allocInfo.pMappedData) {
-                memcpy(allocInfo.pMappedData, &matData, sizeof(CharMaterialUBO));
-            }
+            // Sub-allocate material UBO from ring buffer
+            uint32_t matOffset2 = materialRingOffset_[frameSlot];
+            if (matOffset2 + uboStride > ringCapacityBytes) continue; // ring exhausted
+            memcpy(static_cast<char*>(materialRingMapped_[frameSlot]) + matOffset2, &matData, sizeof(CharMaterialUBO));
+            materialRingOffset_[frameSlot] = matOffset2 + uboStride;
 
             VkDescriptorImageInfo imgInfo = texPtr->descriptorInfo();
             VkDescriptorBufferInfo bufInfo{};
-            bufInfo.buffer = matUBO;
-            bufInfo.offset = 0;
+            bufInfo.buffer = materialRingBuffer_[frameSlot];
+            bufInfo.offset = matOffset2;
             bufInfo.range = sizeof(CharMaterialUBO);
             VkDescriptorImageInfo nhImgInfo2 = flatNormalTexture_->descriptorInfo();
 
@@ -2320,8 +2396,6 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
                                     pipelineLayout_, 1, 1, &materialSet, 0, nullptr);
 
             vkCmdDrawIndexed(cmd, gpuModel.indexCount, 1, 0, 0, 0);
-
-            transientMaterialUbos_[frameSlot].emplace_back(matUBO, matUBOAlloc);
         }
     }
 }
@@ -2513,9 +2587,8 @@ void CharacterRenderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& light
         glm::vec3 diff = inst.position - shadowCenter;
         if (glm::dot(diff, diff) > shadowRadiusSq) continue;
 
-        auto modelIt = models.find(inst.modelId);
-        if (modelIt == models.end()) continue;
-        const M2ModelGPU& gpuModel = modelIt->second;
+        if (!inst.cachedModel) continue;
+        const M2ModelGPU& gpuModel = *inst.cachedModel;
         if (!gpuModel.vertexBuffer) continue;
 
         glm::mat4 modelMat = inst.hasOverrideModelMatrix
diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp
index d76843a0..d455e494 100644
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@@ -678,6 +678,7 @@ void M2Renderer::shutdown() {
     instances.clear();
     spatialGrid.clear();
     instanceIndexById.clear();
+    instanceDedupMap_.clear();
 
     // Delete cached textures
     textureCache.clear();
@@ -1184,6 +1185,10 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
         }
     }
 
+    // Batch all GPU uploads (VB, IB, textures) into a single command buffer
+    // submission with one fence wait, instead of one fence wait per upload.
+    vkCtx_->beginUploadBatch();
+
     if (hasGeometry) {
         // Create VBO with interleaved vertex data
         // Format: position (3), normal (3), texcoord0 (2), texcoord1 (2), boneWeights (4), boneIndices (4 as float)
@@ -1535,6 +1540,8 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
         }
     }
 
+    vkCtx_->endUploadBatch();
+
     // Allocate Vulkan descriptor sets and UBOs for each batch
     for (auto& bgpu : gpuModel.batches) {
         // Create combined UBO for M2Params (binding 1) + M2Material (binding 2)
@@ -1613,17 +1620,16 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
     }
     const auto& mdlRef = modelIt->second;
 
-    // Ground clutter is procedurally scattered and high-count; avoid O(N) dedup
-    // scans that can hitch when new tiles stream in.
+    // Deduplicate (ground detail exempt): skip if the same model already occupies the same
+    // position cell (coordinates rounded to 0.1 units). Hash-map lookup is O(1) instead of an O(N) scan.
     if (!mdlRef.isGroundDetail) {
-        // Deduplicate: skip if same model already at nearly the same position
-        for (const auto& existing : instances) {
-            if (existing.modelId == modelId) {
-                glm::vec3 d = existing.position - position;
-                if (glm::dot(d, d) < 0.01f) {
-                    return existing.id;
-                }
-            }
+        DedupKey dk{modelId,
+                    static_cast<int32_t>(std::round(position.x * 10.0f)),
+                    static_cast<int32_t>(std::round(position.y * 10.0f)),
+                    static_cast<int32_t>(std::round(position.z * 10.0f))};
+        auto dit = instanceDedupMap_.find(dk);
+        if (dit != instanceDedupMap_.end()) {
+            return dit->second;
         }
     }
 
@@ -1651,6 +1657,7 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
     instance.cachedIsInvisibleTrap = mdlRef.isInvisibleTrap;
     instance.cachedIsInstancePortal = mdlRef.isInstancePortal;
     instance.cachedIsValid = mdlRef.isValid();
+    instance.cachedModel = &mdlRef;
 
     // Initialize animation: play first sequence (usually Stand/Idle)
     const auto& mdl = mdlRef;
@@ -1662,6 +1669,15 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
         instance.variationTimer = 3000.0f + static_cast<float>(rand() % 8000);
     }
 
+    // Register in dedup map before pushing (uses original position, not ground-adjusted)
+    if (!mdlRef.isGroundDetail) {
+        DedupKey dk{modelId,
+                    static_cast<int32_t>(std::round(position.x * 10.0f)),
+                    static_cast<int32_t>(std::round(position.y * 10.0f)),
+                    static_cast<int32_t>(std::round(position.z * 10.0f))};
+        instanceDedupMap_[dk] = instance.id;
+    }
+
     instances.push_back(instance);
     size_t idx = instances.size() - 1;
     // Track special instances for fast-path iteration
@@ -1700,13 +1716,15 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
         return 0;
     }
 
-    // Deduplicate: skip if same model already at nearly the same position
-    for (const auto& existing : instances) {
-        if (existing.modelId == modelId) {
-            glm::vec3 d = existing.position - position;
-            if (glm::dot(d, d) < 0.01f) {
-                return existing.id;
-            }
+    // Deduplicate: O(1) hash lookup
+    {
+        DedupKey dk{modelId,
+                    static_cast<int32_t>(std::round(position.x * 10.0f)),
+                    static_cast<int32_t>(std::round(position.y * 10.0f)),
+                    static_cast<int32_t>(std::round(position.z * 10.0f))};
+        auto dit = instanceDedupMap_.find(dk);
+        if (dit != instanceDedupMap_.end()) {
+            return dit->second;
         }
     }
 
@@ -1731,6 +1749,7 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
     instance.cachedIsGroundDetail = mdl2.isGroundDetail;
     instance.cachedIsInvisibleTrap = mdl2.isInvisibleTrap;
     instance.cachedIsValid = mdl2.isValid();
+    instance.cachedModel = &mdl2;
 
     // Initialize animation
     if (mdl2.hasAnimation && !mdl2.disableAnimation && !mdl2.sequences.empty()) {
@@ -1743,6 +1762,15 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
         instance.animTime = static_cast<float>(rand()) / RAND_MAX * 10000.0f;
     }
 
+    // Register in dedup map
+    {
+        DedupKey dk{modelId,
+                    static_cast<int32_t>(std::round(position.x * 10.0f)),
+                    static_cast<int32_t>(std::round(position.y * 10.0f)),
+                    static_cast<int32_t>(std::round(position.z * 10.0f))};
+        instanceDedupMap_[dk] = instance.id;
+    }
+
     instances.push_back(instance);
     size_t idx = instances.size() - 1;
     if (mdl2.isSmoke) {
@@ -2000,9 +2028,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
         instance.animTime += dtMs * (instance.animSpeed - 1.0f);
 
         // For animation looping/variation, we need the actual model data.
-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
-        const M2ModelGPU& model = it->second;
+        if (!instance.cachedModel) continue;
+        const M2ModelGPU& model = *instance.cachedModel;
 
         // Validate sequence index
         if (instance.currentSequenceIndex < 0 ||
@@ -2058,6 +2085,14 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
         float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f);
         if (cullRadius > 0.0f && !updateFrustum.intersectsSphere(instance.position, paddedRadius)) continue;
 
+        // Distance-based frame skipping: update distant bones less frequently
+        uint32_t boneInterval = 1;
+        if (distSq > 200.0f * 200.0f) boneInterval = 8;
+        else if (distSq > 100.0f * 100.0f) boneInterval = 4;
+        else if (distSq > 50.0f * 50.0f) boneInterval = 2;
+        instance.frameSkipCounter++;
+        if ((instance.frameSkipCounter % boneInterval) != 0) continue;
+
         boneWorkIndices_.push_back(idx);
     }
 
@@ -2071,9 +2106,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
             for (size_t i : boneWorkIndices_) {
                 if (i >= instances.size()) continue;
                 auto& inst = instances[i];
-                auto mdlIt = models.find(inst.modelId);
-                if (mdlIt == models.end()) continue;
-                computeBoneMatrices(mdlIt->second, inst);
+                if (!inst.cachedModel) continue;
+                computeBoneMatrices(*inst.cachedModel, inst);
             }
         } else {
             // Parallel — dispatch across worker threads
@@ -2086,9 +2120,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
                 for (size_t i : boneWorkIndices_) {
                     if (i >= instances.size()) continue;
                     auto& inst = instances[i];
-                    auto mdlIt = models.find(inst.modelId);
-                    if (mdlIt == models.end()) continue;
-                    computeBoneMatrices(mdlIt->second, inst);
+                    if (!inst.cachedModel) continue;
+                    computeBoneMatrices(*inst.cachedModel, inst);
                 }
             } else {
                 const size_t chunkSize = animCount / numThreads;
@@ -2109,9 +2142,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
                                 size_t idx = boneWorkIndices_[j];
                                 if (idx >= instances.size()) continue;
                                 auto& inst = instances[idx];
-                                auto mdlIt = models.find(inst.modelId);
-                                if (mdlIt == models.end()) continue;
-                                computeBoneMatrices(mdlIt->second, inst);
+                                if (!inst.cachedModel) continue;
+                                computeBoneMatrices(*inst.cachedModel, inst);
                             }
                         }));
                     start = end;
@@ -2133,9 +2165,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
         glm::vec3 toCam = instance.position - cachedCamPos_;
         float distSq = glm::dot(toCam, toCam);
         if (distSq > cachedMaxRenderDistSq_) continue;
-        auto mdlIt = models.find(instance.modelId);
-        if (mdlIt == models.end()) continue;
-        emitParticles(instance, mdlIt->second, deltaTime);
+        if (!instance.cachedModel) continue;
+        emitParticles(instance, *instance.cachedModel, deltaTime);
         updateParticles(instance, deltaTime);
     }
 
@@ -2839,9 +2870,8 @@ void M2Renderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& lightSpaceMa
             glm::vec3 diff = instance.position - shadowCenter;
             if (glm::dot(diff, diff) > shadowRadiusSq) continue;
 
-            auto modelIt = models.find(instance.modelId);
-            if (modelIt == models.end()) continue;
-            const M2ModelGPU& model = modelIt->second;
+            if (!instance.cachedModel) continue;
+            const M2ModelGPU& model = *instance.cachedModel;
 
             // Filter: only draw foliage models in foliage pass, non-foliage in non-foliage pass
             if (model.shadowWindFoliage != foliagePass) continue;
@@ -2947,8 +2977,7 @@ std::vector<glm::vec3> M2Renderer::getWaterVegetationPositions(const glm::vec3&
     std::vector<glm::vec3> result;
     float maxDistSq = maxDist * maxDist;
     for (const auto& inst : instances) {
-        auto it = models.find(inst.modelId);
-        if (it == models.end() || !it->second.isWaterVegetation) continue;
+        if (!inst.cachedModel || !inst.cachedModel->isWaterVegetation) continue;
         glm::vec3 diff = inst.position - camPos;
         if (glm::dot(diff, diff) <= maxDistSq) {
             result.push_back(inst.position);
@@ -3059,9 +3088,8 @@ void M2Renderer::emitParticles(M2Instance& inst, const M2ModelGPU& gpu, float dt
 }
 
 void M2Renderer::updateParticles(M2Instance& inst, float dt) {
-    auto it = models.find(inst.modelId);
-    if (it == models.end()) return;
-    const auto& gpu = it->second;
+    if (!inst.cachedModel) return;
+    const auto& gpu = *inst.cachedModel;
 
     for (size_t i = 0; i < inst.particles.size(); ) {
         auto& p = inst.particles[i];
@@ -3136,9 +3164,8 @@ void M2Renderer::renderM2Particles(VkCommandBuffer cmd, VkDescriptorSet perFrame
 
     for (auto& inst : instances) {
         if (inst.particles.empty()) continue;
-        auto it = models.find(inst.modelId);
-        if (it == models.end()) continue;
-        const auto& gpu = it->second;
+        if (!inst.cachedModel) continue;
+        const auto& gpu = *inst.cachedModel;
 
         for (const auto& p : inst.particles) {
             if (p.emitterIndex < 0 || p.emitterIndex >= static_cast<int>(gpu.particleEmitters.size())) continue;
@@ -3477,6 +3504,7 @@ void M2Renderer::clear() {
     instances.clear();
     spatialGrid.clear();
     instanceIndexById.clear();
+    instanceDedupMap_.clear();
     smokeParticles.clear();
     smokeInstanceIndices_.clear();
     portalInstanceIndices_.clear();
@@ -3513,6 +3541,7 @@ M2Renderer::GridCell M2Renderer::toCell(const glm::vec3& p) const {
 void M2Renderer::rebuildSpatialIndex() {
     spatialGrid.clear();
     instanceIndexById.clear();
+    instanceDedupMap_.clear();
     instanceIndexById.reserve(instances.size());
     smokeInstanceIndices_.clear();
     portalInstanceIndices_.clear();
@@ -3521,9 +3550,22 @@ void M2Renderer::rebuildSpatialIndex() {
     particleInstanceIndices_.clear();
 
     for (size_t i = 0; i < instances.size(); i++) {
-        const auto& inst = instances[i];
+        auto& inst = instances[i];
         instanceIndexById[inst.id] = i;
 
+        // Re-cache model pointer (may have changed after model map modifications)
+        auto mdlIt = models.find(inst.modelId);
+        inst.cachedModel = (mdlIt != models.end()) ? &mdlIt->second : nullptr;
+
+        // Rebuild dedup map (skip ground detail)
+        if (!inst.cachedIsGroundDetail) {
+            DedupKey dk{inst.modelId,
+                        static_cast<int32_t>(std::round(inst.position.x * 10.0f)),
+                        static_cast<int32_t>(std::round(inst.position.y * 10.0f)),
+                        static_cast<int32_t>(std::round(inst.position.z * 10.0f))};
+            instanceDedupMap_[dk] = inst.id;
+        }
+
         if (inst.cachedIsSmoke) {
             smokeInstanceIndices_.push_back(i);
         }
@@ -3647,8 +3689,18 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) {
         containsToken(key, "campfire") ||
         containsToken(key, "bonfire");
 
-    // Load BLP texture
-    pipeline::BLPImage blp = assetManager->loadTexture(key);
+    // Check pre-decoded BLP cache first (populated by background worker threads)
+    pipeline::BLPImage blp;
+    if (predecodedBLPCache_) {
+        auto pit = predecodedBLPCache_->find(key);
+        if (pit != predecodedBLPCache_->end()) {
+            blp = std::move(pit->second);
+            predecodedBLPCache_->erase(pit);
+        }
+    }
+    if (!blp.isValid()) {
+        blp = assetManager->loadTexture(key);
+    }
     if (!blp.isValid()) {
         // Return white fallback but don't cache the failure — MPQ reads can
         // fail transiently during streaming; allow retry on next model load.
@@ -3714,9 +3766,8 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) {
 uint32_t M2Renderer::getTotalTriangleCount() const {
     uint32_t total = 0;
     for (const auto& instance : instances) {
-        auto it = models.find(instance.modelId);
-        if (it != models.end()) {
-            total += it->second.indexCount / 3;
+        if (instance.cachedModel) {
+            total += instance.cachedModel->indexCount / 3;
         }
     }
     return total;
@@ -3738,11 +3789,10 @@ std::optional<float> M2Renderer::getFloorHeight(float glX, float glY, float glZ,
             continue;
         }
 
-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
+        if (!instance.cachedModel) continue;
         if (instance.scale <= 0.001f) continue;
 
-        const M2ModelGPU& model = it->second;
+        const M2ModelGPU& model = *instance.cachedModel;
         if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
         if (instance.skipCollision) continue;
 
@@ -3894,10 +3944,9 @@ bool M2Renderer::checkCollision(const glm::vec3& from, const glm::vec3& to,
         if (from.z > instance.worldBoundsMax.z + 2.5f && adjustedPos.z > instance.worldBoundsMax.z + 2.5f) continue;
         if (from.z + 2.5f < instance.worldBoundsMin.z && adjustedPos.z + 2.5f < instance.worldBoundsMin.z) continue;
 
-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
+        if (!instance.cachedModel) continue;
 
-        const M2ModelGPU& model = it->second;
+        const M2ModelGPU& model = *instance.cachedModel;
         if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
         if (instance.skipCollision) continue;
         if (instance.scale <= 0.001f) continue;
@@ -4135,10 +4184,9 @@ float M2Renderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3&
             continue;
         }
 
-        auto it = models.find(instance.modelId);
-        if (it == models.end()) continue;
+        if (!instance.cachedModel) continue;
 
-        const M2ModelGPU& model = it->second;
+        const M2ModelGPU& model = *instance.cachedModel;
         if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
         glm::vec3 localMin, localMax;
         getTightCollisionBounds(model, localMin, localMax);
diff --git a/src/rendering/renderer.cpp b/src/rendering/renderer.cpp
index 5f3e48ae..55ba1370 100644
--- a/src/rendering/renderer.cpp
+++ b/src/rendering/renderer.cpp
@@ -2434,6 +2434,9 @@ void Renderer::update(float deltaTime) {
         cameraController->update(deltaTime);
         auto cameraEnd = std::chrono::steady_clock::now();
         lastCameraUpdateMs = std::chrono::duration<double, std::milli>(cameraEnd - cameraStart).count();
+        if (lastCameraUpdateMs > 3.0) {
+            LOG_WARNING("SLOW cameraController->update: ", lastCameraUpdateMs, "ms");
+        }
 
         // Update 3D audio listener position/orientation to match camera
         if (camera) {
@@ -2527,7 +2530,13 @@ void Renderer::update(float deltaTime) {
 
     // Update terrain streaming
     if (terrainManager && camera) {
+        auto terrStart = std::chrono::steady_clock::now();
         terrainManager->update(*camera, deltaTime);
+        float terrMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - terrStart).count();
+        if (terrMs > 5.0f) {
+            LOG_WARNING("SLOW terrainManager->update: ", terrMs, "ms");
+        }
     }
 
     // Update sky system (skybox time, star twinkle, clouds, celestial moon phases)
@@ -2579,7 +2588,14 @@ void Renderer::update(float deltaTime) {
 
     // Update character animations
     if (characterRenderer && camera) {
+        auto charAnimStart = std::chrono::steady_clock::now();
         characterRenderer->update(deltaTime, camera->getPosition());
+        float charAnimMs = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - charAnimStart).count();
+        if (charAnimMs > 5.0f) {
+            LOG_WARNING("SLOW characterRenderer->update: ", charAnimMs, "ms (",
+                        characterRenderer->getInstanceCount(), " instances)");
+        }
     }
 
     // Update AudioEngine (cleanup finished sounds, etc.)
@@ -2766,8 +2782,15 @@ void Renderer::update(float deltaTime) {
 
     // Update M2 doodad animations (pass camera for frustum-culling bone computation)
     if (m2Renderer && camera) {
+        auto m2Start = std::chrono::steady_clock::now();
         m2Renderer->update(deltaTime, camera->getPosition(),
                            camera->getProjectionMatrix() * camera->getViewMatrix());
+        float m2Ms = std::chrono::duration<float, std::milli>(
+            std::chrono::steady_clock::now() - m2Start).count();
+        if (m2Ms > 3.0f) {
+            LOG_WARNING("SLOW m2Renderer->update: ", m2Ms, "ms (",
+                        m2Renderer->getInstanceCount(), " instances)");
+        }
     }
 
     // Helper: play zone music, dispatching local files (file: prefix) vs MPQ paths
diff --git a/src/rendering/terrain_manager.cpp b/src/rendering/terrain_manager.cpp
index b164d969..97527c8c 100644
--- a/src/rendering/terrain_manager.cpp
+++ b/src/rendering/terrain_manager.cpp
@@ -1,5 +1,6 @@
 #include "rendering/terrain_manager.hpp"
 #include "rendering/terrain_renderer.hpp"
+#include "rendering/vk_context.hpp"
 #include "rendering/water_renderer.hpp"
 #include "rendering/m2_renderer.hpp"
 #include "rendering/wmo_renderer.hpp"
@@ -53,12 +54,12 @@ int computeTerrainWorkerCount() {
 
     unsigned hc = std::thread::hardware_concurrency();
     if (hc > 0) {
-        // Terrain streaming should leave CPU room for render/update threads.
-        const unsigned availableCores = (hc > 1u) ? (hc - 1u) : 1u;
-        const unsigned targetWorkers = std::max(2u, availableCores / 2u);
+        // Use most cores for loading — leave 1-2 for render/update threads.
+        const unsigned reserved = (hc >= 8u) ? 2u : 1u;
+        const unsigned targetWorkers = std::max(4u, hc - reserved);
         return static_cast<int>(targetWorkers);
     }
-    return 2;  // Fallback
+    return 4;  // Fallback
 }
 
 bool decodeLayerAlpha(const pipeline::MapChunk& chunk, size_t layerIdx, std::vector<uint8_t>& outAlpha) {
@@ -230,9 +231,14 @@ bool TerrainManager::loadTile(int x, int y) {
         return false;
     }
 
+    VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
+    if (vkCtx) vkCtx->beginUploadBatch();
+
     FinalizingTile ft;
     ft.pending = std::move(pending);
     while (!advanceFinalization(ft)) {}
+
+    if (vkCtx) vkCtx->endUploadBatchSync();  // Sync — caller expects tile ready
     return true;
 }
 
@@ -372,6 +378,15 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
                                    int& skippedSkinNotFound) -> bool {
         if (preparedModelIds.find(modelId) != preparedModelIds.end()) return true;
 
+        // Skip file I/O + parsing for models already uploaded to GPU from previous tiles
+        {
+            std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
+            if (uploadedM2Ids_.count(modelId)) {
+                preparedModelIds.insert(modelId);
+                return true;
+            }
+        }
+
         std::vector<uint8_t> m2Data = assetManager->readFile(m2Path);
         if (m2Data.empty()) {
             skippedFileNotFound++;
@@ -397,6 +412,20 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
             return false;
         }
 
+        // Pre-decode M2 model textures on background thread
+        for (const auto& tex : m2Model.textures) {
+            if (tex.filename.empty()) continue;
+            std::string texKey = tex.filename;
+            std::replace(texKey.begin(), texKey.end(), '/', '\\');
+            std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+            if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue;
+            auto blp = assetManager->loadTexture(texKey);
+            if (blp.isValid()) {
+                pending->preloadedM2Textures[texKey] = std::move(blp);
+            }
+        }
+
         PendingTile::M2Ready ready;
         ready.modelId = modelId;
         ready.model = std::move(m2Model);
@@ -551,19 +580,44 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
                         }
 
                         uint32_t doodadModelId = static_cast<uint32_t>(std::hash<std::string>{}(m2Path));
-                        std::vector<uint8_t> m2Data = assetManager->readFile(m2Path);
-                        if (m2Data.empty()) continue;
 
-                        pipeline::M2Model m2Model = pipeline::M2Loader::load(m2Data);
-                        if (m2Model.name.empty()) {
-                            m2Model.name = m2Path;
+                        // Skip file I/O if model already uploaded from a previous tile
+                        bool modelAlreadyUploaded = false;
+                        {
+                            std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
+                            modelAlreadyUploaded = uploadedM2Ids_.count(doodadModelId) > 0;
                         }
-                        std::string skinPath = m2Path.substr(0, m2Path.size() - 3) + "00.skin";
-                        std::vector<uint8_t> skinData = assetManager->readFile(skinPath);
-                        if (!skinData.empty() && m2Model.version >= 264) {
-                            pipeline::M2Loader::loadSkin(skinData, m2Model);
+
+                        pipeline::M2Model m2Model;
+                        if (!modelAlreadyUploaded) {
+                            std::vector<uint8_t> m2Data = assetManager->readFile(m2Path);
+                            if (m2Data.empty()) continue;
+
+                            m2Model = pipeline::M2Loader::load(m2Data);
+                            if (m2Model.name.empty()) {
+                                m2Model.name = m2Path;
+                            }
+                            std::string skinPath = m2Path.substr(0, m2Path.size() - 3) + "00.skin";
+                            std::vector<uint8_t> skinData = assetManager->readFile(skinPath);
+                            if (!skinData.empty() && m2Model.version >= 264) {
+                                pipeline::M2Loader::loadSkin(skinData, m2Model);
+                            }
+                            if (!m2Model.isValid()) continue;
+
+                            // Pre-decode doodad M2 textures on background thread
+                            for (const auto& tex : m2Model.textures) {
+                                if (tex.filename.empty()) continue;
+                                std::string texKey = tex.filename;
+                                std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                                std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                               [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                                if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue;
+                                auto blp = assetManager->loadTexture(texKey);
+                                if (blp.isValid()) {
+                                    pending->preloadedM2Textures[texKey] = std::move(blp);
+                                }
+                            }
                         }
-                        if (!m2Model.isValid()) continue;
 
                         // Build doodad's local transform (WoW coordinates)
                         // WMO doodads use quaternion rotation
@@ -633,6 +687,32 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
                     }
                 }
 
+                // Pre-decode WMO textures on background thread
+                for (const auto& texPath : wmoModel.textures) {
+                    if (texPath.empty()) continue;
+                    std::string texKey = texPath;
+                    // Truncate at NUL (WMO paths can have stray bytes)
+                    size_t nul = texKey.find('\0');
+                    if (nul != std::string::npos) texKey.resize(nul);
+                    std::replace(texKey.begin(), texKey.end(), '/', '\\');
+                    std::transform(texKey.begin(), texKey.end(), texKey.begin(),
+                                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                    if (texKey.empty()) continue;
+                    // Resolve the .blp variant first: entries are stored under blpKey, so dedup must use it too
+                    std::string blpKey = texKey;
+                    if (blpKey.size() >= 4) {
+                        std::string ext = blpKey.substr(blpKey.size() - 4);
+                        if (ext == ".tga" || ext == ".dds") {
+                            blpKey = blpKey.substr(0, blpKey.size() - 4) + ".blp";
+                        }
+                    }
+                    if (pending->preloadedWMOTextures.find(blpKey) != pending->preloadedWMOTextures.end()) continue;
+                    auto blp = assetManager->loadTexture(blpKey);
+                    if (blp.isValid()) {
+                        pending->preloadedWMOTextures[blpKey] = std::move(blp);
+                    }
+                }
+
                 PendingTile::WMOReady ready;
                 // Cache WMO model uploads by path; placement dedup uses uniqueId separately.
                 ready.modelId = static_cast<uint32_t>(std::hash<std::string>{}(wmoPath));
@@ -695,27 +775,39 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
             return true;
         }
 
-        LOG_DEBUG("Finalizing tile [", x, ",", y, "] (incremental)");
-
-        // Upload pre-loaded textures
-        if (!pending->preloadedTextures.empty()) {
-            terrainRenderer->uploadPreloadedTextures(pending->preloadedTextures);
-        }
-
-        // Upload terrain mesh to GPU
-        if (!terrainRenderer->loadTerrain(pending->mesh, pending->terrain.textures, x, y)) {
-            LOG_ERROR("Failed to upload terrain to GPU for tile [", x, ",", y, "]");
-            failedTiles[coord] = true;
-            {
-                std::lock_guard<std::mutex> lock(queueMutex);
-                pendingTiles.erase(coord);
+        // Upload pre-loaded textures (once)
+        if (!ft.terrainPreloaded) {
+            LOG_DEBUG("Finalizing tile [", x, ",", y, "] (incremental)");
+            if (!pending->preloadedTextures.empty()) {
+                terrainRenderer->uploadPreloadedTextures(pending->preloadedTextures);
             }
-            ft.phase = FinalizationPhase::DONE;
-            return true;
+            ft.terrainPreloaded = true;
+            // Yield after preload so the caller can stop between finalization steps
+            return false;
         }
 
-        // Load water immediately after terrain (same frame) — water is now
-        // deduplicated to ~1-2 merged surfaces per tile, so this is fast.
+        // Upload terrain chunks incrementally (32 per call to spread across frames)
+        if (!ft.terrainMeshDone) {
+            if (pending->mesh.validChunkCount == 0) {
+                LOG_ERROR("Failed to upload terrain to GPU for tile [", x, ",", y, "]");
+                failedTiles[coord] = true;
+                {
+                    std::lock_guard<std::mutex> lock(queueMutex);
+                    pendingTiles.erase(coord);
+                }
+                ft.phase = FinalizationPhase::DONE;
+                return true;
+            }
+            bool allDone = terrainRenderer->loadTerrainIncremental(
+                pending->mesh, pending->terrain.textures, x, y,
+                ft.terrainChunkNext, 32);
+            if (!allDone) {
+                return false; // More chunks remain — yield to time budget
+            }
+            ft.terrainMeshDone = true;
+        }
+
+        // Load water after all terrain chunks are uploaded
         if (waterRenderer) {
             size_t beforeSurfaces = waterRenderer->getSurfaceCount();
             waterRenderer->loadFromTerrain(pending->terrain, true, x, y);
@@ -738,13 +830,24 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
     }
 
     case FinalizationPhase::M2_MODELS: {
-        // Upload ONE M2 model per call
+        // Upload multiple M2 models per call (batched GPU uploads)
         if (m2Renderer && ft.m2ModelIndex < pending->m2Models.size()) {
-            auto& m2Ready = pending->m2Models[ft.m2ModelIndex];
-            if (m2Renderer->loadModel(m2Ready.model, m2Ready.modelId)) {
-                ft.uploadedM2ModelIds.insert(m2Ready.modelId);
+            // Set pre-decoded BLP cache so loadTexture() skips main-thread BLP decode
+            m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
+            constexpr size_t kModelsPerStep = 4;
+            size_t uploaded = 0;
+            while (ft.m2ModelIndex < pending->m2Models.size() && uploaded < kModelsPerStep) {
+                auto& m2Ready = pending->m2Models[ft.m2ModelIndex];
+                if (m2Renderer->loadModel(m2Ready.model, m2Ready.modelId)) {
+                    ft.uploadedM2ModelIds.insert(m2Ready.modelId);
+                    // Track uploaded model IDs so background threads can skip re-reading
+                    std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
+                    uploadedM2Ids_.insert(m2Ready.modelId);
+                }
+                ft.m2ModelIndex++;
+                uploaded++;
             }
-            ft.m2ModelIndex++;
+            m2Renderer->setPredecodedBLPCache(nullptr);
             // Stay in this phase until all models uploaded
             if (ft.m2ModelIndex < pending->m2Models.size()) {
                 return false;
@@ -786,22 +889,28 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
     }
 
     case FinalizationPhase::WMO_MODELS: {
-        // Upload ONE WMO model per call
+        // Upload multiple WMO models per call (batched GPU uploads)
         if (wmoRenderer && assetManager) {
             wmoRenderer->initialize(nullptr, VK_NULL_HANDLE, assetManager);
+            // Set pre-decoded BLP cache and defer normal maps during streaming
+            wmoRenderer->setPredecodedBLPCache(&pending->preloadedWMOTextures);
+            wmoRenderer->setDeferNormalMaps(true);
 
-            if (ft.wmoModelIndex < pending->wmoModels.size()) {
+            constexpr size_t kWmosPerStep = 1;
+            size_t uploaded = 0;
+            while (ft.wmoModelIndex < pending->wmoModels.size() && uploaded < kWmosPerStep) {
                 auto& wmoReady = pending->wmoModels[ft.wmoModelIndex];
-                // Deduplicate
                 if (wmoReady.uniqueId != 0 && placedWmoIds.count(wmoReady.uniqueId)) {
                     ft.wmoModelIndex++;
-                    if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
                 } else {
                     wmoRenderer->loadModel(wmoReady.model, wmoReady.modelId);
                     ft.wmoModelIndex++;
-                    if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
+                    uploaded++;
                 }
             }
+            wmoRenderer->setDeferNormalMaps(false);
+            wmoRenderer->setPredecodedBLPCache(nullptr);
+            if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
         }
         ft.phase = FinalizationPhase::WMO_INSTANCES;
         return false;
@@ -862,17 +971,28 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
     }
 
     case FinalizationPhase::WMO_DOODADS: {
-        // Upload ONE WMO doodad M2 per call
+        // Upload multiple WMO doodad M2s per call (batched GPU uploads)
         if (m2Renderer && ft.wmoDoodadIndex < pending->wmoDoodads.size()) {
-            auto& doodad = pending->wmoDoodads[ft.wmoDoodadIndex];
-            m2Renderer->loadModel(doodad.model, doodad.modelId);
-            uint32_t wmoDoodadInstId = m2Renderer->createInstanceWithMatrix(
-                doodad.modelId, doodad.modelMatrix, doodad.worldPosition);
-            if (wmoDoodadInstId) {
-                m2Renderer->setSkipCollision(wmoDoodadInstId, true);
-                ft.m2InstanceIds.push_back(wmoDoodadInstId);
+            // Set pre-decoded BLP cache for doodad M2 textures
+            m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
+            constexpr size_t kDoodadsPerStep = 4;
+            size_t uploaded = 0;
+            while (ft.wmoDoodadIndex < pending->wmoDoodads.size() && uploaded < kDoodadsPerStep) {
+                auto& doodad = pending->wmoDoodads[ft.wmoDoodadIndex];
+                if (m2Renderer->loadModel(doodad.model, doodad.modelId)) {
+                    std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
+                    uploadedM2Ids_.insert(doodad.modelId);
+                }
+                uint32_t wmoDoodadInstId = m2Renderer->createInstanceWithMatrix(
+                    doodad.modelId, doodad.modelMatrix, doodad.worldPosition);
+                if (wmoDoodadInstId) {
+                    m2Renderer->setSkipCollision(wmoDoodadInstId, true);
+                    ft.m2InstanceIds.push_back(wmoDoodadInstId);
+                }
+                ft.wmoDoodadIndex++;
+                uploaded++;
             }
-            ft.wmoDoodadIndex++;
+            m2Renderer->setPredecodedBLPCache(nullptr);
             if (ft.wmoDoodadIndex < pending->wmoDoodads.size()) return false;
         }
         ft.phase = FinalizationPhase::WATER;
@@ -1030,11 +1150,6 @@ void TerrainManager::workerLoop() {
 }
 
 void TerrainManager::processReadyTiles() {
-    // Process tiles with time budget to avoid frame spikes
-    // Taxi mode gets a slightly larger budget to avoid visible late-pop terrain/models.
-    const float timeBudgetMs = taxiStreamingMode_ ? 8.0f : 5.0f;
-    auto startTime = std::chrono::high_resolution_clock::now();
-
     // Move newly ready tiles into the finalizing deque.
     // Keep them in pendingTiles so streamTiles() won't re-enqueue them.
     {
@@ -1050,21 +1165,32 @@ void TerrainManager::processReadyTiles() {
         }
     }
 
-    // Drive incremental finalization within time budget
-    while (!finalizingTiles_.empty()) {
+    VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
+
+    // Reclaim completed async uploads from previous frames (non-blocking)
+    if (vkCtx) vkCtx->pollUploadBatches();
+
+    // Nothing to finalize — done.
+    if (finalizingTiles_.empty()) return;
+
+    // Async upload batch: record GPU copies into a command buffer, submit with
+    // a fence, but DON'T wait.  The fence is polled on subsequent frames.
+    // This eliminates the main-thread stall from vkWaitForFences entirely.
+    const int maxSteps = taxiStreamingMode_ ? 8 : 2;
+    int steps = 0;
+
+    if (vkCtx) vkCtx->beginUploadBatch();
+
+    while (!finalizingTiles_.empty() && steps < maxSteps) {
         auto& ft = finalizingTiles_.front();
         bool done = advanceFinalization(ft);
-
         if (done) {
             finalizingTiles_.pop_front();
         }
-
-        auto now = std::chrono::high_resolution_clock::now();
-        float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
-        if (elapsedMs >= timeBudgetMs) {
-            break;
-        }
+        steps++;
     }
+
+    if (vkCtx) vkCtx->endUploadBatch();  // Async — submits but doesn't wait
 }
 
 void TerrainManager::processAllReadyTiles() {
@@ -1082,12 +1208,19 @@ void TerrainManager::processAllReadyTiles() {
             }
         }
     }
+
+    // Batch all GPU uploads across all tiles into a single submission
+    VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
+    if (vkCtx) vkCtx->beginUploadBatch();
+
     // Finalize all tiles completely (no time budget — used for loading screens)
     while (!finalizingTiles_.empty()) {
         auto& ft = finalizingTiles_.front();
         while (!advanceFinalization(ft)) {}
         finalizingTiles_.pop_front();
     }
+
+    if (vkCtx) vkCtx->endUploadBatchSync();  // Sync — load screen needs data ready
 }
 
 void TerrainManager::processOneReadyTile() {
@@ -1106,9 +1239,14 @@ void TerrainManager::processOneReadyTile() {
     }
     // Finalize ONE tile completely, then return so caller can update the screen
     if (!finalizingTiles_.empty()) {
+        VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
+        if (vkCtx) vkCtx->beginUploadBatch();
+
         auto& ft = finalizingTiles_.front();
         while (!advanceFinalization(ft)) {}
         finalizingTiles_.pop_front();
+
+        if (vkCtx) vkCtx->endUploadBatchSync();  // Sync — load screen needs data ready
     }
 }
 
@@ -1328,6 +1466,10 @@ void TerrainManager::unloadAll() {
     finalizingTiles_.clear();
     placedDoodadIds.clear();
     placedWmoIds.clear();
+    {
+        std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
+        uploadedM2Ids_.clear();
+    }
 
     LOG_INFO("Unloading all terrain tiles");
     loadedTiles.clear();
@@ -1376,6 +1518,10 @@ void TerrainManager::softReset() {
     finalizingTiles_.clear();
     placedDoodadIds.clear();
     placedWmoIds.clear();
+    {
+        std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
+        uploadedM2Ids_.clear();
+    }
 
     // Clear tile cache — keys are (x,y) without map name, so stale entries from
     // a different map with overlapping coordinates would produce wrong geometry.
diff --git a/src/rendering/terrain_renderer.cpp b/src/rendering/terrain_renderer.cpp
index 6e312233..fb20ce42 100644
--- a/src/rendering/terrain_renderer.cpp
+++ b/src/rendering/terrain_renderer.cpp
@@ -326,6 +326,8 @@ bool TerrainRenderer::loadTerrain(const pipeline::TerrainMesh& mesh,
     }
     LOG_DEBUG("Loading terrain mesh: ", mesh.validChunkCount, " chunks");
 
+    vkCtx->beginUploadBatch();
+
     for (int y = 0; y < 16; y++) {
         for (int x = 0; x < 16; x++) {
             const auto& chunk = mesh.getChunk(x, y);
@@ -405,10 +407,102 @@ bool TerrainRenderer::loadTerrain(const pipeline::TerrainMesh& mesh,
         }
     }
 
+    vkCtx->endUploadBatch();
+
     LOG_DEBUG("Loaded ", chunks.size(), " terrain chunks to GPU");
     return !chunks.empty();
 }
 
+bool TerrainRenderer::loadTerrainIncremental(const pipeline::TerrainMesh& mesh,
+                                              const std::vector<std::string>& texturePaths,
+                                              int tileX, int tileY,
+                                              int& chunkIndex, int maxChunksPerCall) {
+    // Uploads up to maxChunksPerCall chunks starting at chunkIndex (advanced in place); returns true
+    // once chunkIndex reaches 256. GPU copies are recorded into the (possibly nested) upload batch.
+    vkCtx->beginUploadBatch();
+
+    int uploaded = 0;  // counts only chunks actually pushed to `chunks`
+    while (chunkIndex < 256 && uploaded < maxChunksPerCall) {
+        int cy = chunkIndex / 16;  // 256 indices map onto the 16x16 chunk grid
+        int cx = chunkIndex % 16;
+        chunkIndex++;  // advance first so chunks skipped below are not revisited on the next call
+
+        const auto& chunk = mesh.getChunk(cx, cy);
+        if (!chunk.isValid()) continue;  // skipped chunks do not count against maxChunksPerCall
+
+        TerrainChunkGPU gpuChunk = uploadChunk(chunk);
+        if (!gpuChunk.isValid()) continue;  // failed upload: chunk is dropped without an error
+
+        calculateBoundingSphere(gpuChunk, chunk);
+
+        if (!chunk.layers.empty()) {
+            uint32_t baseTexId = chunk.layers[0].textureId;  // layer 0 supplies the base texture
+            if (baseTexId < texturePaths.size()) {
+                gpuChunk.baseTexture = loadTexture(texturePaths[baseTexId]);
+            } else {
+                gpuChunk.baseTexture = whiteTexture.get();  // out-of-range id: fall back to white
+            }
+
+            for (size_t i = 1; i < chunk.layers.size() && i < 4; i++) {  // at most three extra layers
+                const auto& layer = chunk.layers[i];
+                int li = static_cast<int>(i) - 1;  // extra layer i goes into slot i-1
+
+                VkTexture* layerTex = whiteTexture.get();
+                if (layer.textureId < texturePaths.size()) {
+                    layerTex = loadTexture(texturePaths[layer.textureId]);
+                }
+                gpuChunk.layerTextures[li] = layerTex;
+
+                VkTexture* alphaTex = opaqueAlphaTexture.get();  // default when the layer has no alpha data
+                if (!layer.alphaData.empty()) {
+                    alphaTex = createAlphaTexture(layer.alphaData);
+                }
+                gpuChunk.alphaTextures[li] = alphaTex;
+                gpuChunk.layerCount = static_cast<int>(i);  // ends as the index of the last extra layer processed
+            }
+        } else {
+            gpuChunk.baseTexture = whiteTexture.get();
+        }
+
+        gpuChunk.tileX = tileX;
+        gpuChunk.tileY = tileY;
+
+        TerrainParamsUBO params{};
+        params.layerCount = gpuChunk.layerCount;
+        params.hasLayer1 = gpuChunk.layerCount >= 1 ? 1 : 0;
+        params.hasLayer2 = gpuChunk.layerCount >= 2 ? 1 : 0;
+        params.hasLayer3 = gpuChunk.layerCount >= 3 ? 1 : 0;
+
+        VkBufferCreateInfo bufCI{};
+        bufCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+        bufCI.size = sizeof(TerrainParamsUBO);
+        bufCI.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+
+        VmaAllocationCreateInfo allocCI{};
+        allocCI.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+        allocCI.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+
+        VmaAllocationInfo mapInfo{};
+        vmaCreateBuffer(vkCtx->getAllocator(), &bufCI, &allocCI,
+                        &gpuChunk.paramsUBO, &gpuChunk.paramsAlloc, &mapInfo);  // NOTE(review): VkResult is ignored
+        if (mapInfo.pMappedData) {
+            std::memcpy(mapInfo.pMappedData, &params, sizeof(params));
+        }
+
+        gpuChunk.materialSet = allocateMaterialSet();
+        if (gpuChunk.materialSet) {
+            writeMaterialDescriptors(gpuChunk.materialSet, gpuChunk);
+        }
+
+        chunks.push_back(std::move(gpuChunk));
+        uploaded++;
+    }
+
+    vkCtx->endUploadBatch();  // async end: does not wait on the GPU; a no-op submit-wise if an outer batch is open
+
+    return chunkIndex >= 256;
+}
+
 TerrainChunkGPU TerrainRenderer::uploadChunk(const pipeline::ChunkMesh& chunk) {
     TerrainChunkGPU gpuChunk;
 
@@ -496,6 +590,9 @@ void TerrainRenderer::uploadPreloadedTextures(
                        [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
         return key;
     };
+    // Batch all texture uploads into a single command buffer submission
+    vkCtx->beginUploadBatch();
+
     for (const auto& [path, blp] : textures) {
         std::string key = normalizeKey(path);
         if (textureCache.find(key) != textureCache.end()) continue;
@@ -515,6 +612,8 @@ void TerrainRenderer::uploadPreloadedTextures(
         textureCacheBytes_ += e.approxBytes;
         textureCache[key] = std::move(e);
     }
+
+    vkCtx->endUploadBatch();
 }
 
 VkTexture* TerrainRenderer::createAlphaTexture(const std::vector<uint8_t>& alphaData) {
diff --git a/src/rendering/vk_context.cpp b/src/rendering/vk_context.cpp
index e1a76cee..79e7eac3 100644
--- a/src/rendering/vk_context.cpp
+++ b/src/rendering/vk_context.cpp
@@ -67,6 +67,14 @@ void VkContext::shutdown() {
         frame = {};
     }
 
+    // Clean up any in-flight async upload batches (device already idle)
+    for (auto& batch : inFlightBatches_) {
+        // Staging buffers: skip destroy — allocator is about to be torn down
+        vkDestroyFence(device, batch.fence, nullptr);
+        // Command buffer freed when pool is destroyed below
+    }
+    inFlightBatches_.clear();
+
     if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; }
     if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; }
 
@@ -1423,10 +1431,121 @@ void VkContext::endSingleTimeCommands(VkCommandBuffer cmd) {
 }
 
 void VkContext::immediateSubmit(std::function<void(VkCommandBuffer cmd)>&& function) {
+    if (inUploadBatch_) {
+        // Batch active: record only. NOTE: batch end submits only if a staging buffer was deferred, else these commands are dropped
+        function(batchCmd_);
+        return;
+    }
     VkCommandBuffer cmd = beginSingleTimeCommands();
     function(cmd);
     endSingleTimeCommands(cmd);
 }
 
+void VkContext::beginUploadBatch() {
+    uploadBatchDepth_++;  // counts nested begin/end pairs; only the outermost end finishes the batch
+    if (inUploadBatch_) return; // already in a batch (nested call)
+    inUploadBatch_ = true;  // from here immediateSubmit() records into batchCmd_ instead of submitting
+    batchCmd_ = beginSingleTimeCommands();
+}
+
+void VkContext::endUploadBatch() {
+    // Closes one begin/end pair; the outermost end submits the batch without waiting on the GPU.
+    if (uploadBatchDepth_ <= 0) return;
+    uploadBatchDepth_--;
+    if (uploadBatchDepth_ > 0) return; // still inside an outer batch
+    inUploadBatch_ = false;
+    vkEndCommandBuffer(batchCmd_);
+    if (batchStagingBuffers_.empty()) {
+        // No GPU copies were recorded — skip the submit entirely.
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
+        batchCmd_ = VK_NULL_HANDLE;
+        return;
+    }
+
+    // Submit commands with a NEW fence — don't wait, let GPU work in parallel.
+    VkFenceCreateInfo fenceInfo{};
+    fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+    VkFence fence = VK_NULL_HANDLE;
+    if (vkCreateFence(device, &fenceInfo, nullptr, &fence) != VK_SUCCESS) fence = VK_NULL_HANDLE;
+    VkSubmitInfo submitInfo{};
+    submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    submitInfo.commandBufferCount = 1;
+    submitInfo.pCommandBuffers = &batchCmd_;
+    const bool submitted = vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence) == VK_SUCCESS;
+    if (submitted && fence != VK_NULL_HANDLE) {
+        InFlightBatch batch;  // reclaimed by pollUploadBatches()/waitAllUploads() once the fence signals
+        batch.fence = fence;
+        batch.cmd = batchCmd_;
+        batch.stagingBuffers = std::move(batchStagingBuffers_);
+        inFlightBatches_.push_back(std::move(batch));
+    } else {  // no fence or submit rejected: nothing would ever signal, so release immediately
+        if (submitted) vkQueueWaitIdle(graphicsQueue);  // fence-less submit: drain before freeing its inputs
+        if (fence != VK_NULL_HANDLE) vkDestroyFence(device, fence, nullptr);
+        for (auto& staging : batchStagingBuffers_) destroyBuffer(allocator, staging);
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
+    }
+    batchCmd_ = VK_NULL_HANDLE;
+    batchStagingBuffers_.clear();
+}
+
+void VkContext::endUploadBatchSync() {
+    // Blocking counterpart of endUploadBatch(): the outermost end finishes the batch before returning.
+    if (uploadBatchDepth_ <= 0) return;  // unbalanced end: ignore
+    if (--uploadBatchDepth_ > 0) return;  // an outer batch is still open
+    inUploadBatch_ = false;
+
+    const bool nothingDeferred = batchStagingBuffers_.empty();
+    if (nothingDeferred) {
+        // No staging buffers were deferred, so the command buffer is discarded unsubmitted.
+        vkEndCommandBuffer(batchCmd_);
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
+    } else {
+        // Synchronous path for load screens — submit and wait
+        endSingleTimeCommands(batchCmd_);
+    }
+    batchCmd_ = VK_NULL_HANDLE;
+
+    // Staging buffers are released only after the path above has completed.
+    for (auto& pendingStaging : batchStagingBuffers_) {
+        destroyBuffer(allocator, pendingStaging);
+    }
+    batchStagingBuffers_.clear();
+}
+
+void VkContext::pollUploadBatches() {
+    // Non-blocking sweep: release every in-flight batch whose fence has already signaled.
+    auto batchIt = inFlightBatches_.begin();
+    while (batchIt != inFlightBatches_.end()) {
+        if (vkGetFenceStatus(device, batchIt->fence) != VK_SUCCESS) {
+            ++batchIt;  // not signaled (or status query failed): keep it for a later poll
+            continue;
+        }
+
+        // GPU finished — free resources
+        for (auto& finishedStaging : batchIt->stagingBuffers) {
+            destroyBuffer(allocator, finishedStaging);
+        }
+        vkFreeCommandBuffers(device, immCommandPool, 1, &batchIt->cmd);
+        vkDestroyFence(device, batchIt->fence, nullptr);
+        batchIt = inFlightBatches_.erase(batchIt);
+    }
+}
+
+void VkContext::waitAllUploads() {
+    // Blocks on each in-flight batch in turn, then releases its staging buffers, command buffer and fence.
+    for (auto& pendingBatch : inFlightBatches_) {
+        vkWaitForFences(device, 1, &pendingBatch.fence, VK_TRUE, UINT64_MAX);
+        for (auto& doneStaging : pendingBatch.stagingBuffers) destroyBuffer(allocator, doneStaging);
+        vkFreeCommandBuffers(device, immCommandPool, 1, &pendingBatch.cmd);
+        vkDestroyFence(device, pendingBatch.fence, nullptr);
+    }
+
+    inFlightBatches_.clear();
+}
+
+void VkContext::deferStagingCleanup(AllocatedBuffer staging) {
+    batchStagingBuffers_.push_back(staging);  // destroyed by endUploadBatchSync(), or after the async batch's fence signals
+}
+
 } // namespace rendering
 } // namespace wowee
diff --git a/src/rendering/vk_texture.cpp b/src/rendering/vk_texture.cpp
index fba6d72b..415e3d56 100644
--- a/src/rendering/vk_texture.cpp
+++ b/src/rendering/vk_texture.cpp
@@ -96,7 +96,11 @@ bool VkTexture::upload(VkContext& ctx, const uint8_t* pixels, uint32_t width, ui
         generateMipmaps(ctx, format, width, height);
     }
 
-    destroyBuffer(ctx.getAllocator(), staging);
+    if (ctx.isInUploadBatch()) {
+        ctx.deferStagingCleanup(staging);
+    } else {
+        destroyBuffer(ctx.getAllocator(), staging);
+    }
     return true;
 }
 
@@ -162,7 +166,11 @@ bool VkTexture::uploadMips(VkContext& ctx, const uint8_t* const* mipData,
             VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
     });
 
-    destroyBuffer(ctx.getAllocator(), staging);
+    if (ctx.isInUploadBatch()) {
+        ctx.deferStagingCleanup(staging);
+    } else {
+        destroyBuffer(ctx.getAllocator(), staging);
+    }
     return true;
 }
 
diff --git a/src/rendering/vk_utils.cpp b/src/rendering/vk_utils.cpp
index d105c986..3a2f51d1 100644
--- a/src/rendering/vk_utils.cpp
+++ b/src/rendering/vk_utils.cpp
@@ -198,8 +198,12 @@ AllocatedBuffer uploadBuffer(VkContext& ctx, const void* data, VkDeviceSize size
         vkCmdCopyBuffer(cmd, staging.buffer, gpuBuffer.buffer, 1, &copyRegion);
     });
 
-    // Destroy staging buffer
-    destroyBuffer(ctx.getAllocator(), staging);
+    // Destroy staging buffer (deferred if in batch mode)
+    if (ctx.isInUploadBatch()) {
+        ctx.deferStagingCleanup(staging);
+    } else {
+        destroyBuffer(ctx.getAllocator(), staging);
+    }
 
     return gpuBuffer;
 }
diff --git a/src/rendering/wmo_renderer.cpp b/src/rendering/wmo_renderer.cpp
index ff6b0035..5dec0e3e 100644
--- a/src/rendering/wmo_renderer.cpp
+++ b/src/rendering/wmo_renderer.cpp
@@ -419,6 +419,10 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) {
     core::Logger::getInstance().debug("  WMO bounds: min=(", model.boundingBoxMin.x, ", ", model.boundingBoxMin.y, ", ", model.boundingBoxMin.z,
                                       ") max=(", model.boundingBoxMax.x, ", ", model.boundingBoxMax.y, ", ", model.boundingBoxMax.z, ")");
 
+    // Batch all GPU uploads (textures, VBs, IBs) into a single command buffer
+    // submission with one fence wait, instead of one per upload.
+    vkCtx_->beginUploadBatch();
+
     // Load textures for this model
     core::Logger::getInstance().debug("  WMO has ", model.textures.size(), " texture paths, ", model.materials.size(), " materials");
     if (assetManager && !model.textures.empty()) {
@@ -720,6 +724,8 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) {
         groupRes.allUntextured = !anyTextured && !groupRes.mergedBatches.empty();
     }
 
+    vkCtx_->endUploadBatch();
+
     // Copy portal data for visibility culling
     modelData.portalVertices = model.portalVertices;
     for (const auto& portal : model.portals) {
@@ -2319,13 +2325,27 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
     const auto& attemptedCandidates = uniqueCandidates;
 
     // Try loading all candidates until one succeeds
+    // Check pre-decoded BLP cache first (populated by background worker threads)
     pipeline::BLPImage blp;
     std::string resolvedKey;
-    for (const auto& c : attemptedCandidates) {
-        blp = assetManager->loadTexture(c);
-        if (blp.isValid()) {
-            resolvedKey = c;
-            break;
+    if (predecodedBLPCache_) {
+        for (const auto& c : uniqueCandidates) {
+            auto pit = predecodedBLPCache_->find(c);
+            if (pit != predecodedBLPCache_->end()) {
+                blp = std::move(pit->second);
+                predecodedBLPCache_->erase(pit);
+                resolvedKey = c;
+                break;
+            }
+        }
+    }
+    if (!blp.isValid()) {
+        for (const auto& c : attemptedCandidates) {
+            blp = assetManager->loadTexture(c);
+            if (blp.isValid()) {
+                resolvedKey = c;
+                break;
+            }
         }
     }
     if (!blp.isValid()) {
@@ -2363,10 +2383,10 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
     texture->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
                             VK_SAMPLER_ADDRESS_MODE_REPEAT);
 
-    // Generate normal+height map from diffuse pixels
+    // Generate normal+height map from diffuse pixels (skip during streaming to avoid CPU stalls)
     float nhVariance = 0.0f;
     std::unique_ptr<VkTexture> nhMap;
-    if (normalMappingEnabled_ || pomEnabled_) {
+    if ((normalMappingEnabled_ || pomEnabled_) && !deferNormalMaps_) {
         nhMap = generateNormalHeightMap(blp.data.data(), blp.width, blp.height, nhVariance);
         if (nhMap) {
             approxBytes *= 2;  // account for normal map in budget