Compare commits

...

9 commits

Author SHA1 Message Date
Kelsi
24f2ec75ec Defer normal map generation to reduce GPU model upload stalls by ~50%
Each loadTexture call was generating a normal/height map inline (3 full-image
passes: luminance + blur + Sobel). For models with 15-20 textures this added
30-40ms to the 70ms model upload. Now deferred to a per-frame budget (2/frame
in-game, 10/frame during load screen). Models render without POM until their
normal maps are ready.
2026-03-07 17:16:38 -08:00
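The per-frame budget scheme this commit describes can be sketched as a simple deferred-work queue; the names here (`PendingWork`, `drainPending`) are illustrative, not the engine's actual API:

```cpp
#include <deque>
#include <string>
#include <utility>

// Illustrative stand-in for a queued normal-map generation job.
struct PendingWork { std::string cacheKey; };

std::deque<PendingWork> pendingQueue;

// Process at most `budget` deferred jobs per frame
// (e.g. 2 in-game, 10 during the load screen).
int drainPending(int budget) {
    int processed = 0;
    while (!pendingQueue.empty() && processed < budget) {
        PendingWork work = std::move(pendingQueue.front());
        pendingQueue.pop_front();
        // ... generate the normal/height map for work.cacheKey here ...
        ++processed;
    }
    return processed;
}
```

Models whose jobs are still queued simply render without POM until their entry is drained, matching the behavior described above.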
Kelsi
faca22ac5f Async humanoid NPC texture pipeline to eliminate 30-150ms main-thread stalls
Move all DBC lookups (CharSections, ItemDisplayInfo), texture path resolution,
and BLP decoding for humanoid NPCs to background threads. Only GPU texture
uploads remain on the main thread via pre-decoded BLP cache.
2026-03-07 16:54:58 -08:00
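The background/main-thread split described here follows a standard future-based pattern. A minimal sketch, where `DecodedImage` and `decodeBLP` are hypothetical stand-ins for the engine's BLP types:

```cpp
#include <future>
#include <string>
#include <vector>

// Stand-in for a decoded texture; the real engine type is BLPImage.
struct DecodedImage {
    std::string path;
    std::vector<unsigned char> rgba;
    bool valid = false;
};

// Placeholder for the CPU-heavy BLP decode; runs off the main thread.
DecodedImage decodeBLP(const std::string& path) {
    return DecodedImage{path, std::vector<unsigned char>(4, 255), true};
}

// Kick off the decode on a background thread. The main thread polls the
// future each frame and performs only the GPU upload once it is ready.
std::future<DecodedImage> decodeAsync(const std::string& path) {
    return std::async(std::launch::async, decodeBLP, path);
}
```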
Kelsi
7ac990cff4 Background BLP texture pre-decoding + deferred WMO normal maps (12x streaming perf)
Move CPU-heavy BLP texture decoding from main thread to background worker
threads for all hot paths: terrain M2 models, WMO doodad M2s, WMO textures,
creature models, and gameobject WMOs. Each renderer (M2, WMO, Character) now
accepts a pre-decoded BLP cache that loadTexture() checks before falling back
to synchronous decode.

Defer WMO normal/height map generation (3 per-pixel passes: luminance, box
blur, Sobel) during terrain streaming finalization — this was the dominant
remaining bottleneck after BLP pre-decoding.

Terrain streaming stalls: 1576ms → 124ms worst case.
2026-03-07 15:46:56 -08:00
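The loadTexture() fallback chain the commit describes (consume a pre-decoded entry when a worker produced one, otherwise decode synchronously) can be sketched as follows; `Image` and `syncDecode` are illustrative names:

```cpp
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Illustrative decoded-image type (the real engine type is BLPImage).
struct Image {
    std::vector<unsigned char> data;
    bool isValid() const { return !data.empty(); }
};

using BLPCache = std::unordered_map<std::string, Image>;

// Placeholder for the synchronous decode path.
Image syncDecode(const std::string&) { return Image{{9}}; }

// Check the pre-decoded cache first; fall back to synchronous decode.
Image fetchDecoded(BLPCache* predecoded, const std::string& key) {
    if (predecoded) {
        auto it = predecoded->find(key);
        if (it != predecoded->end()) {
            Image img = std::move(it->second);
            predecoded->erase(it);   // entries are consumed one-shot
            if (img.isValid()) return img;
        }
    }
    return syncDecode(key);
}
```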
Kelsi
0313bd8692 Performance: ring buffer UBOs, batched load screen uploads, background world preloader
- Replace per-frame VMA alloc/free of material UBOs with a ring buffer in
  CharacterRenderer (~500 allocations/frame eliminated)
- Batch all ready terrain tiles into a single GPU upload during load screen
  (processAllReadyTiles instead of one-at-a-time with individual fence waits)
- Lift per-frame creature/GO spawn budgets during load screen warmup phase
- Add background world preloader: saves last world position to disk, pre-warms
  AssetManager file cache with ADT files starting at app init (login screen)
  so terrain workers get instant cache hits when Enter World is clicked
- Distance-filter expensive collision guard to 8-unit melee range
- Merge 3 CharacterRenderer update loops into single pass
- Time-budget instrumentation for slow update stages (>3ms threshold)
- Count-based async creature model upload budget (max 3/frame in-game)
- 1-per-frame game object spawn + per-doodad time budget for transport loading
- Use deque for creature spawn queue to avoid O(n) front-erase
2026-03-07 13:44:09 -08:00
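The material UBO ring buffer in the first bullet amounts to a bump allocator over a persistently mapped buffer. A sketch of the sub-allocation math (the constants mirror the diff; the struct itself is illustrative):

```cpp
#include <cstdint>

// Bump-allocator sketch for the per-frame material UBO ring buffer.
struct MaterialRing {
    uint32_t alignment = 256;   // minUniformBufferOffsetAlignment from device limits
    uint32_t capacity  = 4096;  // MATERIAL_RING_CAPACITY sub-allocations per frame slot
    uint32_t uboSize   = 64;    // assumed sizeof(material UBO) before alignment
    uint32_t offset    = 0;     // current write position in bytes

    // Byte offset for the next draw's UBO, or UINT32_MAX when the ring is full.
    uint32_t allocate() {
        uint32_t aligned = (uboSize + alignment - 1) & ~(alignment - 1);
        if (offset / aligned >= capacity) return UINT32_MAX;
        uint32_t slot = offset;
        offset += aligned;
        return slot;
    }

    void resetFrame() { offset = 0; }  // once per frame slot, before recording
};
```

Each draw writes its UBO at the returned offset (e.g. bound via a dynamic uniform offset), so no per-draw allocation or free ever reaches VMA.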
Kelsi
71e8ed5b7d Reduce initial load to radius 1 (~5 tiles) for fast game entry
Was waiting for all ~50 tiles (radius 4) to fully prepare + finalize
before entering the game. Now loads only the immediate surrounding tiles
during the loading screen, then restores the full radius for in-game
streaming. setLoadRadius just sets an int — actual loading happens lazily
via background workers during the game loop.
2026-03-07 12:39:38 -08:00
Kelsi
25bb63c50a Faster terrain/model loading: more workers, batched finalization, skip redundant I/O
- Worker threads: use (cores - 1 or 2) instead of cores/2, minimum 4
- Outer upload batch in processReadyTiles: ALL model/texture uploads per
  frame share a single command buffer submission + fence wait
- Upload multiple models per finalization step: 8 M2s, 4 WMOs, 16 doodads
  per call instead of 1 each (all within same GPU batch)
- Terrain chunks: 64 per step instead of 16
- Skip redundant M2 file I/O: thread-safe uploadedM2Ids_ set lets
  background workers skip re-reading+parsing models already on GPU
- processAllReadyTiles (loading screen) and processOneReadyTile also
  wrapped in outer upload batches
2026-03-07 12:32:39 -08:00
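The worker-count heuristic in the first bullet (leave a core or two for the main and render threads, never fewer than 4 workers) might look like this; the exact reservation rule below is an assumption:

```cpp
#include <algorithm>

// Heuristic sketch: cores minus 1-2 workers, minimum 4.
unsigned workerCount(unsigned hardwareThreads) {
    unsigned reserved = hardwareThreads >= 8 ? 2u : 1u;  // assumed split point
    unsigned workers = hardwareThreads > reserved ? hardwareThreads - reserved : 1u;
    return std::max(workers, 4u);
}
```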
Kelsi
16b4336700 Batch GPU uploads to eliminate per-upload fence waits (stutter fix)
Every uploadBuffer/VkTexture::upload called immediateSubmit which did a
separate vkQueueSubmit + vkWaitForFences. Loading a single creature model
with textures caused 4-8+ fence waits; terrain chunks caused 80+ per batch.

Added beginUploadBatch/endUploadBatch to VkContext: records all upload
commands into a single command buffer, submits once with one fence wait.
Staging buffers are deferred for cleanup after the batch completes.

Wrapped in batch mode:
- CharacterRenderer::loadModel (creature VB/IB + textures)
- M2Renderer::loadModel (doodad VB/IB + textures)
- TerrainRenderer::loadTerrain/loadTerrainIncremental (chunk geometry + textures)
- TerrainRenderer::uploadPreloadedTextures
- WMORenderer::loadModel (group geometry + textures)
2026-03-07 12:19:59 -08:00
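The begin/end contract described above can be sketched with the Vulkan calls stubbed out; the depth counter mirrors the nesting-safe behavior noted in the VkContext diff:

```cpp
#include <functional>
#include <utility>
#include <vector>

// Sketch of beginUploadBatch/endUploadBatch: all uploads recorded inside a
// batch share ONE submit + fence wait. Real code records into a VkCommandBuffer
// and calls vkQueueSubmit/vkWaitForFences; here those are just counted.
struct UploadBatcher {
    int depth = 0;     // nesting-safe: only the outermost end() submits
    int submits = 0;   // number of queue submissions (fence waits) performed
    std::vector<std::function<void()>> recorded;

    void begin() { ++depth; }

    void record(std::function<void()> uploadCmd) {
        recorded.push_back(std::move(uploadCmd));
    }

    void end() {
        if (--depth > 0) return;          // inner batch: defer to the outermost
        for (auto& cmd : recorded) cmd(); // replay into one command buffer
        recorded.clear();
        ++submits;                        // one submit, one fence wait
    }
};
```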
Kelsi
884b72bc1c Incremental terrain upload + M2 instance dedup hash for city stutter
Terrain finalization was uploading all 256 chunks (GPU fence waits) in one
atomic advanceFinalization call that couldn't be interrupted by the 5ms time
budget. Now split into incremental batches of 16 chunks per call, allowing
the time budget to yield between batches.

M2 instance creation had O(N) dedup scans iterating ALL instances to check
for duplicates. In cities with 5000+ doodads, this caused O(N²) total work
during tile loading. Replaced with hash-based DedupKey map for O(1) lookups.

Changes:
- TerrainRenderer::loadTerrainIncremental: uploads N chunks per call
- FinalizingTile tracks terrainChunkNext for cross-frame progress
- TERRAIN phase yields after preload and after each chunk batch
- M2Renderer::DedupKey hash map replaces linear scan in createInstance
  and createInstanceWithMatrix
- Dedup map maintained through rebuildSpatialIndex and clear paths
2026-03-07 11:59:19 -08:00
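The incremental terrain split can be sketched as a resumable loop: each call uploads at most a fixed batch and advances a cursor so the caller's time budget can yield between calls (the upload itself is stubbed):

```cpp
#include <algorithm>

// Resumable batch upload: returns true once all chunks are done.
// chunkNext persists across calls (cf. FinalizingTile::terrainChunkNext).
bool uploadChunksIncremental(int totalChunks, int& chunkNext, int maxChunksPerCall) {
    int end = std::min(chunkNext + maxChunksPerCall, totalChunks);
    for (; chunkNext < end; ++chunkNext) {
        // ... upload chunk `chunkNext` (geometry + textures) here ...
    }
    return chunkNext >= totalChunks;
}
```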
Kelsi
f9410cc4bd Fix city NPC stuttering: async model loading, CharSections cache, frame budgets
- Async creature model loading: M2 file I/O and parsing on background threads
  via std::async, GPU upload on main thread when ready (MAX_ASYNC_CREATURE_LOADS=4)
- CharSections.dbc lookup cache: O(1) hash lookup instead of O(N) full DBC scan
  per humanoid NPC spawn (was scanning thousands of records twice per spawn)
- Frame time budget: 4ms cap on creature spawn processing per frame
- Wolf/worg model name check cached per modelId (was doing tolower+find per
  hostile creature per frame)
- Weapon attach throttle: max 2 per 1s tick (was attempting all unweaponized NPCs)
- Separate texture application tracking (displayIdTexturesApplied_) so async-loaded
  models still get skin/equipment textures applied correctly
2026-03-07 11:44:14 -08:00
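The O(1) cache in the second bullet keys on a packed integer; a sketch mirroring the bit layout noted in the header diff:

```cpp
#include <cstdint>

// Pack the CharSections lookup fields into one 64-bit key.
// Layout per the header comment: (race<<24)|(sex<<16)|(section<<12)|(variation<<8)|color
uint64_t charSectionKey(uint8_t race, uint8_t sex, uint8_t section,
                        uint8_t variation, uint8_t color) {
    return (uint64_t(race) << 24) | (uint64_t(sex) << 16) |
           (uint64_t(section) << 12) | (uint64_t(variation) << 8) |
           uint64_t(color);
}
```

A single `unordered_map<uint64_t, std::string>` lookup on this key then replaces the full DBC scan per spawn.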
18 changed files with 2460 additions and 707 deletions


@ -3,13 +3,19 @@
#include "core/window.hpp"
#include "core/input.hpp"
#include "game/character.hpp"
#include "pipeline/blp_loader.hpp"
#include <memory>
#include <string>
#include <vector>
#include <deque>
#include <unordered_map>
#include <unordered_set>
#include <array>
#include <optional>
#include <future>
#include <mutex>
#include <thread>
#include <atomic>
namespace wowee {
@ -18,7 +24,7 @@ namespace rendering { class Renderer; }
namespace ui { class UIManager; }
namespace auth { class AuthHandler; }
namespace game { class GameHandler; class World; class ExpansionRegistry; }
namespace pipeline { class AssetManager; class DBCLayout; }
namespace pipeline { class AssetManager; class DBCLayout; struct M2Model; struct WMOModel; }
namespace audio { enum class VoiceType; }
namespace core {
@ -90,6 +96,7 @@ private:
static const char* mapIdToName(uint32_t mapId);
void loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float z);
void buildFactionHostilityMap(uint8_t playerRace);
pipeline::M2Model loadCreatureM2Sync(const std::string& m2Path);
void spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x, float y, float z, float orientation);
void despawnOnlineCreature(uint64_t guid);
bool tryAttachCreatureVirtualWeapons(uint64_t guid, uint32_t instanceId);
@ -181,8 +188,39 @@ private:
std::unordered_map<uint64_t, glm::vec3> creatureRenderPosCache_; // guid -> last synced render position
std::unordered_set<uint64_t> creatureWeaponsAttached_; // guid set when NPC virtual weapons attached
std::unordered_map<uint64_t, uint8_t> creatureWeaponAttachAttempts_; // guid -> attach attempts
std::unordered_map<uint32_t, bool> modelIdIsWolfLike_; // modelId → cached wolf/worg check
static constexpr int MAX_WEAPON_ATTACHES_PER_TICK = 2; // limit weapon attach work per 1s tick
// CharSections.dbc lookup cache to avoid O(N) DBC scan per NPC spawn.
// Key: (race<<24)|(sex<<16)|(section<<12)|(variation<<8)|color → texture path
std::unordered_map<uint64_t, std::string> charSectionsCache_;
bool charSectionsCacheBuilt_ = false;
void buildCharSectionsCache();
std::string lookupCharSection(uint8_t race, uint8_t sex, uint8_t section,
uint8_t variation, uint8_t color, int texIndex = 0) const;
// Async creature model loading: file I/O + M2 parsing on background thread,
// GPU upload + instance creation on main thread.
struct PreparedCreatureModel {
uint64_t guid;
uint32_t displayId;
uint32_t modelId;
float x, y, z, orientation;
std::shared_ptr<pipeline::M2Model> model; // parsed on background thread
std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures; // decoded on bg thread
bool valid = false;
bool permanent_failure = false;
};
struct AsyncCreatureLoad {
std::future<PreparedCreatureModel> future;
};
std::vector<AsyncCreatureLoad> asyncCreatureLoads_;
void processAsyncCreatureResults();
static constexpr int MAX_ASYNC_CREATURE_LOADS = 4; // concurrent background loads
std::unordered_set<uint64_t> deadCreatureGuids_; // GUIDs that should spawn in corpse/death pose
std::unordered_map<uint32_t, uint32_t> displayIdModelCache_; // displayId → modelId (model caching)
std::unordered_set<uint32_t> displayIdTexturesApplied_; // displayIds with per-model textures applied
std::unordered_map<uint32_t, std::unordered_map<std::string, pipeline::BLPImage>> displayIdPredecodedTextures_; // displayId → pre-decoded skin textures
mutable std::unordered_set<uint32_t> warnedMissingDisplayDataIds_; // displayIds already warned
mutable std::unordered_set<uint32_t> warnedMissingModelPathIds_; // modelIds/displayIds already warned
uint32_t nextCreatureModelId_ = 5000; // Model IDs for online creatures
@ -250,7 +288,7 @@ private:
uint32_t displayId;
float x, y, z, orientation;
};
std::vector<PendingCreatureSpawn> pendingCreatureSpawns_;
std::deque<PendingCreatureSpawn> pendingCreatureSpawns_;
static constexpr int MAX_SPAWNS_PER_FRAME = 3;
static constexpr int MAX_NEW_CREATURE_MODELS_PER_FRAME = 1;
static constexpr uint16_t MAX_CREATURE_SPAWN_RETRIES = 300;
@ -275,6 +313,49 @@ private:
// Deferred equipment compositing queue — processes max 1 per frame to avoid stutter
std::vector<std::pair<uint64_t, std::pair<std::array<uint32_t, 19>, std::array<uint8_t, 19>>>> deferredEquipmentQueue_;
void processDeferredEquipmentQueue();
// Async equipment texture pre-decode: BLP decode on background thread, composite on main thread
struct PreparedEquipmentUpdate {
uint64_t guid;
std::array<uint32_t, 19> displayInfoIds;
std::array<uint8_t, 19> inventoryTypes;
std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures;
};
struct AsyncEquipmentLoad {
std::future<PreparedEquipmentUpdate> future;
};
std::vector<AsyncEquipmentLoad> asyncEquipmentLoads_;
void processAsyncEquipmentResults();
std::vector<std::string> resolveEquipmentTexturePaths(uint64_t guid,
const std::array<uint32_t, 19>& displayInfoIds,
const std::array<uint8_t, 19>& inventoryTypes) const;
// Deferred NPC texture setup — async DBC lookups + BLP pre-decode to avoid main-thread stalls
struct DeferredNpcComposite {
uint32_t modelId;
uint32_t displayId;
// Skin compositing (type-1 slots)
std::string basePath; // CharSections skin base texture
std::vector<std::string> overlayPaths; // face + underwear overlays
std::vector<std::pair<int, std::string>> regionLayers; // equipment region overlays
std::vector<uint32_t> skinTextureSlots; // model texture slots needing skin composite
bool hasComposite = false; // needs compositing (overlays or equipment regions)
bool hasSimpleSkin = false; // just base skin, no compositing needed
// Baked skin (type-1 slots)
std::string bakedSkinPath; // baked texture path (if available)
bool hasBakedSkin = false; // baked skin resolved successfully
// Hair (type-6 slots)
std::vector<uint32_t> hairTextureSlots; // model texture slots needing hair texture
std::string hairTexturePath; // resolved hair texture path
bool useBakedForHair = false; // bald NPC: use baked skin for type-6
};
struct PreparedNpcComposite {
DeferredNpcComposite info;
std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures;
};
struct AsyncNpcCompositeLoad {
std::future<PreparedNpcComposite> future;
};
std::vector<AsyncNpcCompositeLoad> asyncNpcCompositeLoads_;
void processAsyncNpcCompositeResults();
// Cache base player model geometry by (raceId, genderId)
std::unordered_map<uint32_t, uint32_t> playerModelCache_; // key=(race<<8)|gender → modelId
struct PlayerTextureSlots { int skin = -1; int hair = -1; int underwear = -1; };
@ -302,6 +383,24 @@ private:
};
std::vector<PendingGameObjectSpawn> pendingGameObjectSpawns_;
void processGameObjectSpawnQueue();
// Async WMO loading for game objects (file I/O + parse on background thread)
struct PreparedGameObjectWMO {
uint64_t guid;
uint32_t entry;
uint32_t displayId;
float x, y, z, orientation;
std::shared_ptr<pipeline::WMOModel> wmoModel;
std::unordered_map<std::string, pipeline::BLPImage> predecodedTextures; // decoded on bg thread
bool valid = false;
bool isWmo = false;
std::string modelPath;
};
struct AsyncGameObjectLoad {
std::future<PreparedGameObjectWMO> future;
};
std::vector<AsyncGameObjectLoad> asyncGameObjectLoads_;
void processAsyncGameObjectResults();
struct PendingTransportDoodadBatch {
uint64_t guid = 0;
uint32_t modelId = 0;
@ -321,6 +420,23 @@ private:
// Quest marker billboard sprites (above NPCs)
void loadQuestMarkerModels(); // Now loads BLP textures
void updateQuestMarkers(); // Updates billboard positions
// Background world preloader — warms AssetManager file cache for the
// expected world before the user clicks Enter World.
struct WorldPreload {
uint32_t mapId = 0;
std::string mapName;
int centerTileX = 0;
int centerTileY = 0;
std::atomic<bool> cancel{false};
std::vector<std::thread> workers;
};
std::unique_ptr<WorldPreload> worldPreload_;
void startWorldPreload(uint32_t mapId, const std::string& mapName, float serverX, float serverY);
void cancelWorldPreload();
void saveLastWorldInfo(uint32_t mapId, const std::string& mapName, float serverX, float serverY);
struct LastWorldInfo { uint32_t mapId = 0; std::string mapName; float x = 0, y = 0; bool valid = false; };
LastWorldInfo loadLastWorldInfo() const;
};
} // namespace core


@ -1,6 +1,7 @@
#pragma once
#include "pipeline/m2_loader.hpp"
#include "pipeline/blp_loader.hpp"
#include <vulkan/vulkan.h>
#include <vk_mem_alloc.h>
#include <glm/glm.hpp>
@ -11,6 +12,7 @@
#include <string>
#include <utility>
#include <future>
#include <deque>
namespace wowee {
namespace pipeline { class AssetManager; }
@ -114,7 +116,11 @@ public:
void setShadowMap(VkTexture*, const glm::mat4&) {}
void clearShadowMap() {}
// Pre-decoded BLP cache: set before calling loadModel() to skip main-thread BLP decode
void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }
private:
std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
// GPU representation of M2 model
struct M2ModelGPU {
VkBuffer vertexBuffer = VK_NULL_HANDLE;
@ -180,6 +186,7 @@ private:
// Bone update throttling (skip frames for distant characters)
uint32_t boneUpdateCounter = 0;
const M2ModelGPU* cachedModel = nullptr; // Avoid per-frame hash lookups
// Per-instance bone SSBO (double-buffered per frame)
VkBuffer boneBuffer[2] = {};
@ -254,7 +261,14 @@ private:
VkDescriptorPool materialDescPools_[2] = {VK_NULL_HANDLE, VK_NULL_HANDLE};
VkDescriptorPool boneDescPool_ = VK_NULL_HANDLE;
uint32_t lastMaterialPoolResetFrame_ = 0xFFFFFFFFu;
std::vector<std::pair<VkBuffer, VmaAllocation>> transientMaterialUbos_[2];
// Material UBO ring buffer — pre-allocated per frame slot, sub-allocated each draw
VkBuffer materialRingBuffer_[2] = {VK_NULL_HANDLE, VK_NULL_HANDLE};
VmaAllocation materialRingAlloc_[2] = {VK_NULL_HANDLE, VK_NULL_HANDLE};
void* materialRingMapped_[2] = {nullptr, nullptr};
uint32_t materialRingOffset_[2] = {0, 0};
uint32_t materialUboAlignment_ = 256; // minUniformBufferOffsetAlignment
static constexpr uint32_t MATERIAL_RING_CAPACITY = 4096;
// Texture cache
struct TextureCacheEntry {
@ -265,6 +279,7 @@ private:
uint64_t lastUse = 0;
bool hasAlpha = false;
bool colorKeyBlack = false;
bool normalMapPending = false; // deferred normal map generation
};
std::unordered_map<std::string, TextureCacheEntry> textureCache;
std::unordered_map<VkTexture*, bool> textureHasAlphaByPtr_;
@ -289,6 +304,17 @@ private:
std::unique_ptr<VkTexture> generateNormalHeightMap(
const uint8_t* pixels, uint32_t width, uint32_t height, float& outVariance);
// Deferred normal map generation — avoids stalling loadModel
struct PendingNormalMap {
std::string cacheKey;
std::vector<uint8_t> pixels; // RGBA pixel data
uint32_t width, height;
};
std::deque<PendingNormalMap> pendingNormalMaps_;
public:
void processPendingNormalMaps(int budget = 2);
private:
// Normal mapping / POM settings
bool normalMappingEnabled_ = true;
float normalMapStrength_ = 0.8f;


@ -1,6 +1,7 @@
#pragma once
#include "pipeline/m2_loader.hpp"
#include "pipeline/blp_loader.hpp"
#include <vulkan/vulkan.h>
#include <vk_mem_alloc.h>
#include <glm/glm.hpp>
@ -188,6 +189,7 @@ struct M2Instance {
bool skipCollision = false; // WMO interior doodads — skip player wall collision
float cachedBoundRadius = 0.0f;
float portalSpinAngle = 0.0f; // Accumulated spin angle for portal rotation
const M2ModelGPU* cachedModel = nullptr; // Avoid per-frame hash lookups
// Frame-skip optimization (update distant animations less frequently)
uint8_t frameSkipCounter = 0;
@ -328,6 +330,10 @@ public:
std::vector<glm::vec3> getWaterVegetationPositions(const glm::vec3& camPos, float maxDist) const;
// Pre-decoded BLP cache: set by terrain manager before calling loadModel()
// so loadTexture() can skip the expensive assetManager->loadTexture() call.
void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }
private:
bool initialized_ = false;
bool insideInterior = false;
@ -389,12 +395,33 @@ private:
std::unordered_map<uint32_t, M2ModelGPU> models;
std::vector<M2Instance> instances;
// O(1) dedup: key = (modelId, quantized x, quantized y, quantized z) → instanceId
struct DedupKey {
uint32_t modelId;
int32_t qx, qy, qz; // position quantized to 0.1 units
bool operator==(const DedupKey& o) const {
return modelId == o.modelId && qx == o.qx && qy == o.qy && qz == o.qz;
}
};
struct DedupHash {
size_t operator()(const DedupKey& k) const {
size_t h = std::hash<uint32_t>()(k.modelId);
h ^= std::hash<int32_t>()(k.qx) * 2654435761u;
h ^= std::hash<int32_t>()(k.qy) * 40503u;
h ^= std::hash<int32_t>()(k.qz) * 12289u;
return h;
}
};
std::unordered_map<DedupKey, uint32_t, DedupHash> instanceDedupMap_;
uint32_t nextInstanceId = 1;
uint32_t lastDrawCallCount = 0;
size_t modelCacheLimit_ = 6000;
uint32_t modelLimitRejectWarnings_ = 0;
VkTexture* loadTexture(const std::string& path, uint32_t texFlags = 0);
std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
struct TextureCacheEntry {
std::unique_ptr<VkTexture> texture;
size_t approxBytes = 0;


@ -121,6 +121,12 @@ struct PendingTile {
// Pre-loaded terrain texture BLP data (loaded on background thread to avoid
// blocking file I/O on the main thread during finalizeTile)
std::unordered_map<std::string, pipeline::BLPImage> preloadedTextures;
// Pre-decoded M2 model textures (decoded on background thread)
std::unordered_map<std::string, pipeline::BLPImage> preloadedM2Textures;
// Pre-decoded WMO textures (decoded on background thread)
std::unordered_map<std::string, pipeline::BLPImage> preloadedWMOTextures;
};
/**
@ -150,6 +156,11 @@ struct FinalizingTile {
size_t wmoModelIndex = 0; // Next WMO model to upload
size_t wmoDoodadIndex = 0; // Next WMO doodad to upload
// Incremental terrain upload state (splits TERRAIN phase across frames)
bool terrainPreloaded = false; // True after preloaded textures uploaded
int terrainChunkNext = 0; // Next chunk index to upload (0-255, row-major)
bool terrainMeshDone = false; // True when all chunks uploaded
// Accumulated results (built up across phases)
std::vector<uint32_t> m2InstanceIds;
std::vector<uint32_t> wmoInstanceIds;
@ -376,6 +387,11 @@ private:
std::unordered_set<std::string> missingAdtWarnings_;
std::mutex missingAdtWarningsMutex_;
// Thread-safe set of M2 model IDs already uploaded to GPU
// (checked by workers to skip redundant file I/O + parsing)
std::unordered_set<uint32_t> uploadedM2Ids_;
std::mutex uploadedM2IdsMutex_;
// Dedup set for doodad placements across tile boundaries
std::unordered_set<uint32_t> placedDoodadIds;


@ -86,6 +86,13 @@ public:
const std::vector<std::string>& texturePaths,
int tileX = -1, int tileY = -1);
/// Upload a batch of terrain chunks incrementally. Returns true when all chunks done.
/// chunkIndex is updated to the next chunk to process (0-255 row-major).
bool loadTerrainIncremental(const pipeline::TerrainMesh& mesh,
const std::vector<std::string>& texturePaths,
int tileX, int tileY,
int& chunkIndex, int maxChunksPerCall = 16);
void removeTile(int tileX, int tileY);
void uploadPreloadedTextures(const std::unordered_map<std::string, pipeline::BLPImage>& textures);
@ -120,6 +127,7 @@ public:
int getRenderedChunkCount() const { return renderedChunks; }
int getCulledChunkCount() const { return culledChunks; }
int getTriangleCount() const;
VkContext* getVkContext() const { return vkCtx; }
private:
TerrainChunkGPU uploadChunk(const pipeline::ChunkMesh& chunk);


@ -1,5 +1,6 @@
#pragma once
#include "rendering/vk_utils.hpp"
#include <vulkan/vulkan.h>
#include <vk_mem_alloc.h>
#include <VkBootstrap.h>
@ -46,6 +47,16 @@ public:
// Immediate submit for one-off GPU work (descriptor pool creation, etc.)
void immediateSubmit(std::function<void(VkCommandBuffer cmd)>&& function);
// Batch upload mode: records multiple upload commands into a single
// command buffer, then submits with ONE fence wait instead of one per upload.
void beginUploadBatch();
void endUploadBatch(); // Async: submits but does NOT wait for fence
void endUploadBatchSync(); // Sync: submits and waits (for load screens)
bool isInUploadBatch() const { return inUploadBatch_; }
void deferStagingCleanup(AllocatedBuffer staging);
void pollUploadBatches(); // Check completed async uploads, free staging buffers
void waitAllUploads(); // Block until all in-flight uploads complete
// Accessors
VkInstance getInstance() const { return instance; }
VkPhysicalDevice getPhysicalDevice() const { return physicalDevice; }
@ -143,6 +154,20 @@ private:
VkCommandPool immCommandPool = VK_NULL_HANDLE;
VkFence immFence = VK_NULL_HANDLE;
// Batch upload state (nesting-safe via depth counter)
int uploadBatchDepth_ = 0;
bool inUploadBatch_ = false;
VkCommandBuffer batchCmd_ = VK_NULL_HANDLE;
std::vector<AllocatedBuffer> batchStagingBuffers_;
// Async upload: in-flight batches awaiting GPU completion
struct InFlightBatch {
VkFence fence = VK_NULL_HANDLE;
VkCommandBuffer cmd = VK_NULL_HANDLE;
std::vector<AllocatedBuffer> stagingBuffers;
};
std::vector<InFlightBatch> inFlightBatches_;
// Depth buffer (shared across all framebuffers)
VkImage depthImage = VK_NULL_HANDLE;
VkImageView depthImageView = VK_NULL_HANDLE;


@ -1,5 +1,6 @@
#pragma once
#include "pipeline/blp_loader.hpp"
#include <vulkan/vulkan.h>
#include <vk_mem_alloc.h>
#include <glm/glm.hpp>
@ -325,6 +326,12 @@ public:
// Pre-compute floor cache for all loaded WMO instances
void precomputeFloorCache();
// Pre-decoded BLP cache: set before calling loadModel() to skip main-thread BLP decode
void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache) { predecodedBLPCache_ = cache; }
// Defer normal/height map generation during streaming to avoid CPU stalls
void setDeferNormalMaps(bool defer) { deferNormalMaps_ = defer; }
private:
// WMO material UBO — matches WMOMaterial in wmo.frag.glsl
struct WMOMaterialUBO {
@ -558,6 +565,7 @@ private:
* Load a texture from path
*/
VkTexture* loadTexture(const std::string& path);
std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
/**
* Generate normal+height map from diffuse RGBA8 pixels
@ -670,6 +678,7 @@ private:
// Normal mapping / POM settings
bool normalMappingEnabled_ = true; // on by default
bool deferNormalMaps_ = false; // skip normal map gen during streaming
float normalMapStrength_ = 0.8f; // 0.0 = flat, 1.0 = full, 2.0 = exaggerated
bool pomEnabled_ = true; // on by default
int pomQuality_ = 1; // 0=Low(16), 1=Medium(32), 2=High(64)

File diff suppressed because it is too large.


@ -541,7 +541,13 @@ void GameHandler::update(float deltaTime) {
// Update socket (processes incoming data and triggers callbacks)
if (socket) {
auto socketStart = std::chrono::steady_clock::now();
socket->update();
float socketMs = std::chrono::duration<float, std::milli>(
std::chrono::steady_clock::now() - socketStart).count();
if (socketMs > 3.0f) {
LOG_WARNING("SLOW socket->update: ", socketMs, "ms");
}
}
// Detect server-side disconnect (socket closed during update)


@ -197,6 +197,29 @@ bool CharacterRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFram
vkCreateDescriptorPool(device, &ci, nullptr, &boneDescPool_);
}
// --- Material UBO ring buffers (one per frame slot) ---
{
VkPhysicalDeviceProperties props;
vkGetPhysicalDeviceProperties(ctx->getPhysicalDevice(), &props);
materialUboAlignment_ = static_cast<uint32_t>(props.limits.minUniformBufferOffsetAlignment);
if (materialUboAlignment_ < 1) materialUboAlignment_ = 1;
// Round up UBO size to alignment
uint32_t alignedUboSize = (sizeof(CharMaterialUBO) + materialUboAlignment_ - 1) & ~(materialUboAlignment_ - 1);
uint32_t ringSize = alignedUboSize * MATERIAL_RING_CAPACITY;
for (int i = 0; i < 2; i++) {
VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
bci.size = ringSize;
bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
VmaAllocationCreateInfo aci{};
aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
aci.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
VmaAllocationInfo allocInfo{};
vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
&materialRingBuffer_[i], &materialRingAlloc_[i], &allocInfo);
materialRingMapped_[i] = allocInfo.pMappedData;
}
}
// --- Pipeline layout ---
// set 0 = perFrame, set 1 = material, set 2 = bones
// Push constant: mat4 model = 64 bytes
@ -352,14 +375,15 @@ void CharacterRenderer::shutdown() {
if (pipelineLayout_) { vkDestroyPipelineLayout(device, pipelineLayout_, nullptr); pipelineLayout_ = VK_NULL_HANDLE; }
// Release any deferred transient material UBOs.
// Destroy material ring buffers
for (int i = 0; i < 2; i++) {
for (const auto& b : transientMaterialUbos_[i]) {
if (b.first) {
vmaDestroyBuffer(alloc, b.first, b.second);
if (materialRingBuffer_[i]) {
vmaDestroyBuffer(alloc, materialRingBuffer_[i], materialRingAlloc_[i]);
materialRingBuffer_[i] = VK_NULL_HANDLE;
materialRingAlloc_[i] = VK_NULL_HANDLE;
materialRingMapped_[i] = nullptr;
}
}
transientMaterialUbos_[i].clear();
materialRingOffset_[i] = 0;
}
// Destroy descriptor pools and layouts
@ -391,7 +415,6 @@ void CharacterRenderer::clear() {
vkDeviceWaitIdle(vkCtx_->getDevice());
VkDevice device = vkCtx_->getDevice();
VmaAllocator alloc = vkCtx_->getAllocator();
// Destroy GPU resources for all models
for (auto& pair : models) {
@ -441,14 +464,9 @@ void CharacterRenderer::clear() {
models.clear();
instances.clear();
// Release deferred transient material UBOs
// Reset material ring buffer offsets (buffers persist, just reset write position)
for (int i = 0; i < 2; i++) {
for (const auto& b : transientMaterialUbos_[i]) {
if (b.first) {
vmaDestroyBuffer(alloc, b.first, b.second);
}
}
transientMaterialUbos_[i].clear();
materialRingOffset_[i] = 0;
}
// Reset descriptor pools (don't destroy — reuse for new allocations)
@ -607,7 +625,18 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
return whiteTexture_.get();
}
auto blpImage = assetManager->loadTexture(key);
// Check pre-decoded BLP cache first (populated by background threads)
pipeline::BLPImage blpImage;
if (predecodedBLPCache_) {
auto pit = predecodedBLPCache_->find(key);
if (pit != predecodedBLPCache_->end()) {
blpImage = std::move(pit->second);
predecodedBLPCache_->erase(pit);
}
}
if (!blpImage.isValid()) {
blpImage = assetManager->loadTexture(key);
}
if (!blpImage.isValid()) {
// Return white fallback but don't cache the failure — allow retry
// on next character load in case the asset becomes available.
@ -658,13 +687,16 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
e.hasAlpha = hasAlpha;
e.colorKeyBlack = colorKeyBlackHint;
// Generate normal/height map from diffuse texture
float nhVariance = 0.0f;
auto nhMap = generateNormalHeightMap(blpImage.data.data(), blpImage.width, blpImage.height, nhVariance);
if (nhMap) {
e.heightMapVariance = nhVariance;
e.approxBytes += approxTextureBytesWithMips(blpImage.width, blpImage.height);
e.normalHeightMap = std::move(nhMap);
// Defer normal/height map generation to avoid stalling loadModel.
// Normal maps are generated in processPendingNormalMaps() at a per-frame budget.
if (blpImage.width >= 32 && blpImage.height >= 32) {
PendingNormalMap pending;
pending.cacheKey = key;
pending.pixels.assign(blpImage.data.begin(), blpImage.data.end());
pending.width = blpImage.width;
pending.height = blpImage.height;
pendingNormalMaps_.push_back(std::move(pending));
e.normalMapPending = true;
}
textureCacheBytes_ += e.approxBytes;
@ -676,6 +708,34 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
return texPtr;
}
void CharacterRenderer::processPendingNormalMaps(int budget) {
if (pendingNormalMaps_.empty() || !vkCtx_) return;
int processed = 0;
while (!pendingNormalMaps_.empty() && processed < budget) {
auto pending = std::move(pendingNormalMaps_.front());
pendingNormalMaps_.pop_front();
auto it = textureCache.find(pending.cacheKey);
if (it == textureCache.end()) continue; // texture was evicted
float nhVariance = 0.0f;
vkCtx_->beginUploadBatch();
auto nhMap = generateNormalHeightMap(pending.pixels.data(),
pending.width, pending.height, nhVariance);
vkCtx_->endUploadBatch();
if (nhMap) {
it->second.heightMapVariance = nhVariance;
it->second.approxBytes += approxTextureBytesWithMips(pending.width, pending.height);
textureCacheBytes_ += approxTextureBytesWithMips(pending.width, pending.height);
it->second.normalHeightMap = std::move(nhMap);
}
it->second.normalMapPending = false;
processed++;
}
}
// Alpha-blend overlay onto composite at (dstX, dstY)
static void blitOverlay(std::vector<uint8_t>& composite, int compW, int compH,
const pipeline::BLPImage& overlay, int dstX, int dstY) {
@ -807,7 +867,19 @@ VkTexture* CharacterRenderer::compositeTextures(const std::vector<std::string>&
}
// Load base layer
auto base = assetManager->loadTexture(layerPaths[0]);
pipeline::BLPImage base;
if (predecodedBLPCache_) {
std::string key = layerPaths[0];
std::replace(key.begin(), key.end(), '/', '\\');
std::transform(key.begin(), key.end(), key.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
auto pit = predecodedBLPCache_->find(key);
if (pit != predecodedBLPCache_->end()) {
base = std::move(pit->second);
predecodedBLPCache_->erase(pit);
}
}
if (!base.isValid()) base = assetManager->loadTexture(layerPaths[0]);
if (!base.isValid()) {
core::Logger::getInstance().warning("Composite: failed to load base layer: ", layerPaths[0]);
return whiteTexture_.get();
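Each pre-decoded-cache lookup in this file canonicalizes the path the same way (forward slashes to backslashes, then lowercase, matching MPQ path conventions). A small helper would factor out the repetition; `normalizeTexKey` is a hypothetical name, not present in the diff:

```cpp
#include <algorithm>
#include <cctype>
#include <string>

// Canonicalize an asset path into the cache-key form used by the
// predecoded BLP cache: backslash separators, all lowercase.
std::string normalizeTexKey(std::string key) {
    std::replace(key.begin(), key.end(), '/', '\\');
    std::transform(key.begin(), key.end(), key.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    return key;
}
```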
@ -848,7 +920,19 @@ VkTexture* CharacterRenderer::compositeTextures(const std::vector<std::string>&
for (size_t layer = 1; layer < layerPaths.size(); layer++) {
if (layerPaths[layer].empty()) continue;
auto overlay = assetManager->loadTexture(layerPaths[layer]);
pipeline::BLPImage overlay;
if (predecodedBLPCache_) {
std::string key = layerPaths[layer];
std::replace(key.begin(), key.end(), '/', '\\');
std::transform(key.begin(), key.end(), key.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
auto pit = predecodedBLPCache_->find(key);
if (pit != predecodedBLPCache_->end()) {
overlay = std::move(pit->second);
predecodedBLPCache_->erase(pit);
}
}
if (!overlay.isValid()) overlay = assetManager->loadTexture(layerPaths[layer]);
if (!overlay.isValid()) {
core::Logger::getInstance().warning("Composite: FAILED to load overlay: ", layerPaths[layer]);
continue;
@ -1025,7 +1109,19 @@ VkTexture* CharacterRenderer::compositeWithRegions(const std::string& basePath,
return whiteTexture_.get();
}
auto base = assetManager->loadTexture(basePath);
pipeline::BLPImage base;
if (predecodedBLPCache_) {
std::string key = basePath;
std::replace(key.begin(), key.end(), '/', '\\');
std::transform(key.begin(), key.end(), key.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
auto pit = predecodedBLPCache_->find(key);
if (pit != predecodedBLPCache_->end()) {
base = std::move(pit->second);
predecodedBLPCache_->erase(pit);
}
}
if (!base.isValid()) base = assetManager->loadTexture(basePath);
if (!base.isValid()) {
return whiteTexture_.get();
}
@ -1064,7 +1160,19 @@ VkTexture* CharacterRenderer::compositeWithRegions(const std::string& basePath,
bool upscaled = (base.width == 256 && base.height == 256 && width == 512);
for (const auto& ul : baseLayers) {
if (ul.empty()) continue;
auto overlay = assetManager->loadTexture(ul);
pipeline::BLPImage overlay;
if (predecodedBLPCache_) {
std::string key = ul;
std::replace(key.begin(), key.end(), '/', '\\');
std::transform(key.begin(), key.end(), key.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
auto pit = predecodedBLPCache_->find(key);
if (pit != predecodedBLPCache_->end()) {
overlay = std::move(pit->second);
predecodedBLPCache_->erase(pit);
}
}
if (!overlay.isValid()) overlay = assetManager->loadTexture(ul);
if (!overlay.isValid()) continue;
if (overlay.width == width && overlay.height == height) {
@ -1142,7 +1250,19 @@ VkTexture* CharacterRenderer::compositeWithRegions(const std::string& basePath,
int regionIdx = rl.first;
if (regionIdx < 0 || regionIdx >= 8) continue;
auto overlay = assetManager->loadTexture(rl.second);
pipeline::BLPImage overlay;
if (predecodedBLPCache_) {
std::string key = rl.second;
std::replace(key.begin(), key.end(), '/', '\\');
std::transform(key.begin(), key.end(), key.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
auto pit = predecodedBLPCache_->find(key);
if (pit != predecodedBLPCache_->end()) {
overlay = std::move(pit->second);
predecodedBLPCache_->erase(pit);
}
}
if (!overlay.isValid()) overlay = assetManager->loadTexture(rl.second);
if (!overlay.isValid()) {
core::Logger::getInstance().warning("compositeWithRegions: failed to load ", rl.second);
continue;
@ -1247,6 +1367,10 @@ bool CharacterRenderer::loadModel(const pipeline::M2Model& model, uint32_t id) {
M2ModelGPU gpuModel;
gpuModel.data = model;
// Batch all GPU uploads (VB, IB, textures) into a single command buffer
// submission with one fence wait, instead of one fence wait per upload.
vkCtx_->beginUploadBatch();
// Setup GPU buffers
setupModelBuffers(gpuModel);
@ -1259,6 +1383,8 @@ bool CharacterRenderer::loadModel(const pipeline::M2Model& model, uint32_t id) {
gpuModel.textureIds.push_back(texPtr);
}
vkCtx_->endUploadBatch();
models[id] = std::move(gpuModel);
core::Logger::getInstance().debug("Loaded M2 model ", id, " (", model.vertices.size(),
@ -1388,8 +1514,9 @@ uint32_t CharacterRenderer::createInstance(uint32_t modelId, const glm::vec3& po
instance.scale = scale;
// Initialize bone matrices to identity
auto& model = models[modelId].data;
instance.boneMatrices.resize(std::max(static_cast<size_t>(1), model.bones.size()), glm::mat4(1.0f));
auto& gpuRef = models[modelId];
instance.boneMatrices.resize(std::max(static_cast<size_t>(1), gpuRef.data.bones.size()), glm::mat4(1.0f));
instance.cachedModel = &gpuRef;
uint32_t id = instance.id;
instances[id] = std::move(instance);
@ -1448,8 +1575,14 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
const float animUpdateRadius = static_cast<float>(envSizeOrDefault("WOWEE_CHAR_ANIM_RADIUS", 120));
const float animUpdateRadiusSq = animUpdateRadius * animUpdateRadius;
// Single pass: fade-in, movement, and animation bone collection
std::vector<std::reference_wrapper<CharacterInstance>> toUpdate;
toUpdate.reserve(instances.size());
for (auto& pair : instances) {
auto& inst = pair.second;
// Update fade-in opacity
for (auto& [id, inst] : instances) {
if (inst.fadeInDuration > 0.0f && inst.opacity < 1.0f) {
inst.fadeInTime += deltaTime;
inst.opacity = std::min(1.0f, inst.fadeInTime / inst.fadeInDuration);
@ -1457,10 +1590,8 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
inst.fadeInDuration = 0.0f;
}
}
}
// Interpolate creature movement
for (auto& [id, inst] : instances) {
if (inst.isMoving) {
inst.moveElapsed += deltaTime;
float t = inst.moveElapsed / inst.moveDuration;
@ -1469,36 +1600,26 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
inst.isMoving = false;
// Return to idle when movement completes
if (inst.currentAnimationId == 4 || inst.currentAnimationId == 5) {
playAnimation(id, 0, true);
playAnimation(pair.first, 0, true);
}
} else {
inst.position = glm::mix(inst.moveStart, inst.moveEnd, t);
}
}
}
// Only update animations for nearby characters (performance optimization)
// Collect instances that need bone recomputation, with distance-based throttling
std::vector<std::reference_wrapper<CharacterInstance>> toUpdate;
toUpdate.reserve(instances.size());
for (auto& pair : instances) {
auto& inst = pair.second;
// Skip weapon instances — their transforms are set by parent bones
// Skip weapon instances for animation — their transforms are set by parent bones
if (inst.hasOverrideModelMatrix) continue;
float distSq = glm::distance2(inst.position, cameraPos);
if (distSq >= animUpdateRadiusSq) continue;
// Always advance animation time (cheap)
auto modelIt = models.find(inst.modelId);
if (modelIt != models.end() && !modelIt->second.data.sequences.empty()) {
if (inst.cachedModel && !inst.cachedModel->data.sequences.empty()) {
if (inst.currentSequenceIndex < 0) {
inst.currentSequenceIndex = 0;
inst.currentAnimationId = modelIt->second.data.sequences[0].id;
inst.currentAnimationId = inst.cachedModel->data.sequences[0].id;
}
const auto& seq = modelIt->second.data.sequences[inst.currentSequenceIndex];
const auto& seq = inst.cachedModel->data.sequences[inst.currentSequenceIndex];
inst.animationTime += deltaTime * 1000.0f;
if (seq.duration > 0 && inst.animationTime >= static_cast<float>(seq.duration)) {
if (inst.animationLoop) {
@ -1509,10 +1630,11 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
}
}
// Distance-tiered bone throttling: near=every frame, mid=every 3rd, far=every 6th
// Distance-tiered bone throttling: near=every frame, then every 2nd/4th/8th frame by distance
uint32_t boneInterval = 1;
if (distSq > 60.0f * 60.0f) boneInterval = 6;
else if (distSq > 30.0f * 30.0f) boneInterval = 3;
if (distSq > 40.0f * 40.0f) boneInterval = 8;
else if (distSq > 20.0f * 20.0f) boneInterval = 4;
else if (distSq > 10.0f * 10.0f) boneInterval = 2;
inst.boneUpdateCounter++;
bool needsBones = (inst.boneUpdateCounter >= boneInterval) || inst.boneMatrices.empty();
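The tier selection above compares squared distances, so no square root is needed per instance. Extracted as a standalone function (illustrative only; the real code inlines it):

```cpp
#include <cstdint>

// Distance-tiered bone update interval: nearer instances recompute
// bones more often. Thresholds are squared distances in world units.
uint32_t boneIntervalFor(float distSq) {
    if (distSq > 40.0f * 40.0f) return 8;  // far: every 8th frame
    if (distSq > 20.0f * 20.0f) return 4;  // mid: every 4th frame
    if (distSq > 10.0f * 10.0f) return 2;  // near-mid: every 2nd frame
    return 1;                              // near: every frame
}
```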
@ -1527,7 +1649,7 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
// Thread bone matrix computation in chunks
if (updatedCount >= 8 && numAnimThreads_ > 1) {
static const size_t minAnimWorkPerThread = std::max<size_t>(
16, envSizeOrDefault("WOWEE_CHAR_ANIM_WORK_PER_THREAD", 64));
8, envSizeOrDefault("WOWEE_CHAR_ANIM_WORK_PER_THREAD", 16));
const size_t maxUsefulThreads = std::max<size_t>(
1, (updatedCount + minAnimWorkPerThread - 1) / minAnimWorkPerThread);
const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), maxUsefulThreads);
@ -1596,11 +1718,8 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
}
void CharacterRenderer::updateAnimation(CharacterInstance& instance, float deltaTime) {
auto modelIt = models.find(instance.modelId);
if (modelIt == models.end()) {
return;
}
const auto& model = modelIt->second.data;
if (!instance.cachedModel) return;
const auto& model = instance.cachedModel->data;
if (model.sequences.empty()) {
return;
@ -1713,7 +1832,8 @@ glm::quat CharacterRenderer::interpolateQuat(const pipeline::M2AnimationTrack& t
// --- Bone transform calculation ---
void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
auto& model = models[instance.modelId].data;
if (!instance.cachedModel) return;
auto& model = instance.cachedModel->data;
if (model.bones.empty()) {
return;
@ -1722,8 +1842,6 @@ void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
size_t numBones = model.bones.size();
instance.boneMatrices.resize(numBones);
static bool dumpedOnce = false;
for (size_t i = 0; i < numBones; i++) {
const auto& bone = model.bones[i];
@ -1731,19 +1849,6 @@ void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
// At rest this is identity, so no separate bind pose is needed
glm::mat4 localTransform = getBoneTransform(bone, instance.animationTime, instance.currentSequenceIndex);
// Debug: dump first frame bone data
if (!dumpedOnce && i < 5) {
glm::vec3 t = interpolateVec3(bone.translation, instance.currentSequenceIndex, instance.animationTime, glm::vec3(0.0f));
glm::quat r = interpolateQuat(bone.rotation, instance.currentSequenceIndex, instance.animationTime);
glm::vec3 s = interpolateVec3(bone.scale, instance.currentSequenceIndex, instance.animationTime, glm::vec3(1.0f));
core::Logger::getInstance().info("Bone ", i, " parent=", bone.parentBone,
" pivot=(", bone.pivot.x, ",", bone.pivot.y, ",", bone.pivot.z, ")",
" t=(", t.x, ",", t.y, ",", t.z, ")",
" r=(", r.w, ",", r.x, ",", r.y, ",", r.z, ")",
" s=(", s.x, ",", s.y, ",", s.z, ")",
" seqIdx=", instance.currentSequenceIndex);
}
// Compose with parent
if (bone.parentBone >= 0 && static_cast<size_t>(bone.parentBone) < numBones) {
instance.boneMatrices[i] = instance.boneMatrices[bone.parentBone] * localTransform;
@ -1751,12 +1856,6 @@ void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
instance.boneMatrices[i] = localTransform;
}
}
if (!dumpedOnce) {
dumpedOnce = true;
// Dump final matrix for bone 0
auto& m = instance.boneMatrices[0];
core::Logger::getInstance().info("Bone 0 final matrix row0=(", m[0][0], ",", m[1][0], ",", m[2][0], ",", m[3][0], ")");
}
}
glm::mat4 CharacterRenderer::getBoneTransform(const pipeline::M2Bone& bone, float time, int sequenceIndex) {
@ -1791,22 +1890,19 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
uint32_t frameIndex = vkCtx_->getCurrentFrame();
uint32_t frameSlot = frameIndex % 2u;
// Reset transient material allocations once per frame slot.
// beginFrame() waits on this slot's fence before recording.
// Reset material ring buffer and descriptor pool once per frame slot.
if (lastMaterialPoolResetFrame_ != frameIndex) {
VmaAllocator alloc = vkCtx_->getAllocator();
for (const auto& b : transientMaterialUbos_[frameSlot]) {
if (b.first) {
vmaDestroyBuffer(alloc, b.first, b.second);
}
}
transientMaterialUbos_[frameSlot].clear();
materialRingOffset_[frameSlot] = 0;
if (materialDescPools_[frameSlot]) {
vkResetDescriptorPool(vkCtx_->getDevice(), materialDescPools_[frameSlot], 0);
}
lastMaterialPoolResetFrame_ = frameIndex;
}
// Pre-compute aligned UBO stride for ring buffer sub-allocation
const uint32_t uboStride = (sizeof(CharMaterialUBO) + materialUboAlignment_ - 1) & ~(materialUboAlignment_ - 1);
const uint32_t ringCapacityBytes = uboStride * MATERIAL_RING_CAPACITY;
// Bind per-frame descriptor set (set 0) -- shared across all draws
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
pipelineLayout_, 0, 1, &perFrameSet, 0, nullptr);
@ -1838,9 +1934,8 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
}
}
auto modelIt = models.find(instance.modelId);
if (modelIt == models.end()) continue;
const auto& gpuModel = modelIt->second;
if (!instance.cachedModel) continue;
const auto& gpuModel = *instance.cachedModel;
// Skip models without GPU buffers
if (!gpuModel.vertexBuffer) continue;
@ -2176,27 +2271,18 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
matData.heightMapVariance = batchHeightVariance;
matData.normalMapStrength = normalMapStrength_;
// Create a small UBO for this batch's material
VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
bci.size = sizeof(CharMaterialUBO);
bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
VmaAllocationCreateInfo aci{};
aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
aci.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
VmaAllocationInfo allocInfo{};
::VkBuffer matUBO = VK_NULL_HANDLE;
VmaAllocation matUBOAlloc = VK_NULL_HANDLE;
vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &matUBO, &matUBOAlloc, &allocInfo);
if (allocInfo.pMappedData) {
memcpy(allocInfo.pMappedData, &matData, sizeof(CharMaterialUBO));
}
// Sub-allocate material UBO from ring buffer
uint32_t matOffset = materialRingOffset_[frameSlot];
if (matOffset + uboStride > ringCapacityBytes) continue; // ring exhausted
memcpy(static_cast<char*>(materialRingMapped_[frameSlot]) + matOffset, &matData, sizeof(CharMaterialUBO));
materialRingOffset_[frameSlot] = matOffset + uboStride;
// Write descriptor set: binding 0 = texture, binding 1 = material UBO, binding 2 = normal/height map
VkTexture* bindTex = (texPtr && texPtr->isValid()) ? texPtr : whiteTexture_.get();
VkDescriptorImageInfo imgInfo = bindTex->descriptorInfo();
VkDescriptorBufferInfo bufInfo{};
bufInfo.buffer = matUBO;
bufInfo.offset = 0;
bufInfo.buffer = materialRingBuffer_[frameSlot];
bufInfo.offset = matOffset;
bufInfo.range = sizeof(CharMaterialUBO);
VkDescriptorImageInfo nhImgInfo = normalMap->descriptorInfo();
@ -2229,8 +2315,6 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
pipelineLayout_, 1, 1, &materialSet, 0, nullptr);
vkCmdDrawIndexed(cmd, batch.indexCount, 1, batch.indexStart, 0, 0);
transientMaterialUbos_[frameSlot].emplace_back(matUBO, matUBOAlloc);
}
} else {
// Draw entire model with first texture
@ -2271,24 +2355,16 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
matData.heightMapVariance = 0.0f;
matData.normalMapStrength = normalMapStrength_;
VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
bci.size = sizeof(CharMaterialUBO);
bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
VmaAllocationCreateInfo aci{};
aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
aci.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
VmaAllocationInfo allocInfo{};
::VkBuffer matUBO = VK_NULL_HANDLE;
VmaAllocation matUBOAlloc = VK_NULL_HANDLE;
vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &matUBO, &matUBOAlloc, &allocInfo);
if (allocInfo.pMappedData) {
memcpy(allocInfo.pMappedData, &matData, sizeof(CharMaterialUBO));
}
// Sub-allocate material UBO from ring buffer
uint32_t matOffset2 = materialRingOffset_[frameSlot];
if (matOffset2 + uboStride > ringCapacityBytes) continue; // ring exhausted
memcpy(static_cast<char*>(materialRingMapped_[frameSlot]) + matOffset2, &matData, sizeof(CharMaterialUBO));
materialRingOffset_[frameSlot] = matOffset2 + uboStride;
VkDescriptorImageInfo imgInfo = texPtr->descriptorInfo();
VkDescriptorBufferInfo bufInfo{};
bufInfo.buffer = matUBO;
bufInfo.offset = 0;
bufInfo.buffer = materialRingBuffer_[frameSlot];
bufInfo.offset = matOffset2;
bufInfo.range = sizeof(CharMaterialUBO);
VkDescriptorImageInfo nhImgInfo2 = flatNormalTexture_->descriptorInfo();
@ -2320,8 +2396,6 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
pipelineLayout_, 1, 1, &materialSet, 0, nullptr);
vkCmdDrawIndexed(cmd, gpuModel.indexCount, 1, 0, 0, 0);
transientMaterialUbos_[frameSlot].emplace_back(matUBO, matUBOAlloc);
}
}
}
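The render path above replaces a `vmaCreateBuffer`/`vmaDestroyBuffer` pair per draw with sub-allocation from one persistently mapped ring buffer per frame slot. The two core operations are the aligned-stride computation and the bump-with-capacity-check; a sketch (the real alignment must come from `VkPhysicalDeviceLimits::minUniformBufferOffsetAlignment`):

```cpp
#include <cstdint>

// Round `size` up to a power-of-two `alignment` (UBO offset alignment).
constexpr uint32_t alignUp(uint32_t size, uint32_t alignment) {
    return (size + alignment - 1) & ~(alignment - 1);
}

// Bump-allocate `stride` bytes from the ring; returns the byte offset,
// or UINT32_MAX when the ring is exhausted (caller skips the draw, as
// the `continue` in the diff does).
uint32_t ringAlloc(uint32_t& offset, uint32_t stride, uint32_t capacity) {
    if (offset + stride > capacity) return UINT32_MAX;
    uint32_t at = offset;
    offset += stride;
    return at;
}
```

Resetting the ring is then just `offset = 0` once per frame slot, which is what `materialRingOffset_[frameSlot] = 0;` does at the top of `render()`.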
@ -2513,9 +2587,8 @@ void CharacterRenderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& light
glm::vec3 diff = inst.position - shadowCenter;
if (glm::dot(diff, diff) > shadowRadiusSq) continue;
auto modelIt = models.find(inst.modelId);
if (modelIt == models.end()) continue;
const M2ModelGPU& gpuModel = modelIt->second;
if (!inst.cachedModel) continue;
const M2ModelGPU& gpuModel = *inst.cachedModel;
if (!gpuModel.vertexBuffer) continue;
glm::mat4 modelMat = inst.hasOverrideModelMatrix


@ -678,6 +678,7 @@ void M2Renderer::shutdown() {
instances.clear();
spatialGrid.clear();
instanceIndexById.clear();
instanceDedupMap_.clear();
// Delete cached textures
textureCache.clear();
@ -1184,6 +1185,10 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
}
}
// Batch all GPU uploads (VB, IB, textures) into a single command buffer
// submission with one fence wait, instead of one fence wait per upload.
vkCtx_->beginUploadBatch();
if (hasGeometry) {
// Create VBO with interleaved vertex data
// Format: position (3), normal (3), texcoord0 (2), texcoord1 (2), boneWeights (4), boneIndices (4 as float)
@ -1535,6 +1540,8 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
}
}
vkCtx_->endUploadBatch();
// Allocate Vulkan descriptor sets and UBOs for each batch
for (auto& bgpu : gpuModel.batches) {
// Create combined UBO for M2Params (binding 1) + M2Material (binding 2)
@ -1613,17 +1620,16 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
}
const auto& mdlRef = modelIt->second;
// Ground clutter is procedurally scattered and high-count; avoid O(N) dedup
// scans that can hitch when new tiles stream in.
// Deduplicate: skip if same model already at nearly the same position.
// Uses hash map for O(1) lookup instead of O(N) scan.
if (!mdlRef.isGroundDetail) {
// Deduplicate: skip if same model already at nearly the same position
for (const auto& existing : instances) {
if (existing.modelId == modelId) {
glm::vec3 d = existing.position - position;
if (glm::dot(d, d) < 0.01f) {
return existing.id;
}
}
DedupKey dk{modelId,
static_cast<int32_t>(std::round(position.x * 10.0f)),
static_cast<int32_t>(std::round(position.y * 10.0f)),
static_cast<int32_t>(std::round(position.z * 10.0f))};
auto dit = instanceDedupMap_.find(dk);
if (dit != instanceDedupMap_.end()) {
return dit->second;
}
}
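The dedup scheme above quantizes positions to a 0.1-unit grid so "nearly the same position" becomes exact key equality, turning the old O(N) scan into one hash lookup. A self-contained sketch of the idea (the real `DedupKey` and its hash live in the renderer's header; this hash combine is illustrative):

```cpp
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <unordered_map>

struct DedupKey {
    uint32_t modelId;
    int32_t qx, qy, qz;  // position * 10, rounded to the nearest cell
    bool operator==(const DedupKey& o) const {
        return modelId == o.modelId && qx == o.qx && qy == o.qy && qz == o.qz;
    }
};

struct DedupKeyHash {
    std::size_t operator()(const DedupKey& k) const {
        std::size_t h = k.modelId;
        h = h * 31 + static_cast<std::size_t>(k.qx);
        h = h * 31 + static_cast<std::size_t>(k.qy);
        h = h * 31 + static_cast<std::size_t>(k.qz);
        return h;
    }
};

DedupKey makeKey(uint32_t modelId, float x, float y, float z) {
    return DedupKey{modelId,
                    static_cast<int32_t>(std::round(x * 10.0f)),
                    static_cast<int32_t>(std::round(y * 10.0f)),
                    static_cast<int32_t>(std::round(z * 10.0f))};
}
```

One trade-off versus the old radius check: two instances that straddle a cell boundary land in different cells and are not deduplicated, so quantization approximates (rather than exactly reproduces) the previous `dot(d, d) < 0.01f` test.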
@ -1651,6 +1657,7 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
instance.cachedIsInvisibleTrap = mdlRef.isInvisibleTrap;
instance.cachedIsInstancePortal = mdlRef.isInstancePortal;
instance.cachedIsValid = mdlRef.isValid();
instance.cachedModel = &mdlRef;
// Initialize animation: play first sequence (usually Stand/Idle)
const auto& mdl = mdlRef;
@ -1662,6 +1669,15 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
instance.variationTimer = 3000.0f + static_cast<float>(rand() % 8000);
}
// Register in dedup map before pushing (uses original position, not ground-adjusted)
if (!mdlRef.isGroundDetail) {
DedupKey dk{modelId,
static_cast<int32_t>(std::round(position.x * 10.0f)),
static_cast<int32_t>(std::round(position.y * 10.0f)),
static_cast<int32_t>(std::round(position.z * 10.0f))};
instanceDedupMap_[dk] = instance.id;
}
instances.push_back(instance);
size_t idx = instances.size() - 1;
// Track special instances for fast-path iteration
@ -1700,13 +1716,15 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
return 0;
}
// Deduplicate: skip if same model already at nearly the same position
for (const auto& existing : instances) {
if (existing.modelId == modelId) {
glm::vec3 d = existing.position - position;
if (glm::dot(d, d) < 0.01f) {
return existing.id;
}
// Deduplicate: O(1) hash lookup
{
DedupKey dk{modelId,
static_cast<int32_t>(std::round(position.x * 10.0f)),
static_cast<int32_t>(std::round(position.y * 10.0f)),
static_cast<int32_t>(std::round(position.z * 10.0f))};
auto dit = instanceDedupMap_.find(dk);
if (dit != instanceDedupMap_.end()) {
return dit->second;
}
}
@ -1731,6 +1749,7 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
instance.cachedIsGroundDetail = mdl2.isGroundDetail;
instance.cachedIsInvisibleTrap = mdl2.isInvisibleTrap;
instance.cachedIsValid = mdl2.isValid();
instance.cachedModel = &mdl2;
// Initialize animation
if (mdl2.hasAnimation && !mdl2.disableAnimation && !mdl2.sequences.empty()) {
@ -1743,6 +1762,15 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
instance.animTime = static_cast<float>(rand()) / RAND_MAX * 10000.0f;
}
// Register in dedup map
{
DedupKey dk{modelId,
static_cast<int32_t>(std::round(position.x * 10.0f)),
static_cast<int32_t>(std::round(position.y * 10.0f)),
static_cast<int32_t>(std::round(position.z * 10.0f))};
instanceDedupMap_[dk] = instance.id;
}
instances.push_back(instance);
size_t idx = instances.size() - 1;
if (mdl2.isSmoke) {
@ -2000,9 +2028,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
instance.animTime += dtMs * (instance.animSpeed - 1.0f);
// For animation looping/variation, we need the actual model data.
auto it = models.find(instance.modelId);
if (it == models.end()) continue;
const M2ModelGPU& model = it->second;
if (!instance.cachedModel) continue;
const M2ModelGPU& model = *instance.cachedModel;
// Validate sequence index
if (instance.currentSequenceIndex < 0 ||
@ -2058,6 +2085,14 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f);
if (cullRadius > 0.0f && !updateFrustum.intersectsSphere(instance.position, paddedRadius)) continue;
// Distance-based frame skipping: update distant bones less frequently
uint32_t boneInterval = 1;
if (distSq > 200.0f * 200.0f) boneInterval = 8;
else if (distSq > 100.0f * 100.0f) boneInterval = 4;
else if (distSq > 50.0f * 50.0f) boneInterval = 2;
instance.frameSkipCounter++;
if ((instance.frameSkipCounter % boneInterval) != 0) continue;
boneWorkIndices_.push_back(idx);
}
@ -2071,9 +2106,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
for (size_t i : boneWorkIndices_) {
if (i >= instances.size()) continue;
auto& inst = instances[i];
auto mdlIt = models.find(inst.modelId);
if (mdlIt == models.end()) continue;
computeBoneMatrices(mdlIt->second, inst);
if (!inst.cachedModel) continue;
computeBoneMatrices(*inst.cachedModel, inst);
}
} else {
// Parallel — dispatch across worker threads
@ -2086,9 +2120,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
for (size_t i : boneWorkIndices_) {
if (i >= instances.size()) continue;
auto& inst = instances[i];
auto mdlIt = models.find(inst.modelId);
if (mdlIt == models.end()) continue;
computeBoneMatrices(mdlIt->second, inst);
if (!inst.cachedModel) continue;
computeBoneMatrices(*inst.cachedModel, inst);
}
} else {
const size_t chunkSize = animCount / numThreads;
@ -2109,9 +2142,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
size_t idx = boneWorkIndices_[j];
if (idx >= instances.size()) continue;
auto& inst = instances[idx];
auto mdlIt = models.find(inst.modelId);
if (mdlIt == models.end()) continue;
computeBoneMatrices(mdlIt->second, inst);
if (!inst.cachedModel) continue;
computeBoneMatrices(*inst.cachedModel, inst);
}
}));
start = end;
@ -2133,9 +2165,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
glm::vec3 toCam = instance.position - cachedCamPos_;
float distSq = glm::dot(toCam, toCam);
if (distSq > cachedMaxRenderDistSq_) continue;
auto mdlIt = models.find(instance.modelId);
if (mdlIt == models.end()) continue;
emitParticles(instance, mdlIt->second, deltaTime);
if (!instance.cachedModel) continue;
emitParticles(instance, *instance.cachedModel, deltaTime);
updateParticles(instance, deltaTime);
}
@ -2839,9 +2870,8 @@ void M2Renderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& lightSpaceMa
glm::vec3 diff = instance.position - shadowCenter;
if (glm::dot(diff, diff) > shadowRadiusSq) continue;
auto modelIt = models.find(instance.modelId);
if (modelIt == models.end()) continue;
const M2ModelGPU& model = modelIt->second;
if (!instance.cachedModel) continue;
const M2ModelGPU& model = *instance.cachedModel;
// Filter: only draw foliage models in foliage pass, non-foliage in non-foliage pass
if (model.shadowWindFoliage != foliagePass) continue;
@ -2947,8 +2977,7 @@ std::vector<glm::vec3> M2Renderer::getWaterVegetationPositions(const glm::vec3&
std::vector<glm::vec3> result;
float maxDistSq = maxDist * maxDist;
for (const auto& inst : instances) {
auto it = models.find(inst.modelId);
if (it == models.end() || !it->second.isWaterVegetation) continue;
if (!inst.cachedModel || !inst.cachedModel->isWaterVegetation) continue;
glm::vec3 diff = inst.position - camPos;
if (glm::dot(diff, diff) <= maxDistSq) {
result.push_back(inst.position);
@ -3059,9 +3088,8 @@ void M2Renderer::emitParticles(M2Instance& inst, const M2ModelGPU& gpu, float dt
}
void M2Renderer::updateParticles(M2Instance& inst, float dt) {
auto it = models.find(inst.modelId);
if (it == models.end()) return;
const auto& gpu = it->second;
if (!inst.cachedModel) return;
const auto& gpu = *inst.cachedModel;
for (size_t i = 0; i < inst.particles.size(); ) {
auto& p = inst.particles[i];
@ -3136,9 +3164,8 @@ void M2Renderer::renderM2Particles(VkCommandBuffer cmd, VkDescriptorSet perFrame
for (auto& inst : instances) {
if (inst.particles.empty()) continue;
auto it = models.find(inst.modelId);
if (it == models.end()) continue;
const auto& gpu = it->second;
if (!inst.cachedModel) continue;
const auto& gpu = *inst.cachedModel;
for (const auto& p : inst.particles) {
if (p.emitterIndex < 0 || p.emitterIndex >= static_cast<int>(gpu.particleEmitters.size())) continue;
@ -3477,6 +3504,7 @@ void M2Renderer::clear() {
instances.clear();
spatialGrid.clear();
instanceIndexById.clear();
instanceDedupMap_.clear();
smokeParticles.clear();
smokeInstanceIndices_.clear();
portalInstanceIndices_.clear();
@ -3513,6 +3541,7 @@ M2Renderer::GridCell M2Renderer::toCell(const glm::vec3& p) const {
void M2Renderer::rebuildSpatialIndex() {
spatialGrid.clear();
instanceIndexById.clear();
instanceDedupMap_.clear();
instanceIndexById.reserve(instances.size());
smokeInstanceIndices_.clear();
portalInstanceIndices_.clear();
@ -3521,9 +3550,22 @@ void M2Renderer::rebuildSpatialIndex() {
particleInstanceIndices_.clear();
for (size_t i = 0; i < instances.size(); i++) {
const auto& inst = instances[i];
auto& inst = instances[i];
instanceIndexById[inst.id] = i;
// Re-cache model pointer (may have changed after model map modifications)
auto mdlIt = models.find(inst.modelId);
inst.cachedModel = (mdlIt != models.end()) ? &mdlIt->second : nullptr;
// Rebuild dedup map (skip ground detail)
if (!inst.cachedIsGroundDetail) {
DedupKey dk{inst.modelId,
static_cast<int32_t>(std::round(inst.position.x * 10.0f)),
static_cast<int32_t>(std::round(inst.position.y * 10.0f)),
static_cast<int32_t>(std::round(inst.position.z * 10.0f))};
instanceDedupMap_[dk] = inst.id;
}
if (inst.cachedIsSmoke) {
smokeInstanceIndices_.push_back(i);
}
@ -3647,8 +3689,18 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) {
containsToken(key, "campfire") ||
containsToken(key, "bonfire");
// Load BLP texture
pipeline::BLPImage blp = assetManager->loadTexture(key);
// Check pre-decoded BLP cache first (populated by background worker threads)
pipeline::BLPImage blp;
if (predecodedBLPCache_) {
auto pit = predecodedBLPCache_->find(key);
if (pit != predecodedBLPCache_->end()) {
blp = std::move(pit->second);
predecodedBLPCache_->erase(pit);
}
}
if (!blp.isValid()) {
blp = assetManager->loadTexture(key);
}
if (!blp.isValid()) {
// Return white fallback but don't cache the failure — MPQ reads can
// fail transiently during streaming; allow retry on next model load.
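The cache lookup above uses take-or-decode semantics: a hit is moved out and erased, so each pre-decoded image is consumed exactly once and its memory is released immediately; a miss falls back to synchronous decode. A generic sketch of that shape (types and names are illustrative, not the engine's):

```cpp
#include <string>
#include <unordered_map>

struct Image {
    std::string pixels;  // stand-in for decoded BLP data
    bool isValid() const { return !pixels.empty(); }
};

// Move the entry out of the cache on hit (consume-once), otherwise
// fall back to the blocking decode path. `cache` may be null when no
// background pre-decoder is attached.
Image takeOrDecode(std::unordered_map<std::string, Image>* cache,
                   const std::string& key,
                   Image (*decodeSync)(const std::string&)) {
    Image img;
    if (cache) {
        auto it = cache->find(key);
        if (it != cache->end()) {
            img = std::move(it->second);
            cache->erase(it);
        }
    }
    if (!img.isValid()) img = decodeSync(key);
    return img;
}
```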
@ -3714,9 +3766,8 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) {
uint32_t M2Renderer::getTotalTriangleCount() const {
uint32_t total = 0;
for (const auto& instance : instances) {
auto it = models.find(instance.modelId);
if (it != models.end()) {
total += it->second.indexCount / 3;
if (instance.cachedModel) {
total += instance.cachedModel->indexCount / 3;
}
}
return total;
@ -3738,11 +3789,10 @@ std::optional<float> M2Renderer::getFloorHeight(float glX, float glY, float glZ,
continue;
}
auto it = models.find(instance.modelId);
if (it == models.end()) continue;
if (!instance.cachedModel) continue;
if (instance.scale <= 0.001f) continue;
const M2ModelGPU& model = it->second;
const M2ModelGPU& model = *instance.cachedModel;
if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
if (instance.skipCollision) continue;
@ -3894,10 +3944,9 @@ bool M2Renderer::checkCollision(const glm::vec3& from, const glm::vec3& to,
if (from.z > instance.worldBoundsMax.z + 2.5f && adjustedPos.z > instance.worldBoundsMax.z + 2.5f) continue;
if (from.z + 2.5f < instance.worldBoundsMin.z && adjustedPos.z + 2.5f < instance.worldBoundsMin.z) continue;
auto it = models.find(instance.modelId);
if (it == models.end()) continue;
if (!instance.cachedModel) continue;
const M2ModelGPU& model = it->second;
const M2ModelGPU& model = *instance.cachedModel;
if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
if (instance.skipCollision) continue;
if (instance.scale <= 0.001f) continue;
@ -4135,10 +4184,9 @@ float M2Renderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3&
continue;
}
auto it = models.find(instance.modelId);
if (it == models.end()) continue;
if (!instance.cachedModel) continue;
const M2ModelGPU& model = it->second;
const M2ModelGPU& model = *instance.cachedModel;
if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
glm::vec3 localMin, localMax;
getTightCollisionBounds(model, localMin, localMax);


@ -2434,6 +2434,9 @@ void Renderer::update(float deltaTime) {
cameraController->update(deltaTime);
auto cameraEnd = std::chrono::steady_clock::now();
lastCameraUpdateMs = std::chrono::duration<double, std::milli>(cameraEnd - cameraStart).count();
if (lastCameraUpdateMs > 3.0) {
LOG_WARNING("SLOW cameraController->update: ", lastCameraUpdateMs, "ms");
}
// Update 3D audio listener position/orientation to match camera
if (camera) {
@ -2527,7 +2530,13 @@ void Renderer::update(float deltaTime) {
// Update terrain streaming
if (terrainManager && camera) {
auto terrStart = std::chrono::steady_clock::now();
terrainManager->update(*camera, deltaTime);
float terrMs = std::chrono::duration<float, std::milli>(
std::chrono::steady_clock::now() - terrStart).count();
if (terrMs > 5.0f) {
LOG_WARNING("SLOW terrainManager->update: ", terrMs, "ms");
}
}
// Update sky system (skybox time, star twinkle, clouds, celestial moon phases)
@ -2579,7 +2588,14 @@ void Renderer::update(float deltaTime) {
// Update character animations
if (characterRenderer && camera) {
auto charAnimStart = std::chrono::steady_clock::now();
characterRenderer->update(deltaTime, camera->getPosition());
float charAnimMs = std::chrono::duration<float, std::milli>(
std::chrono::steady_clock::now() - charAnimStart).count();
if (charAnimMs > 5.0f) {
LOG_WARNING("SLOW characterRenderer->update: ", charAnimMs, "ms (",
characterRenderer->getInstanceCount(), " instances)");
}
}
// Update AudioEngine (cleanup finished sounds, etc.)
@ -2766,8 +2782,15 @@ void Renderer::update(float deltaTime) {
// Update M2 doodad animations (pass camera for frustum-culling bone computation)
if (m2Renderer && camera) {
auto m2Start = std::chrono::steady_clock::now();
m2Renderer->update(deltaTime, camera->getPosition(),
camera->getProjectionMatrix() * camera->getViewMatrix());
float m2Ms = std::chrono::duration<float, std::milli>(
std::chrono::steady_clock::now() - m2Start).count();
if (m2Ms > 3.0f) {
LOG_WARNING("SLOW m2Renderer->update: ", m2Ms, "ms (",
m2Renderer->getInstanceCount(), " instances)");
}
}
// Helper: play zone music, dispatching local files (file: prefix) vs MPQ paths
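Each instrumented call in `Renderer::update` repeats the same start/measure/warn-if-slow shape. That pattern could be factored into a small RAII guard; a hedged sketch (the engine's `LOG_WARNING` is replaced here by an injectable callback so the example stands alone):

```cpp
#include <cassert>
#include <chrono>
#include <functional>
#include <string>

// Invokes a callback when the guarded scope exceeds a millisecond budget.
class ScopedBudget {
public:
    ScopedBudget(std::string label, double budgetMs,
                 std::function<void(const std::string&, double)> onSlow)
        : label_(std::move(label)), budgetMs_(budgetMs),
          onSlow_(std::move(onSlow)),
          start_(std::chrono::steady_clock::now()) {}

    ~ScopedBudget() {
        double ms = std::chrono::duration<double, std::milli>(
            std::chrono::steady_clock::now() - start_).count();
        if (ms > budgetMs_ && onSlow_) onSlow_(label_, ms);  // over budget
    }

private:
    std::string label_;
    double budgetMs_;
    std::function<void(const std::string&, double)> onSlow_;
    std::chrono::steady_clock::time_point start_;
};
```

Usage would shrink each instrumented block to one declaration, e.g. `ScopedBudget g("terrainManager->update", 5.0, warnFn);` at the top of the scope.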


@ -1,5 +1,6 @@
#include "rendering/terrain_manager.hpp"
#include "rendering/terrain_renderer.hpp"
#include "rendering/vk_context.hpp"
#include "rendering/water_renderer.hpp"
#include "rendering/m2_renderer.hpp"
#include "rendering/wmo_renderer.hpp"
@ -53,12 +54,12 @@ int computeTerrainWorkerCount() {
unsigned hc = std::thread::hardware_concurrency();
if (hc > 0) {
// Terrain streaming should leave CPU room for render/update threads.
const unsigned availableCores = (hc > 1u) ? (hc - 1u) : 1u;
const unsigned targetWorkers = std::max(2u, availableCores / 2u);
// Use most cores for loading — leave 1-2 for render/update threads.
const unsigned reserved = (hc >= 8u) ? 2u : 1u;
const unsigned targetWorkers = std::max(4u, hc - reserved);
return static_cast<int>(targetWorkers);
}
return 2; // Fallback
return 4; // Fallback
}
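The new worker-count heuristic can be reproduced as a pure function for sanity-checking across core counts (this sketch mirrors the diff exactly; `hardware_concurrency` is passed in as a parameter so the arithmetic is testable):

```cpp
#include <algorithm>
#include <cassert>

// Mirrors the updated heuristic: reserve 1-2 cores for render/update
// threads, give the rest to terrain loading, never fewer than 4 workers.
inline int terrainWorkerCount(unsigned hc) {
    if (hc == 0) return 4;  // hardware_concurrency() may legally report 0
    const unsigned reserved = (hc >= 8u) ? 2u : 1u;
    return static_cast<int>(std::max(4u, hc - reserved));
}
```

Note the floor of 4 means low-core machines (≤5 hardware threads) oversubscribe slightly, trading frame-time headroom for streaming throughput.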
bool decodeLayerAlpha(const pipeline::MapChunk& chunk, size_t layerIdx, std::vector<uint8_t>& outAlpha) {
@ -230,9 +231,14 @@ bool TerrainManager::loadTile(int x, int y) {
return false;
}
VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
if (vkCtx) vkCtx->beginUploadBatch();
FinalizingTile ft;
ft.pending = std::move(pending);
while (!advanceFinalization(ft)) {}
if (vkCtx) vkCtx->endUploadBatchSync(); // Sync — caller expects tile ready
return true;
}
@ -372,6 +378,15 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
int& skippedSkinNotFound) -> bool {
if (preparedModelIds.find(modelId) != preparedModelIds.end()) return true;
// Skip file I/O + parsing for models already uploaded to GPU from previous tiles
{
std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
if (uploadedM2Ids_.count(modelId)) {
preparedModelIds.insert(modelId);
return true;
}
}
std::vector<uint8_t> m2Data = assetManager->readFile(m2Path);
if (m2Data.empty()) {
skippedFileNotFound++;
@ -397,6 +412,20 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
return false;
}
// Pre-decode M2 model textures on background thread
for (const auto& tex : m2Model.textures) {
if (tex.filename.empty()) continue;
std::string texKey = tex.filename;
std::replace(texKey.begin(), texKey.end(), '/', '\\');
std::transform(texKey.begin(), texKey.end(), texKey.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue;
auto blp = assetManager->loadTexture(texKey);
if (blp.isValid()) {
pending->preloadedM2Textures[texKey] = std::move(blp);
}
}
PendingTile::M2Ready ready;
ready.modelId = modelId;
ready.model = std::move(m2Model);
@ -551,10 +580,20 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
}
uint32_t doodadModelId = static_cast<uint32_t>(std::hash<std::string>{}(m2Path));
// Skip file I/O if model already uploaded from a previous tile
bool modelAlreadyUploaded = false;
{
std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
modelAlreadyUploaded = uploadedM2Ids_.count(doodadModelId) > 0;
}
pipeline::M2Model m2Model;
if (!modelAlreadyUploaded) {
std::vector<uint8_t> m2Data = assetManager->readFile(m2Path);
if (m2Data.empty()) continue;
pipeline::M2Model m2Model = pipeline::M2Loader::load(m2Data);
m2Model = pipeline::M2Loader::load(m2Data);
if (m2Model.name.empty()) {
m2Model.name = m2Path;
}
@ -565,6 +604,21 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
}
if (!m2Model.isValid()) continue;
// Pre-decode doodad M2 textures on background thread
for (const auto& tex : m2Model.textures) {
if (tex.filename.empty()) continue;
std::string texKey = tex.filename;
std::replace(texKey.begin(), texKey.end(), '/', '\\');
std::transform(texKey.begin(), texKey.end(), texKey.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue;
auto blp = assetManager->loadTexture(texKey);
if (blp.isValid()) {
pending->preloadedM2Textures[texKey] = std::move(blp);
}
}
}
// Build doodad's local transform (WoW coordinates)
// WMO doodads use quaternion rotation
glm::quat fixedRotation(doodad.rotation.w, doodad.rotation.x, doodad.rotation.y, doodad.rotation.z);
@ -633,6 +687,32 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
}
}
// Pre-decode WMO textures on background thread
for (const auto& texPath : wmoModel.textures) {
if (texPath.empty()) continue;
std::string texKey = texPath;
// Truncate at NUL (WMO paths can have stray bytes)
size_t nul = texKey.find('\0');
if (nul != std::string::npos) texKey.resize(nul);
std::replace(texKey.begin(), texKey.end(), '/', '\\');
std::transform(texKey.begin(), texKey.end(), texKey.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
if (texKey.empty()) continue;
if (pending->preloadedWMOTextures.find(texKey) != pending->preloadedWMOTextures.end()) continue;
// Try .blp variant
std::string blpKey = texKey;
if (blpKey.size() >= 4) {
std::string ext = blpKey.substr(blpKey.size() - 4);
if (ext == ".tga" || ext == ".dds") {
blpKey = blpKey.substr(0, blpKey.size() - 4) + ".blp";
}
}
auto blp = assetManager->loadTexture(blpKey);
if (blp.isValid()) {
pending->preloadedWMOTextures[blpKey] = std::move(blp);
}
}
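The key normalization repeated across the M2, doodad, and WMO pre-decode loops (NUL truncation, slash flip, lowercase, plus the WMO-only `.tga`/`.dds` → `.blp` swap) could live in one helper; a sketch combining all four steps:

```cpp
#include <algorithm>
#include <cassert>
#include <cctype>
#include <string>

// Normalize a texture path into the cache-key form used by the pre-decode
// loops: truncate at NUL, '/' -> '\', lowercase, and map .tga/.dds to .blp.
inline std::string normalizeTexKey(std::string key) {
    size_t nul = key.find('\0');
    if (nul != std::string::npos) key.resize(nul);  // strip stray WMO bytes
    std::replace(key.begin(), key.end(), '/', '\\');
    std::transform(key.begin(), key.end(), key.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    if (key.size() >= 4) {
        std::string ext = key.substr(key.size() - 4);
        if (ext == ".tga" || ext == ".dds") {
            key = key.substr(0, key.size() - 4) + ".blp";
        }
    }
    return key;
}
```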
PendingTile::WMOReady ready;
// Cache WMO model uploads by path; placement dedup uses uniqueId separately.
ready.modelId = static_cast<uint32_t>(std::hash<std::string>{}(wmoPath));
@ -695,15 +775,20 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
return true;
}
// Upload pre-loaded textures (once)
if (!ft.terrainPreloaded) {
LOG_DEBUG("Finalizing tile [", x, ",", y, "] (incremental)");
// Upload pre-loaded textures
if (!pending->preloadedTextures.empty()) {
terrainRenderer->uploadPreloadedTextures(pending->preloadedTextures);
}
ft.terrainPreloaded = true;
// Yield after preload to give time budget a chance to interrupt
return false;
}
// Upload terrain mesh to GPU
if (!terrainRenderer->loadTerrain(pending->mesh, pending->terrain.textures, x, y)) {
// Upload terrain chunks incrementally (32 per call to spread across frames)
if (!ft.terrainMeshDone) {
if (pending->mesh.validChunkCount == 0) {
LOG_ERROR("Failed to upload terrain to GPU for tile [", x, ",", y, "]");
failedTiles[coord] = true;
{
@ -713,9 +798,16 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
ft.phase = FinalizationPhase::DONE;
return true;
}
bool allDone = terrainRenderer->loadTerrainIncremental(
pending->mesh, pending->terrain.textures, x, y,
ft.terrainChunkNext, 32);
if (!allDone) {
return false; // More chunks remain — yield to time budget
}
ft.terrainMeshDone = true;
}
// Load water immediately after terrain (same frame) — water is now
// deduplicated to ~1-2 merged surfaces per tile, so this is fast.
// Load water after all terrain chunks are uploaded
if (waterRenderer) {
size_t beforeSurfaces = waterRenderer->getSurfaceCount();
waterRenderer->loadFromTerrain(pending->terrain, true, x, y);
@ -738,13 +830,24 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
}
case FinalizationPhase::M2_MODELS: {
// Upload ONE M2 model per call
// Upload multiple M2 models per call (batched GPU uploads)
if (m2Renderer && ft.m2ModelIndex < pending->m2Models.size()) {
// Set pre-decoded BLP cache so loadTexture() skips main-thread BLP decode
m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
constexpr size_t kModelsPerStep = 4;
size_t uploaded = 0;
while (ft.m2ModelIndex < pending->m2Models.size() && uploaded < kModelsPerStep) {
auto& m2Ready = pending->m2Models[ft.m2ModelIndex];
if (m2Renderer->loadModel(m2Ready.model, m2Ready.modelId)) {
ft.uploadedM2ModelIds.insert(m2Ready.modelId);
// Track uploaded model IDs so background threads can skip re-reading
std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
uploadedM2Ids_.insert(m2Ready.modelId);
}
ft.m2ModelIndex++;
uploaded++;
}
m2Renderer->setPredecodedBLPCache(nullptr);
// Stay in this phase until all models uploaded
if (ft.m2ModelIndex < pending->m2Models.size()) {
return false;
@ -786,23 +889,29 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
}
case FinalizationPhase::WMO_MODELS: {
// Upload ONE WMO model per call
// Upload WMO models with batched GPU uploads (paced by kWmosPerStep)
if (wmoRenderer && assetManager) {
wmoRenderer->initialize(nullptr, VK_NULL_HANDLE, assetManager);
// Set pre-decoded BLP cache and defer normal maps during streaming
wmoRenderer->setPredecodedBLPCache(&pending->preloadedWMOTextures);
wmoRenderer->setDeferNormalMaps(true);
if (ft.wmoModelIndex < pending->wmoModels.size()) {
constexpr size_t kWmosPerStep = 1;
size_t uploaded = 0;
while (ft.wmoModelIndex < pending->wmoModels.size() && uploaded < kWmosPerStep) {
auto& wmoReady = pending->wmoModels[ft.wmoModelIndex];
// Deduplicate
if (wmoReady.uniqueId != 0 && placedWmoIds.count(wmoReady.uniqueId)) {
ft.wmoModelIndex++;
if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
} else {
wmoRenderer->loadModel(wmoReady.model, wmoReady.modelId);
ft.wmoModelIndex++;
uploaded++;
}
}
wmoRenderer->setDeferNormalMaps(false);
wmoRenderer->setPredecodedBLPCache(nullptr);
if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
}
}
}
ft.phase = FinalizationPhase::WMO_INSTANCES;
return false;
}
@ -862,10 +971,18 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
}
case FinalizationPhase::WMO_DOODADS: {
// Upload ONE WMO doodad M2 per call
// Upload multiple WMO doodad M2s per call (batched GPU uploads)
if (m2Renderer && ft.wmoDoodadIndex < pending->wmoDoodads.size()) {
// Set pre-decoded BLP cache for doodad M2 textures
m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
constexpr size_t kDoodadsPerStep = 4;
size_t uploaded = 0;
while (ft.wmoDoodadIndex < pending->wmoDoodads.size() && uploaded < kDoodadsPerStep) {
auto& doodad = pending->wmoDoodads[ft.wmoDoodadIndex];
m2Renderer->loadModel(doodad.model, doodad.modelId);
if (m2Renderer->loadModel(doodad.model, doodad.modelId)) {
std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
uploadedM2Ids_.insert(doodad.modelId);
}
uint32_t wmoDoodadInstId = m2Renderer->createInstanceWithMatrix(
doodad.modelId, doodad.modelMatrix, doodad.worldPosition);
if (wmoDoodadInstId) {
@ -873,6 +990,9 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
ft.m2InstanceIds.push_back(wmoDoodadInstId);
}
ft.wmoDoodadIndex++;
uploaded++;
}
m2Renderer->setPredecodedBLPCache(nullptr);
if (ft.wmoDoodadIndex < pending->wmoDoodads.size()) return false;
}
ft.phase = FinalizationPhase::WATER;
@ -1030,11 +1150,6 @@ void TerrainManager::workerLoop() {
}
void TerrainManager::processReadyTiles() {
// Process tiles with time budget to avoid frame spikes
// Taxi mode gets a slightly larger budget to avoid visible late-pop terrain/models.
const float timeBudgetMs = taxiStreamingMode_ ? 8.0f : 5.0f;
auto startTime = std::chrono::high_resolution_clock::now();
// Move newly ready tiles into the finalizing deque.
// Keep them in pendingTiles so streamTiles() won't re-enqueue them.
{
@ -1050,21 +1165,32 @@ void TerrainManager::processReadyTiles() {
}
}
// Drive incremental finalization within time budget
while (!finalizingTiles_.empty()) {
VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
// Reclaim completed async uploads from previous frames (non-blocking)
if (vkCtx) vkCtx->pollUploadBatches();
// Nothing to finalize — done.
if (finalizingTiles_.empty()) return;
// Async upload batch: record GPU copies into a command buffer, submit with
// a fence, but DON'T wait. The fence is polled on subsequent frames.
// This eliminates the main-thread stall from vkWaitForFences entirely.
const int maxSteps = taxiStreamingMode_ ? 8 : 2;
int steps = 0;
if (vkCtx) vkCtx->beginUploadBatch();
while (!finalizingTiles_.empty() && steps < maxSteps) {
auto& ft = finalizingTiles_.front();
bool done = advanceFinalization(ft);
if (done) {
finalizingTiles_.pop_front();
}
steps++;
}
auto now = std::chrono::high_resolution_clock::now();
float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
if (elapsedMs >= timeBudgetMs) {
break;
}
}
if (vkCtx) vkCtx->endUploadBatch(); // Async — submits but doesn't wait
}
void TerrainManager::processAllReadyTiles() {
@ -1082,12 +1208,19 @@ void TerrainManager::processAllReadyTiles() {
}
}
}
// Batch all GPU uploads across all tiles into a single submission
VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
if (vkCtx) vkCtx->beginUploadBatch();
// Finalize all tiles completely (no time budget — used for loading screens)
while (!finalizingTiles_.empty()) {
auto& ft = finalizingTiles_.front();
while (!advanceFinalization(ft)) {}
finalizingTiles_.pop_front();
}
if (vkCtx) vkCtx->endUploadBatchSync(); // Sync — load screen needs data ready
}
void TerrainManager::processOneReadyTile() {
@ -1106,9 +1239,14 @@ void TerrainManager::processOneReadyTile() {
}
// Finalize ONE tile completely, then return so caller can update the screen
if (!finalizingTiles_.empty()) {
VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
if (vkCtx) vkCtx->beginUploadBatch();
auto& ft = finalizingTiles_.front();
while (!advanceFinalization(ft)) {}
finalizingTiles_.pop_front();
if (vkCtx) vkCtx->endUploadBatchSync(); // Sync — load screen needs data ready
}
}
@ -1328,6 +1466,10 @@ void TerrainManager::unloadAll() {
finalizingTiles_.clear();
placedDoodadIds.clear();
placedWmoIds.clear();
{
std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
uploadedM2Ids_.clear();
}
LOG_INFO("Unloading all terrain tiles");
loadedTiles.clear();
@ -1376,6 +1518,10 @@ void TerrainManager::softReset() {
finalizingTiles_.clear();
placedDoodadIds.clear();
placedWmoIds.clear();
{
std::lock_guard<std::mutex> lock(uploadedM2IdsMutex_);
uploadedM2Ids_.clear();
}
// Clear tile cache — keys are (x,y) without map name, so stale entries from
// a different map with overlapping coordinates would produce wrong geometry.
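The `uploadedM2Ids_` set above is written from main-thread finalization and read from background prepare threads, so every touch happens under `uploadedM2IdsMutex_`. The access pattern, reduced to a self-contained sketch (class name hypothetical):

```cpp
#include <cassert>
#include <cstdint>
#include <mutex>
#include <unordered_set>

// Dedup registry shared between the main-thread finalizer (insert) and
// background tile-prepare workers (lookup) — every access holds the mutex.
class UploadedModelRegistry {
public:
    void markUploaded(uint32_t id) {
        std::lock_guard<std::mutex> lock(mutex_);
        ids_.insert(id);
    }
    bool alreadyUploaded(uint32_t id) const {
        std::lock_guard<std::mutex> lock(mutex_);
        return ids_.count(id) > 0;
    }
    void clear() {  // called from unloadAll()/softReset() equivalents
        std::lock_guard<std::mutex> lock(mutex_);
        ids_.clear();
    }
private:
    mutable std::mutex mutex_;
    std::unordered_set<uint32_t> ids_;
};
```

A positive lookup lets a prepare worker skip the file read and `M2Loader::load` parse entirely, which is the point of the change.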


@ -326,6 +326,8 @@ bool TerrainRenderer::loadTerrain(const pipeline::TerrainMesh& mesh,
}
LOG_DEBUG("Loading terrain mesh: ", mesh.validChunkCount, " chunks");
vkCtx->beginUploadBatch();
for (int y = 0; y < 16; y++) {
for (int x = 0; x < 16; x++) {
const auto& chunk = mesh.getChunk(x, y);
@ -405,10 +407,102 @@ bool TerrainRenderer::loadTerrain(const pipeline::TerrainMesh& mesh,
}
}
vkCtx->endUploadBatch();
LOG_DEBUG("Loaded ", chunks.size(), " terrain chunks to GPU");
return !chunks.empty();
}
bool TerrainRenderer::loadTerrainIncremental(const pipeline::TerrainMesh& mesh,
const std::vector<std::string>& texturePaths,
int tileX, int tileY,
int& chunkIndex, int maxChunksPerCall) {
// Batch all GPU uploads (VBs, IBs, textures) into a single command buffer
// submission with one fence wait, instead of one per buffer/texture.
vkCtx->beginUploadBatch();
int uploaded = 0;
while (chunkIndex < 256 && uploaded < maxChunksPerCall) {
int cy = chunkIndex / 16;
int cx = chunkIndex % 16;
chunkIndex++;
const auto& chunk = mesh.getChunk(cx, cy);
if (!chunk.isValid()) continue;
TerrainChunkGPU gpuChunk = uploadChunk(chunk);
if (!gpuChunk.isValid()) continue;
calculateBoundingSphere(gpuChunk, chunk);
if (!chunk.layers.empty()) {
uint32_t baseTexId = chunk.layers[0].textureId;
if (baseTexId < texturePaths.size()) {
gpuChunk.baseTexture = loadTexture(texturePaths[baseTexId]);
} else {
gpuChunk.baseTexture = whiteTexture.get();
}
for (size_t i = 1; i < chunk.layers.size() && i < 4; i++) {
const auto& layer = chunk.layers[i];
int li = static_cast<int>(i) - 1;
VkTexture* layerTex = whiteTexture.get();
if (layer.textureId < texturePaths.size()) {
layerTex = loadTexture(texturePaths[layer.textureId]);
}
gpuChunk.layerTextures[li] = layerTex;
VkTexture* alphaTex = opaqueAlphaTexture.get();
if (!layer.alphaData.empty()) {
alphaTex = createAlphaTexture(layer.alphaData);
}
gpuChunk.alphaTextures[li] = alphaTex;
gpuChunk.layerCount = static_cast<int>(i);
}
} else {
gpuChunk.baseTexture = whiteTexture.get();
}
gpuChunk.tileX = tileX;
gpuChunk.tileY = tileY;
TerrainParamsUBO params{};
params.layerCount = gpuChunk.layerCount;
params.hasLayer1 = gpuChunk.layerCount >= 1 ? 1 : 0;
params.hasLayer2 = gpuChunk.layerCount >= 2 ? 1 : 0;
params.hasLayer3 = gpuChunk.layerCount >= 3 ? 1 : 0;
VkBufferCreateInfo bufCI{};
bufCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
bufCI.size = sizeof(TerrainParamsUBO);
bufCI.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
VmaAllocationCreateInfo allocCI{};
allocCI.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
allocCI.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
VmaAllocationInfo mapInfo{};
vmaCreateBuffer(vkCtx->getAllocator(), &bufCI, &allocCI,
&gpuChunk.paramsUBO, &gpuChunk.paramsAlloc, &mapInfo);
if (mapInfo.pMappedData) {
std::memcpy(mapInfo.pMappedData, &params, sizeof(params));
}
gpuChunk.materialSet = allocateMaterialSet();
if (gpuChunk.materialSet) {
writeMaterialDescriptors(gpuChunk.materialSet, gpuChunk);
}
chunks.push_back(std::move(gpuChunk));
uploaded++;
}
vkCtx->endUploadBatch();
return chunkIndex >= 256;
}
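`loadTerrainIncremental` is a resumable cursor: the caller owns `chunkIndex`, each call processes at most `maxChunksPerCall` of the 16x16 grid, and the return value signals completion. The control flow, stripped of all GPU work:

```cpp
#include <cassert>
#include <vector>

// Resumable processing of a 256-slot grid: the caller keeps `cursor`
// between calls; returns true once every slot has been visited.
inline bool processIncremental(std::vector<int>& processed, int& cursor,
                               int maxPerCall) {
    int done = 0;
    while (cursor < 256 && done < maxPerCall) {
        processed.push_back(cursor);  // stands in for uploadChunk(...)
        cursor++;
        done++;
    }
    return cursor >= 256;  // all chunks visited
}
```

With 32 chunks per call, a full tile finishes in 8 calls, letting `advanceFinalization` yield between calls so the per-frame step budget can interrupt.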
TerrainChunkGPU TerrainRenderer::uploadChunk(const pipeline::ChunkMesh& chunk) {
TerrainChunkGPU gpuChunk;
@ -496,6 +590,9 @@ void TerrainRenderer::uploadPreloadedTextures(
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
return key;
};
// Batch all texture uploads into a single command buffer submission
vkCtx->beginUploadBatch();
for (const auto& [path, blp] : textures) {
std::string key = normalizeKey(path);
if (textureCache.find(key) != textureCache.end()) continue;
@ -515,6 +612,8 @@ void TerrainRenderer::uploadPreloadedTextures(
textureCacheBytes_ += e.approxBytes;
textureCache[key] = std::move(e);
}
vkCtx->endUploadBatch();
}
VkTexture* TerrainRenderer::createAlphaTexture(const std::vector<uint8_t>& alphaData) {


@ -67,6 +67,14 @@ void VkContext::shutdown() {
frame = {};
}
// Clean up any in-flight async upload batches (device already idle)
for (auto& batch : inFlightBatches_) {
// Staging buffers: skip destroy — allocator is about to be torn down
vkDestroyFence(device, batch.fence, nullptr);
// Command buffer freed when pool is destroyed below
}
inFlightBatches_.clear();
if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; }
if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; }
@ -1423,10 +1431,121 @@ void VkContext::endSingleTimeCommands(VkCommandBuffer cmd) {
}
void VkContext::immediateSubmit(std::function<void(VkCommandBuffer cmd)>&& function) {
if (inUploadBatch_) {
// Record into the batch command buffer — no submit, no fence wait
function(batchCmd_);
return;
}
VkCommandBuffer cmd = beginSingleTimeCommands();
function(cmd);
endSingleTimeCommands(cmd);
}
void VkContext::beginUploadBatch() {
uploadBatchDepth_++;
if (inUploadBatch_) return; // already in a batch (nested call)
inUploadBatch_ = true;
batchCmd_ = beginSingleTimeCommands();
}
void VkContext::endUploadBatch() {
if (uploadBatchDepth_ <= 0) return;
uploadBatchDepth_--;
if (uploadBatchDepth_ > 0) return; // still inside an outer batch
inUploadBatch_ = false;
if (batchStagingBuffers_.empty()) {
// No GPU copies were recorded — skip the submit entirely.
vkEndCommandBuffer(batchCmd_);
vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
batchCmd_ = VK_NULL_HANDLE;
return;
}
// Submit commands with a NEW fence — don't wait, let GPU work in parallel.
vkEndCommandBuffer(batchCmd_);
VkFenceCreateInfo fenceInfo{};
fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
VkFence fence = VK_NULL_HANDLE;
vkCreateFence(device, &fenceInfo, nullptr, &fence);
VkSubmitInfo submitInfo{};
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &batchCmd_;
vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence);
// Stash everything for later cleanup when fence signals
InFlightBatch batch;
batch.fence = fence;
batch.cmd = batchCmd_;
batch.stagingBuffers = std::move(batchStagingBuffers_);
inFlightBatches_.push_back(std::move(batch));
batchCmd_ = VK_NULL_HANDLE;
batchStagingBuffers_.clear();
}
void VkContext::endUploadBatchSync() {
if (uploadBatchDepth_ <= 0) return;
uploadBatchDepth_--;
if (uploadBatchDepth_ > 0) return;
inUploadBatch_ = false;
if (batchStagingBuffers_.empty()) {
vkEndCommandBuffer(batchCmd_);
vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
batchCmd_ = VK_NULL_HANDLE;
return;
}
// Synchronous path for load screens — submit and wait
endSingleTimeCommands(batchCmd_);
batchCmd_ = VK_NULL_HANDLE;
for (auto& staging : batchStagingBuffers_) {
destroyBuffer(allocator, staging);
}
batchStagingBuffers_.clear();
}
void VkContext::pollUploadBatches() {
if (inFlightBatches_.empty()) return;
for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ) {
VkResult result = vkGetFenceStatus(device, it->fence);
if (result == VK_SUCCESS) {
// GPU finished — free resources
for (auto& staging : it->stagingBuffers) {
destroyBuffer(allocator, staging);
}
vkFreeCommandBuffers(device, immCommandPool, 1, &it->cmd);
vkDestroyFence(device, it->fence, nullptr);
it = inFlightBatches_.erase(it);
} else {
++it;
}
}
}
void VkContext::waitAllUploads() {
for (auto& batch : inFlightBatches_) {
vkWaitForFences(device, 1, &batch.fence, VK_TRUE, UINT64_MAX);
for (auto& staging : batch.stagingBuffers) {
destroyBuffer(allocator, staging);
}
vkFreeCommandBuffers(device, immCommandPool, 1, &batch.cmd);
vkDestroyFence(device, batch.fence, nullptr);
}
inFlightBatches_.clear();
}
void VkContext::deferStagingCleanup(AllocatedBuffer staging) {
batchStagingBuffers_.push_back(staging);
}
} // namespace rendering
} // namespace wowee
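The `uploadBatchDepth_` counter gives `beginUploadBatch`/`endUploadBatch` nesting semantics: only the outermost begin opens a batch, and only the matching outermost end submits it, so a renderer that batches internally composes with a caller that already opened one. The counting logic in isolation (class name hypothetical):

```cpp
#include <cassert>

// Nested begin/end semantics from VkContext's upload batching: only the
// outermost begin opens the batch, only the matching outermost end submits.
class BatchDepth {
public:
    void begin() {
        depth_++;
        if (depth_ == 1) open_ = true;  // outermost begin opens the batch
    }
    // Returns true when this end call is the one that should submit.
    bool end() {
        if (depth_ <= 0) return false;  // unbalanced end — ignore
        depth_--;
        if (depth_ > 0) return false;   // still inside an outer batch
        open_ = false;
        return true;                    // outermost end: submit now
    }
    bool inBatch() const { return open_; }
private:
    int depth_ = 0;
    bool open_ = false;
};
```

The real `endUploadBatch` additionally skips the `vkQueueSubmit` entirely when no staging buffers were recorded, and `endUploadBatchSync` takes the waiting path for load screens.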


@ -96,7 +96,11 @@ bool VkTexture::upload(VkContext& ctx, const uint8_t* pixels, uint32_t width, ui
generateMipmaps(ctx, format, width, height);
}
if (ctx.isInUploadBatch()) {
ctx.deferStagingCleanup(staging);
} else {
destroyBuffer(ctx.getAllocator(), staging);
}
return true;
}
@ -162,7 +166,11 @@ bool VkTexture::uploadMips(VkContext& ctx, const uint8_t* const* mipData,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
});
if (ctx.isInUploadBatch()) {
ctx.deferStagingCleanup(staging);
} else {
destroyBuffer(ctx.getAllocator(), staging);
}
return true;
}


@ -198,8 +198,12 @@ AllocatedBuffer uploadBuffer(VkContext& ctx, const void* data, VkDeviceSize size
vkCmdCopyBuffer(cmd, staging.buffer, gpuBuffer.buffer, 1, &copyRegion);
});
// Destroy staging buffer
// Destroy staging buffer (deferred if in batch mode)
if (ctx.isInUploadBatch()) {
ctx.deferStagingCleanup(staging);
} else {
destroyBuffer(ctx.getAllocator(), staging);
}
return gpuBuffer;
}
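`VkTexture::upload`, `uploadMips`, and `uploadBuffer` all now route staging buffers through the same branch: destroy immediately outside a batch, defer inside one. The reason is lifetime: inside a batch the copy has only been *recorded*, not executed, so the staging memory must survive until the batch's fence signals. The branch as a sketch (integer handles stand in for `AllocatedBuffer`):

```cpp
#include <cassert>
#include <vector>

// Staging-buffer lifetime: inside a batch the GPU copy is only recorded,
// so staging must outlive the submit; outside a batch the copy already
// completed on return, so it can be freed immediately.
struct StagingPool {
    bool inBatch = false;
    std::vector<int> deferred;   // handles awaiting fence signal
    int destroyedNow = 0;        // counts immediate destroys

    void release(int staging) {
        if (inBatch) deferred.push_back(staging);  // freed on fence signal
        else destroyedNow++;                       // safe: copy already done
    }
};
```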


@ -419,6 +419,10 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) {
core::Logger::getInstance().debug(" WMO bounds: min=(", model.boundingBoxMin.x, ", ", model.boundingBoxMin.y, ", ", model.boundingBoxMin.z,
") max=(", model.boundingBoxMax.x, ", ", model.boundingBoxMax.y, ", ", model.boundingBoxMax.z, ")");
// Batch all GPU uploads (textures, VBs, IBs) into a single command buffer
// submission with one fence wait, instead of one per upload.
vkCtx_->beginUploadBatch();
// Load textures for this model
core::Logger::getInstance().debug(" WMO has ", model.textures.size(), " texture paths, ", model.materials.size(), " materials");
if (assetManager && !model.textures.empty()) {
@ -720,6 +724,8 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) {
groupRes.allUntextured = !anyTextured && !groupRes.mergedBatches.empty();
}
vkCtx_->endUploadBatch();
// Copy portal data for visibility culling
modelData.portalVertices = model.portalVertices;
for (const auto& portal : model.portals) {
@ -2319,8 +2325,21 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
const auto& attemptedCandidates = uniqueCandidates;
// Try loading all candidates until one succeeds
// Check pre-decoded BLP cache first (populated by background worker threads)
pipeline::BLPImage blp;
std::string resolvedKey;
if (predecodedBLPCache_) {
for (const auto& c : uniqueCandidates) {
auto pit = predecodedBLPCache_->find(c);
if (pit != predecodedBLPCache_->end()) {
blp = std::move(pit->second);
predecodedBLPCache_->erase(pit);
resolvedKey = c;
break;
}
}
}
if (!blp.isValid()) {
for (const auto& c : attemptedCandidates) {
blp = assetManager->loadTexture(c);
if (blp.isValid()) {
@ -2328,6 +2347,7 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
break;
}
}
}
if (!blp.isValid()) {
if (loggedTextureLoadFails_.insert(key).second) {
core::Logger::getInstance().warning("WMO: Failed to load texture: ", path);
@ -2363,10 +2383,10 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
texture->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
VK_SAMPLER_ADDRESS_MODE_REPEAT);
// Generate normal+height map from diffuse pixels
// Generate normal+height map from diffuse pixels (skip during streaming to avoid CPU stalls)
float nhVariance = 0.0f;
std::unique_ptr<VkTexture> nhMap;
if (normalMappingEnabled_ || pomEnabled_) {
if ((normalMappingEnabled_ || pomEnabled_) && !deferNormalMaps_) {
nhMap = generateNormalHeightMap(blp.data.data(), blp.width, blp.height, nhVariance);
if (nhMap) {
approxBytes *= 2; // account for normal map in budget
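Per the commit message, textures skipped by `deferNormalMaps_` get their normal/height maps generated later under a per-frame budget (2/frame in-game, 10/frame on load screens). That drain loop is not part of this diff; a hedged sketch of what such a budgeted queue could look like (all names are assumptions, not the project's API):

```cpp
#include <cassert>
#include <cstddef>
#include <deque>
#include <functional>

// Budgeted deferred-work queue: enqueue jobs (e.g. normal-map generation
// skipped during streaming) and drain at most `budget` of them per frame.
class DeferredWorkQueue {
public:
    void enqueue(std::function<void()> job) { jobs_.push_back(std::move(job)); }

    // Runs up to `budget` jobs; returns how many actually ran this frame.
    size_t drain(size_t budget) {
        size_t ran = 0;
        while (!jobs_.empty() && ran < budget) {
            jobs_.front()();
            jobs_.pop_front();
            ran++;
        }
        return ran;
    }
    size_t pending() const { return jobs_.size(); }

private:
    std::deque<std::function<void()>> jobs_;
};
```

Models render without POM until their queued job runs, matching the behavior described in the head commit.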