From 7ac990cff43028e26f6674b90950286f7817131b Mon Sep 17 00:00:00 2001 From: Kelsi Date: Sat, 7 Mar 2026 15:46:56 -0800 Subject: [PATCH] Background BLP texture pre-decoding + deferred WMO normal maps (12x streaming perf) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move CPU-heavy BLP texture decoding from main thread to background worker threads for all hot paths: terrain M2 models, WMO doodad M2s, WMO textures, creature models, and gameobject WMOs. Each renderer (M2, WMO, Character) now accepts a pre-decoded BLP cache that loadTexture() checks before falling back to synchronous decode. Defer WMO normal/height map generation (3 per-pixel passes: luminance, box blur, Sobel) during terrain streaming finalization — this was the dominant remaining bottleneck after BLP pre-decoding. Terrain streaming stalls: 1576ms → 124ms worst case. --- include/core/application.hpp | 22 ++- include/rendering/character_renderer.hpp | 6 + include/rendering/m2_renderer.hpp | 8 + include/rendering/terrain_manager.hpp | 6 + include/rendering/vk_context.hpp | 13 +- include/rendering/wmo_renderer.hpp | 9 + src/core/application.cpp | 231 +++++++++++++++++++++-- src/rendering/character_renderer.cpp | 52 ++--- src/rendering/m2_renderer.cpp | 93 +++++---- src/rendering/renderer.cpp | 10 + src/rendering/terrain_manager.cpp | 115 ++++++++--- src/rendering/vk_context.cpp | 89 ++++++++- src/rendering/wmo_renderer.cpp | 28 ++- 13 files changed, 573 insertions(+), 109 deletions(-) diff --git a/include/core/application.hpp b/include/core/application.hpp index a23e6bd8..c97bfaf6 100644 --- a/include/core/application.hpp +++ b/include/core/application.hpp @@ -3,6 +3,7 @@ #include "core/window.hpp" #include "core/input.hpp" #include "game/character.hpp" +#include "pipeline/blp_loader.hpp" #include #include #include @@ -23,7 +24,7 @@ namespace rendering { class Renderer; } namespace ui { class UIManager; } namespace auth { class AuthHandler; } namespace game { class GameHandler; class World; class ExpansionRegistry; } -namespace pipeline { class AssetManager; class DBCLayout; struct M2Model; } +namespace pipeline { class AssetManager; class DBCLayout; struct M2Model; struct WMOModel; } namespace audio { enum class VoiceType; } namespace core { @@ -206,6 +207,7 @@ private: uint32_t modelId; float x, y, z, orientation; std::shared_ptr model; // parsed on background thread + std::unordered_map predecodedTextures; // decoded on bg thread bool valid = false; bool permanent_failure = false; }; @@ -337,6 +339,24 @@ private: }; std::vector pendingGameObjectSpawns_; void processGameObjectSpawnQueue(); + + // Async WMO loading for game objects (file I/O + parse on background thread) + struct PreparedGameObjectWMO { + uint64_t guid; + uint32_t entry; + uint32_t displayId; + float x, y, z, orientation; + std::shared_ptr wmoModel; + std::unordered_map predecodedTextures; // decoded on bg thread + bool valid = false; + bool isWmo = false; + std::string modelPath; + }; + struct AsyncGameObjectLoad { + std::future future; + }; + std::vector asyncGameObjectLoads_; + void processAsyncGameObjectResults(); struct PendingTransportDoodadBatch { uint64_t guid = 0; uint32_t modelId = 0; diff --git a/include/rendering/character_renderer.hpp b/include/rendering/character_renderer.hpp index 52813cf4..c7cae0d7 100644 --- a/include/rendering/character_renderer.hpp +++ b/include/rendering/character_renderer.hpp @@ -1,6 +1,7 @@ #pragma once #include "pipeline/m2_loader.hpp" +#include "pipeline/blp_loader.hpp" #include #include #include @@ -114,7 +115,11 @@ public: void setShadowMap(VkTexture*, const glm::mat4&) {} void clearShadowMap() {} + // Pre-decoded BLP cache: set before calling loadModel() to skip main-thread BLP decode + void setPredecodedBLPCache(std::unordered_map* cache) { predecodedBLPCache_ = cache; } + private: + std::unordered_map* predecodedBLPCache_ = nullptr; // GPU representation of M2 model struct M2ModelGPU { VkBuffer vertexBuffer = VK_NULL_HANDLE; @@ -180,6 +185,7 @@ private: // Bone update throttling (skip frames for distant characters) uint32_t boneUpdateCounter = 0; + const M2ModelGPU* cachedModel = nullptr; // Avoid per-frame hash lookups // Per-instance bone SSBO (double-buffered per frame) VkBuffer boneBuffer[2] = {}; diff --git a/include/rendering/m2_renderer.hpp b/include/rendering/m2_renderer.hpp index 91616a28..1c35e34b 100644 --- a/include/rendering/m2_renderer.hpp +++ b/include/rendering/m2_renderer.hpp @@ -1,6 +1,7 @@ #pragma once #include "pipeline/m2_loader.hpp" +#include "pipeline/blp_loader.hpp" #include #include #include @@ -188,6 +189,7 @@ struct M2Instance { bool skipCollision = false; // WMO interior doodads — skip player wall collision float cachedBoundRadius = 0.0f; float portalSpinAngle = 0.0f; // Accumulated spin angle for portal rotation + const M2ModelGPU* cachedModel = nullptr; // Avoid per-frame hash lookups // Frame-skip optimization (update distant animations less frequently) uint8_t frameSkipCounter = 0; @@ -328,6 +330,10 @@ public: std::vector getWaterVegetationPositions(const glm::vec3& camPos, float maxDist) const; + // Pre-decoded BLP cache: set by terrain manager before calling loadModel() + // so loadTexture() can skip the expensive assetManager->loadTexture() call. + void setPredecodedBLPCache(std::unordered_map* cache) { predecodedBLPCache_ = cache; } + private: bool initialized_ = false; bool insideInterior = false; @@ -414,6 +420,8 @@ private: uint32_t modelLimitRejectWarnings_ = 0; VkTexture* loadTexture(const std::string& path, uint32_t texFlags = 0); + std::unordered_map* predecodedBLPCache_ = nullptr; + struct TextureCacheEntry { std::unique_ptr texture; size_t approxBytes = 0; diff --git a/include/rendering/terrain_manager.hpp b/include/rendering/terrain_manager.hpp index 1b2af320..6f732721 100644 --- a/include/rendering/terrain_manager.hpp +++ b/include/rendering/terrain_manager.hpp @@ -121,6 +121,12 @@ struct PendingTile { // Pre-loaded terrain texture BLP data (loaded on background thread to avoid // blocking file I/O on the main thread during finalizeTile) std::unordered_map preloadedTextures; + + // Pre-decoded M2 model textures (decoded on background thread) + std::unordered_map preloadedM2Textures; + + // Pre-decoded WMO textures (decoded on background thread) + std::unordered_map preloadedWMOTextures; }; /** diff --git a/include/rendering/vk_context.hpp b/include/rendering/vk_context.hpp index dab96d2a..907e21bf 100644 --- a/include/rendering/vk_context.hpp +++ b/include/rendering/vk_context.hpp @@ -50,9 +50,12 @@ public: // Batch upload mode: records multiple upload commands into a single // command buffer, then submits with ONE fence wait instead of one per upload. void beginUploadBatch(); - void endUploadBatch(); + void endUploadBatch(); // Async: submits but does NOT wait for fence + void endUploadBatchSync(); // Sync: submits and waits (for load screens) bool isInUploadBatch() const { return inUploadBatch_; } void deferStagingCleanup(AllocatedBuffer staging); + void pollUploadBatches(); // Check completed async uploads, free staging buffers + void waitAllUploads(); // Block until all in-flight uploads complete // Accessors VkInstance getInstance() const { return instance; } @@ -157,6 +160,14 @@ private: VkCommandBuffer batchCmd_ = VK_NULL_HANDLE; std::vector batchStagingBuffers_; + // Async upload: in-flight batches awaiting GPU completion + struct InFlightBatch { + VkFence fence = VK_NULL_HANDLE; + VkCommandBuffer cmd = VK_NULL_HANDLE; + std::vector stagingBuffers; + }; + std::vector inFlightBatches_; + // Depth buffer (shared across all framebuffers) VkImage depthImage = VK_NULL_HANDLE; VkImageView depthImageView = VK_NULL_HANDLE; diff --git a/include/rendering/wmo_renderer.hpp b/include/rendering/wmo_renderer.hpp index 095a354d..f0d3b36f 100644 --- a/include/rendering/wmo_renderer.hpp +++ b/include/rendering/wmo_renderer.hpp @@ -1,5 +1,6 @@ #pragma once +#include "pipeline/blp_loader.hpp" #include #include #include @@ -325,6 +326,12 @@ public: // Pre-compute floor cache for all loaded WMO instances void precomputeFloorCache(); + // Pre-decoded BLP cache: set before calling loadModel() to skip main-thread BLP decode + void setPredecodedBLPCache(std::unordered_map* cache) { predecodedBLPCache_ = cache; } + + // Defer normal/height map generation during streaming to avoid CPU stalls + void setDeferNormalMaps(bool defer) { deferNormalMaps_ = defer; } + private: // WMO material UBO — matches WMOMaterial in wmo.frag.glsl struct WMOMaterialUBO { @@ -558,6 +565,7 @@ private: * Load a texture from path */ VkTexture* loadTexture(const std::string& path); + std::unordered_map* predecodedBLPCache_ = nullptr; /** * Generate normal+height map from diffuse RGBA8 pixels @@ -670,6 +678,7 @@ private: // Normal mapping / POM settings bool normalMappingEnabled_ = true; // on by default + bool deferNormalMaps_ = false; // skip normal map gen during streaming float normalMapStrength_ = 0.8f; // 0.0 = flat, 1.0 = full, 2.0 = exaggerated bool pomEnabled_ = true; // on by default int pomQuality_ = 1; // 0=Low(16), 1=Medium(32), 2=High(64) diff --git a/src/core/application.cpp b/src/core/application.cpp index f0c22a2c..f4712613 100644 --- a/src/core/application.cpp +++ b/src/core/application.cpp @@ -6883,7 +6883,7 @@ void Application::spawnOnlineGameObject(uint64_t guid, uint32_t entry, uint32_t void Application::processAsyncCreatureResults() { // Check completed async model loads and finalize on main thread (GPU upload + instance creation). // Limit GPU model uploads per frame to avoid spikes, but always drain cheap bookkeeping. - static constexpr int kMaxModelUploadsPerFrame = 3; + static constexpr int kMaxModelUploadsPerFrame = 1; int modelUploads = 0; for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) { @@ -6925,13 +6925,17 @@ void Application::processAsyncCreatureResults() { } // Upload model to GPU (must happen on main thread) + // Use pre-decoded BLP cache to skip main-thread texture decode + charRenderer->setPredecodedBLPCache(&result.predecodedTextures); if (!charRenderer->loadModel(*result.model, result.modelId)) { + charRenderer->setPredecodedBLPCache(nullptr); nonRenderableCreatureDisplayIds_.insert(result.displayId); creaturePermanentFailureGuids_.insert(result.guid); pendingCreatureSpawnGuids_.erase(result.guid); creatureSpawnRetryCounts_.erase(result.guid); continue; } + charRenderer->setPredecodedBLPCache(nullptr); displayIdModelCache_[result.displayId] = result.modelId; modelUploads++; @@ -6956,6 +6960,10 @@ void Application::processAsyncCreatureResults() { } void Application::processCreatureSpawnQueue() { + auto startTime = std::chrono::steady_clock::now(); + // Budget: max 2ms per frame for creature spawning to prevent stutter. + static constexpr float kSpawnBudgetMs = 2.0f; + // First, finalize any async model loads that completed on background threads. processAsyncCreatureResults(); @@ -6965,18 +6973,15 @@ void Application::processCreatureSpawnQueue() { if (!creatureLookupsBuilt_) return; } - auto startTime = std::chrono::steady_clock::now(); - // Budget: max 4ms per frame for creature spawning to prevent stutter. - static constexpr float kSpawnBudgetMs = 4.0f; - int processed = 0; int asyncLaunched = 0; size_t rotationsLeft = pendingCreatureSpawns_.size(); while (!pendingCreatureSpawns_.empty() && processed < MAX_SPAWNS_PER_FRAME && rotationsLeft > 0) { - // Check time budget after each spawn (not for the first one, always process at least 1) - if (processed > 0) { + // Check time budget every iteration (including first — async results may + // have already consumed the budget via GPU model uploads). + { auto now = std::chrono::steady_clock::now(); float elapsedMs = std::chrono::duration(now - startTime).count(); if (elapsedMs >= kSpawnBudgetMs) break; @@ -7081,6 +7086,20 @@ void Application::processCreatureSpawnQueue() { } } + // Pre-decode model textures on background thread + for (const auto& tex : model->textures) { + if (tex.filename.empty()) continue; + std::string texKey = tex.filename; + std::replace(texKey.begin(), texKey.end(), '/', '\\'); + std::transform(texKey.begin(), texKey.end(), texKey.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + if (result.predecodedTextures.find(texKey) != result.predecodedTextures.end()) continue; + auto blp = am->loadTexture(texKey); + if (blp.isValid()) { + result.predecodedTextures[texKey] = std::move(blp); + } + } + result.model = std::move(model); result.valid = true; return result; @@ -7161,14 +7180,202 @@ void Application::processDeferredEquipmentQueue() { setOnlinePlayerEquipment(guid, equipData.first, equipData.second); } +void Application::processAsyncGameObjectResults() { + for (auto it = asyncGameObjectLoads_.begin(); it != asyncGameObjectLoads_.end(); ) { + if (!it->future.valid() || + it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) { + ++it; + continue; + } + + auto result = it->future.get(); + it = asyncGameObjectLoads_.erase(it); + + if (!result.valid || !result.isWmo || !result.wmoModel) { + // Fallback: spawn via sync path (likely an M2 or failed WMO) + spawnOnlineGameObject(result.guid, result.entry, result.displayId, + result.x, result.y, result.z, result.orientation); + continue; + } + + // WMO parsed on background thread — do GPU upload + instance creation on main thread + auto* wmoRenderer = renderer ? renderer->getWMORenderer() : nullptr; + if (!wmoRenderer) continue; + + uint32_t modelId = 0; + auto itCache = gameObjectDisplayIdWmoCache_.find(result.displayId); + if (itCache != gameObjectDisplayIdWmoCache_.end()) { + modelId = itCache->second; + } else { + modelId = nextGameObjectWmoModelId_++; + wmoRenderer->setPredecodedBLPCache(&result.predecodedTextures); + if (!wmoRenderer->loadModel(*result.wmoModel, modelId)) { + wmoRenderer->setPredecodedBLPCache(nullptr); + LOG_WARNING("Failed to load async gameobject WMO: ", result.modelPath); + continue; + } + wmoRenderer->setPredecodedBLPCache(nullptr); + gameObjectDisplayIdWmoCache_[result.displayId] = modelId; + } + + glm::vec3 renderPos = core::coords::canonicalToRender( + glm::vec3(result.x, result.y, result.z)); + uint32_t instanceId = wmoRenderer->createInstance( + modelId, renderPos, glm::vec3(0.0f, 0.0f, result.orientation), 1.0f); + if (instanceId == 0) continue; + + gameObjectInstances_[result.guid] = {modelId, instanceId, true}; + + // Queue transport doodad loading if applicable + std::string lowerPath = result.modelPath; + std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + if (lowerPath.find("transport") != std::string::npos) { + const auto* doodadTemplates = wmoRenderer->getDoodadTemplates(modelId); + if (doodadTemplates && !doodadTemplates->empty()) { + PendingTransportDoodadBatch batch; + batch.guid = result.guid; + batch.modelId = modelId; + batch.instanceId = instanceId; + batch.x = result.x; + batch.y = result.y; + batch.z = result.z; + batch.orientation = result.orientation; + batch.doodadBudget = doodadTemplates->size(); + pendingTransportDoodadBatches_.push_back(batch); + } + } + } +} + void Application::processGameObjectSpawnQueue() { + // Finalize any completed async WMO loads first + processAsyncGameObjectResults(); + if (pendingGameObjectSpawns_.empty()) return; - // Only spawn 1 game object per frame — each can involve heavy synchronous - // WMO loading (root + groups from disk + GPU upload), easily 100ms+. - auto& s = pendingGameObjectSpawns_.front(); - spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation); - pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin()); + // Process spawns: cached WMOs and M2s go sync (cheap), uncached WMOs go async + auto startTime = std::chrono::steady_clock::now(); + static constexpr float kBudgetMs = 2.0f; + static constexpr int kMaxAsyncLoads = 2; + + while (!pendingGameObjectSpawns_.empty()) { + float elapsedMs = std::chrono::duration( + std::chrono::steady_clock::now() - startTime).count(); + if (elapsedMs >= kBudgetMs) break; + + auto& s = pendingGameObjectSpawns_.front(); + + // Check if this is an uncached WMO that needs async loading + std::string modelPath; + if (gameObjectLookupsBuilt_) { + // Check transport overrides first + bool isTransport = gameHandler && gameHandler->isTransportGuid(s.guid); + if (isTransport) { + if (s.entry == 20808 || s.entry == 176231 || s.entry == 176310) + modelPath = "World\\wmo\\transports\\transport_ship\\transportship.wmo"; + else if (s.displayId == 807 || s.displayId == 808 || s.displayId == 175080 || s.displayId == 176495 || s.displayId == 164871) + modelPath = "World\\wmo\\transports\\transport_zeppelin\\transport_zeppelin.wmo"; + else if (s.displayId == 1587) + modelPath = "World\\wmo\\transports\\transport_horde_zeppelin\\Transport_Horde_Zeppelin.wmo"; + else if (s.displayId == 2454 || s.displayId == 181688 || s.displayId == 190536) + modelPath = "World\\wmo\\transports\\icebreaker\\Transport_Icebreaker_ship.wmo"; + } + if (modelPath.empty()) + modelPath = getGameObjectModelPathForDisplayId(s.displayId); + } + + std::string lowerPath = modelPath; + std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + bool isWmo = lowerPath.size() >= 4 && lowerPath.substr(lowerPath.size() - 4) == ".wmo"; + bool isCached = isWmo && gameObjectDisplayIdWmoCache_.count(s.displayId); + + if (isWmo && !isCached && !modelPath.empty() && + static_cast(asyncGameObjectLoads_.size()) < kMaxAsyncLoads) { + // Launch async WMO load — file I/O + parse on background thread + auto* am = assetManager.get(); + PendingGameObjectSpawn capture = s; + std::string capturePath = modelPath; + AsyncGameObjectLoad load; + load.future = std::async(std::launch::async, + [am, capture, capturePath]() -> PreparedGameObjectWMO { + PreparedGameObjectWMO result; + result.guid = capture.guid; + result.entry = capture.entry; + result.displayId = capture.displayId; + result.x = capture.x; + result.y = capture.y; + result.z = capture.z; + result.orientation = capture.orientation; + result.modelPath = capturePath; + result.isWmo = true; + + auto wmoData = am->readFile(capturePath); + if (wmoData.empty()) return result; + + auto wmo = std::make_shared( + pipeline::WMOLoader::load(wmoData)); + + // Load groups + if (wmo->nGroups > 0) { + std::string basePath = capturePath; + std::string ext; + if (basePath.size() > 4) { + ext = basePath.substr(basePath.size() - 4); + basePath = basePath.substr(0, basePath.size() - 4); + } + for (uint32_t gi = 0; gi < wmo->nGroups; gi++) { + char suffix[16]; + snprintf(suffix, sizeof(suffix), "_%03u%s", gi, ext.c_str()); + auto groupData = am->readFile(basePath + suffix); + if (groupData.empty()) { + snprintf(suffix, sizeof(suffix), "_%03u.wmo", gi); + groupData = am->readFile(basePath + suffix); + } + if (!groupData.empty()) { + pipeline::WMOLoader::loadGroup(groupData, *wmo, gi); + } + } + } + + // Pre-decode WMO textures on background thread + for (const auto& texPath : wmo->textures) { + if (texPath.empty()) continue; + std::string texKey = texPath; + size_t nul = texKey.find('\0'); + if (nul != std::string::npos) texKey.resize(nul); + std::replace(texKey.begin(), texKey.end(), '/', '\\'); + std::transform(texKey.begin(), texKey.end(), texKey.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + if (texKey.empty()) continue; + // Convert to .blp extension + if (texKey.size() >= 4) { + std::string ext = texKey.substr(texKey.size() - 4); + if (ext == ".tga" || ext == ".dds") { + texKey = texKey.substr(0, texKey.size() - 4) + ".blp"; + } + } + if (result.predecodedTextures.find(texKey) != result.predecodedTextures.end()) continue; + auto blp = am->loadTexture(texKey); + if (blp.isValid()) { + result.predecodedTextures[texKey] = std::move(blp); + } + } + + result.wmoModel = wmo; + result.valid = true; + return result; + }); + asyncGameObjectLoads_.push_back(std::move(load)); + pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin()); + continue; + } + + // Cached WMO or M2 — spawn synchronously (cheap) + spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation); + pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin()); + } } void Application::processPendingTransportDoodads() { diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp index f735dd7d..040a301d 100644 --- a/src/rendering/character_renderer.cpp +++ b/src/rendering/character_renderer.cpp @@ -625,7 +625,18 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) { return whiteTexture_.get(); } - auto blpImage = assetManager->loadTexture(key); + // Check pre-decoded BLP cache first (populated by background threads) + pipeline::BLPImage blpImage; + if (predecodedBLPCache_) { + auto pit = predecodedBLPCache_->find(key); + if (pit != predecodedBLPCache_->end()) { + blpImage = std::move(pit->second); + predecodedBLPCache_->erase(pit); + } + } + if (!blpImage.isValid()) { + blpImage = assetManager->loadTexture(key); + } if (!blpImage.isValid()) { // Return white fallback but don't cache the failure — allow retry // on next character load in case the asset becomes available. @@ -1412,8 +1423,9 @@ uint32_t CharacterRenderer::createInstance(uint32_t modelId, const glm::vec3& po instance.scale = scale; // Initialize bone matrices to identity - auto& model = models[modelId].data; - instance.boneMatrices.resize(std::max(static_cast(1), model.bones.size()), glm::mat4(1.0f)); + auto& gpuRef = models[modelId]; + instance.boneMatrices.resize(std::max(static_cast(1), gpuRef.data.bones.size()), glm::mat4(1.0f)); + instance.cachedModel = &gpuRef; uint32_t id = instance.id; instances[id] = std::move(instance); @@ -1511,13 +1523,12 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) { if (distSq >= animUpdateRadiusSq) continue; // Always advance animation time (cheap) - auto modelIt = models.find(inst.modelId); - if (modelIt != models.end() && !modelIt->second.data.sequences.empty()) { + if (inst.cachedModel && !inst.cachedModel->data.sequences.empty()) { if (inst.currentSequenceIndex < 0) { inst.currentSequenceIndex = 0; - inst.currentAnimationId = modelIt->second.data.sequences[0].id; + inst.currentAnimationId = inst.cachedModel->data.sequences[0].id; } - const auto& seq = modelIt->second.data.sequences[inst.currentSequenceIndex]; + const auto& seq = inst.cachedModel->data.sequences[inst.currentSequenceIndex]; inst.animationTime += deltaTime * 1000.0f; if (seq.duration > 0 && inst.animationTime >= static_cast(seq.duration)) { if (inst.animationLoop) { @@ -1528,10 +1539,11 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) { } } - // Distance-tiered bone throttling: near=every frame, mid=every 3rd, far=every 6th + // Distance-tiered bone throttling: near=every frame, mid=every 4th, far=every 8th uint32_t boneInterval = 1; - if (distSq > 60.0f * 60.0f) boneInterval = 6; - else if (distSq > 30.0f * 30.0f) boneInterval = 3; + if (distSq > 40.0f * 40.0f) boneInterval = 8; + else if (distSq > 20.0f * 20.0f) boneInterval = 4; + else if (distSq > 10.0f * 10.0f) boneInterval = 2; inst.boneUpdateCounter++; bool needsBones = (inst.boneUpdateCounter >= boneInterval) || inst.boneMatrices.empty(); @@ -1615,11 +1627,8 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) { } void CharacterRenderer::updateAnimation(CharacterInstance& instance, float deltaTime) { - auto modelIt = models.find(instance.modelId); - if (modelIt == models.end()) { - return; - } - const auto& model = modelIt->second.data; + if (!instance.cachedModel) return; + const auto& model = instance.cachedModel->data; if (model.sequences.empty()) { return; @@ -1732,7 +1741,8 @@ glm::quat CharacterRenderer::interpolateQuat(const pipeline::M2AnimationTrack& t // --- Bone transform calculation --- void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) { - auto& model = models[instance.modelId].data; + if (!instance.cachedModel) return; + auto& model = instance.cachedModel->data; if (model.bones.empty()) { return; @@ -1833,9 +1843,8 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, } } - auto modelIt = models.find(instance.modelId); - if (modelIt == models.end()) continue; - const auto& gpuModel = modelIt->second; + if (!instance.cachedModel) continue; + const auto& gpuModel = *instance.cachedModel; // Skip models without GPU buffers if (!gpuModel.vertexBuffer) continue; @@ -2487,9 +2496,8 @@ void CharacterRenderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& light glm::vec3 diff = inst.position - shadowCenter; if (glm::dot(diff, diff) > shadowRadiusSq) continue; - auto modelIt = models.find(inst.modelId); - if (modelIt == models.end()) continue; - const M2ModelGPU& gpuModel = modelIt->second; + if (!inst.cachedModel) continue; + const M2ModelGPU& gpuModel = *inst.cachedModel; if (!gpuModel.vertexBuffer) continue; glm::mat4 modelMat = inst.hasOverrideModelMatrix diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp index c4e7a727..d455e494 100644 --- a/src/rendering/m2_renderer.cpp +++ b/src/rendering/m2_renderer.cpp @@ -1657,6 +1657,7 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position, instance.cachedIsInvisibleTrap = mdlRef.isInvisibleTrap; instance.cachedIsInstancePortal = mdlRef.isInstancePortal; instance.cachedIsValid = mdlRef.isValid(); + instance.cachedModel = &mdlRef; // Initialize animation: play first sequence (usually Stand/Idle) const auto& mdl = mdlRef; @@ -1748,6 +1749,7 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4& instance.cachedIsGroundDetail = mdl2.isGroundDetail; instance.cachedIsInvisibleTrap = mdl2.isInvisibleTrap; instance.cachedIsValid = mdl2.isValid(); + instance.cachedModel = &mdl2; // Initialize animation if (mdl2.hasAnimation && !mdl2.disableAnimation && !mdl2.sequences.empty()) { @@ -2026,9 +2028,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm:: instance.animTime += dtMs * (instance.animSpeed - 1.0f); // For animation looping/variation, we need the actual model data. - auto it = models.find(instance.modelId); - if (it == models.end()) continue; - const M2ModelGPU& model = it->second; + if (!instance.cachedModel) continue; + const M2ModelGPU& model = *instance.cachedModel; // Validate sequence index if (instance.currentSequenceIndex < 0 || @@ -2084,6 +2085,14 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm:: float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f); if (cullRadius > 0.0f && !updateFrustum.intersectsSphere(instance.position, paddedRadius)) continue; + // Distance-based frame skipping: update distant bones less frequently + uint32_t boneInterval = 1; + if (distSq > 200.0f * 200.0f) boneInterval = 8; + else if (distSq > 100.0f * 100.0f) boneInterval = 4; + else if (distSq > 50.0f * 50.0f) boneInterval = 2; + instance.frameSkipCounter++; + if ((instance.frameSkipCounter % boneInterval) != 0) continue; + boneWorkIndices_.push_back(idx); } @@ -2097,9 +2106,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm:: for (size_t i : boneWorkIndices_) { if (i >= instances.size()) continue; auto& inst = instances[i]; - auto mdlIt = models.find(inst.modelId); - if (mdlIt == models.end()) continue; - computeBoneMatrices(mdlIt->second, inst); + if (!inst.cachedModel) continue; + computeBoneMatrices(*inst.cachedModel, inst); } } else { // Parallel — dispatch across worker threads @@ -2112,9 +2120,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm:: for (size_t i : boneWorkIndices_) { if (i >= instances.size()) continue; auto& inst = instances[i]; - auto mdlIt = models.find(inst.modelId); - if (mdlIt == models.end()) continue; - computeBoneMatrices(mdlIt->second, inst); + if (!inst.cachedModel) continue; + computeBoneMatrices(*inst.cachedModel, inst); } } else { const size_t chunkSize = animCount / numThreads; @@ -2135,9 +2142,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm:: size_t idx = boneWorkIndices_[j]; if (idx >= instances.size()) continue; auto& inst = instances[idx]; - auto mdlIt = models.find(inst.modelId); - if (mdlIt == models.end()) continue; - computeBoneMatrices(mdlIt->second, inst); + if (!inst.cachedModel) continue; + computeBoneMatrices(*inst.cachedModel, inst); } })); start = end; @@ -2159,9 +2165,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm:: glm::vec3 toCam = instance.position - cachedCamPos_; float distSq = glm::dot(toCam, toCam); if (distSq > cachedMaxRenderDistSq_) continue; - auto mdlIt = models.find(instance.modelId); - if (mdlIt == models.end()) continue; - emitParticles(instance, mdlIt->second, deltaTime); + if (!instance.cachedModel) continue; + emitParticles(instance, *instance.cachedModel, deltaTime); updateParticles(instance, deltaTime); } @@ -2865,9 +2870,8 @@ void M2Renderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& lightSpaceMa glm::vec3 diff = instance.position - shadowCenter; if (glm::dot(diff, diff) > shadowRadiusSq) continue; - auto modelIt = models.find(instance.modelId); - if (modelIt == models.end()) continue; - const M2ModelGPU& model = modelIt->second; + if (!instance.cachedModel) continue; + const M2ModelGPU& model = *instance.cachedModel; // Filter: only draw foliage models in foliage pass, non-foliage in non-foliage pass if (model.shadowWindFoliage != foliagePass) continue; @@ -2973,8 +2977,7 @@ std::vector M2Renderer::getWaterVegetationPositions(const glm::vec3& std::vector result; float maxDistSq = maxDist * maxDist; for (const auto& inst : instances) { - auto it = models.find(inst.modelId); - if (it == models.end() || !it->second.isWaterVegetation) continue; + if (!inst.cachedModel || !inst.cachedModel->isWaterVegetation) continue; glm::vec3 diff = inst.position - camPos; if (glm::dot(diff, diff) <= maxDistSq) { result.push_back(inst.position); @@ -3085,9 +3088,8 @@ void M2Renderer::emitParticles(M2Instance& inst, const M2ModelGPU& gpu, float dt } void M2Renderer::updateParticles(M2Instance& inst, float dt) { - auto it = models.find(inst.modelId); - if (it == models.end()) return; - const auto& gpu = it->second; + if (!inst.cachedModel) return; + const auto& gpu = *inst.cachedModel; for (size_t i = 0; i < inst.particles.size(); ) { auto& p = inst.particles[i]; @@ -3162,9 +3164,8 @@ void M2Renderer::renderM2Particles(VkCommandBuffer cmd, VkDescriptorSet perFrame for (auto& inst : instances) { if (inst.particles.empty()) continue; - auto it = models.find(inst.modelId); - if (it == models.end()) continue; - const auto& gpu = it->second; + if (!inst.cachedModel) continue; + const auto& gpu = *inst.cachedModel; for (const auto& p : inst.particles) { if (p.emitterIndex < 0 || p.emitterIndex >= static_cast(gpu.particleEmitters.size())) continue; @@ -3549,9 +3550,13 @@ void M2Renderer::rebuildSpatialIndex() { particleInstanceIndices_.clear(); for (size_t i = 0; i < instances.size(); i++) { - const auto& inst = instances[i]; + auto& inst = instances[i]; instanceIndexById[inst.id] = i; + // Re-cache model pointer (may have changed after model map modifications) + auto mdlIt = models.find(inst.modelId); + inst.cachedModel = (mdlIt != models.end()) ? &mdlIt->second : nullptr; + // Rebuild dedup map (skip ground detail) if (!inst.cachedIsGroundDetail) { DedupKey dk{inst.modelId, @@ -3684,8 +3689,18 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) { containsToken(key, "campfire") || containsToken(key, "bonfire"); - // Load BLP texture - pipeline::BLPImage blp = assetManager->loadTexture(key); + // Check pre-decoded BLP cache first (populated by background worker threads) + pipeline::BLPImage blp; + if (predecodedBLPCache_) { + auto pit = predecodedBLPCache_->find(key); + if (pit != predecodedBLPCache_->end()) { + blp = std::move(pit->second); + predecodedBLPCache_->erase(pit); + } + } + if (!blp.isValid()) { + blp = assetManager->loadTexture(key); + } if (!blp.isValid()) { // Return white fallback but don't cache the failure — MPQ reads can // fail transiently during streaming; allow retry on next model load. @@ -3751,9 +3766,8 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) { uint32_t M2Renderer::getTotalTriangleCount() const { uint32_t total = 0; for (const auto& instance : instances) { - auto it = models.find(instance.modelId); - if (it != models.end()) { - total += it->second.indexCount / 3; + if (instance.cachedModel) { + total += instance.cachedModel->indexCount / 3; } } return total; @@ -3775,11 +3789,10 @@ std::optional M2Renderer::getFloorHeight(float glX, float glY, float glZ, continue; } - auto it = models.find(instance.modelId); - if (it == models.end()) continue; + if (!instance.cachedModel) continue; if (instance.scale <= 0.001f) continue; - const M2ModelGPU& model = it->second; + const M2ModelGPU& model = *instance.cachedModel; if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue; if (instance.skipCollision) continue; @@ -3931,10 +3944,9 @@ bool M2Renderer::checkCollision(const glm::vec3& from, const glm::vec3& to, if (from.z > instance.worldBoundsMax.z + 2.5f && adjustedPos.z > instance.worldBoundsMax.z + 2.5f) continue; if (from.z + 2.5f < instance.worldBoundsMin.z && adjustedPos.z + 2.5f < instance.worldBoundsMin.z) continue; - auto it = models.find(instance.modelId); - if (it == models.end()) continue; + if (!instance.cachedModel) continue; - const M2ModelGPU& model = it->second; + const M2ModelGPU& model = *instance.cachedModel; if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue; if (instance.skipCollision) continue; if (instance.scale <= 0.001f) continue; @@ -4172,10 +4184,9 @@ float M2Renderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3& continue; } - auto it = models.find(instance.modelId); - if (it == models.end()) continue; + if (!instance.cachedModel) continue; - const M2ModelGPU& model = it->second; + const M2ModelGPU& model = *instance.cachedModel; if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue; glm::vec3 localMin, localMax; getTightCollisionBounds(model, localMin, localMax); diff --git a/src/rendering/renderer.cpp b/src/rendering/renderer.cpp index 69bfecdb..55ba1370 100644 --- a/src/rendering/renderer.cpp +++ b/src/rendering/renderer.cpp @@ -2434,6 +2434,9 @@ void Renderer::update(float deltaTime) { cameraController->update(deltaTime); auto cameraEnd = std::chrono::steady_clock::now(); lastCameraUpdateMs = std::chrono::duration(cameraEnd - cameraStart).count(); + if (lastCameraUpdateMs > 3.0) { + LOG_WARNING("SLOW cameraController->update: ", lastCameraUpdateMs, "ms"); + } // Update 3D audio listener position/orientation to match camera if (camera) { @@ -2779,8 +2782,15 @@ void Renderer::update(float deltaTime) { // Update M2 doodad animations (pass camera for frustum-culling bone computation) if (m2Renderer && camera) { + auto m2Start = std::chrono::steady_clock::now(); m2Renderer->update(deltaTime, camera->getPosition(), camera->getProjectionMatrix() * camera->getViewMatrix()); + float m2Ms = std::chrono::duration( + std::chrono::steady_clock::now() - m2Start).count(); + if (m2Ms > 3.0f) { + LOG_WARNING("SLOW m2Renderer->update: ", m2Ms, "ms (", + m2Renderer->getInstanceCount(), " instances)"); + } } // Helper: play zone music, dispatching local files (file: prefix) vs MPQ paths diff --git a/src/rendering/terrain_manager.cpp b/src/rendering/terrain_manager.cpp index 20a2e9a1..97527c8c 100644 --- a/src/rendering/terrain_manager.cpp +++ b/src/rendering/terrain_manager.cpp @@ -231,9 +231,14 @@ bool TerrainManager::loadTile(int x, int y) { return false; } + VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr; + if (vkCtx) vkCtx->beginUploadBatch(); + FinalizingTile ft; ft.pending = std::move(pending); while (!advanceFinalization(ft)) {} + + if (vkCtx) vkCtx->endUploadBatchSync(); // Sync — caller expects tile ready return true; } @@ -407,6 +412,20 @@ std::shared_ptr TerrainManager::prepareTile(int x, int y) { return false; } + // Pre-decode M2 model textures on background thread + for (const auto& tex : m2Model.textures) { + if (tex.filename.empty()) continue; + std::string texKey = tex.filename; + std::replace(texKey.begin(), texKey.end(), '/', '\\'); + std::transform(texKey.begin(), texKey.end(), texKey.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue; + auto blp = assetManager->loadTexture(texKey); + if (blp.isValid()) { + pending->preloadedM2Textures[texKey] = std::move(blp); + } + } + PendingTile::M2Ready ready; ready.modelId = modelId; ready.model = std::move(m2Model); @@ -584,6 +603,20 @@ std::shared_ptr TerrainManager::prepareTile(int x, int y) { pipeline::M2Loader::loadSkin(skinData, m2Model); } if (!m2Model.isValid()) continue; + + // Pre-decode doodad M2 textures on background thread + for (const auto& tex : m2Model.textures) { + if (tex.filename.empty()) continue; + std::string texKey = tex.filename; + std::replace(texKey.begin(), texKey.end(), '/', '\\'); + std::transform(texKey.begin(), texKey.end(), texKey.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue; + auto blp = assetManager->loadTexture(texKey); + if (blp.isValid()) { + pending->preloadedM2Textures[texKey] = std::move(blp); + } + } } // Build doodad's local transform (WoW coordinates) @@ -654,6 +687,32 @@ std::shared_ptr TerrainManager::prepareTile(int x, int y) { } } + // Pre-decode WMO textures on background thread + for (const auto& texPath : wmoModel.textures) { + if (texPath.empty()) continue; + std::string texKey = texPath; + // Truncate at NUL (WMO paths can have stray bytes) + size_t nul = texKey.find('\0'); + if (nul != std::string::npos) texKey.resize(nul); + std::replace(texKey.begin(), texKey.end(), '/', '\\'); + std::transform(texKey.begin(), texKey.end(), texKey.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + if (texKey.empty()) continue; + if (pending->preloadedWMOTextures.find(texKey) != pending->preloadedWMOTextures.end()) continue; + // Try .blp variant + std::string blpKey = texKey; + if (blpKey.size() >= 4) { + std::string ext = blpKey.substr(blpKey.size() - 4); + if (ext == ".tga" || ext == ".dds") { + blpKey = blpKey.substr(0, blpKey.size() - 4) + ".blp"; + } + } + auto blp = assetManager->loadTexture(blpKey); + if (blp.isValid()) { + pending->preloadedWMOTextures[blpKey] = std::move(blp); + } + } + PendingTile::WMOReady ready; // Cache WMO model uploads by path; placement dedup uses uniqueId separately. ready.modelId = static_cast(std::hash{}(wmoPath)); @@ -741,7 +800,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) { } bool allDone = terrainRenderer->loadTerrainIncremental( pending->mesh, pending->terrain.textures, x, y, - ft.terrainChunkNext, 64); + ft.terrainChunkNext, 32); if (!allDone) { return false; // More chunks remain — yield to time budget } @@ -773,7 +832,9 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) { case FinalizationPhase::M2_MODELS: { // Upload multiple M2 models per call (batched GPU uploads) if (m2Renderer && ft.m2ModelIndex < pending->m2Models.size()) { - constexpr size_t kModelsPerStep = 8; + // Set pre-decoded BLP cache so loadTexture() skips main-thread BLP decode + m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures); + constexpr size_t kModelsPerStep = 4; size_t uploaded = 0; while (ft.m2ModelIndex < pending->m2Models.size() && uploaded < kModelsPerStep) { auto& m2Ready = pending->m2Models[ft.m2ModelIndex]; @@ -786,6 +847,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) { ft.m2ModelIndex++; uploaded++; } + m2Renderer->setPredecodedBLPCache(nullptr); // Stay in this phase until all models uploaded if (ft.m2ModelIndex < pending->m2Models.size()) { return false; @@ -830,8 +892,11 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) { // Upload multiple WMO models per call (batched GPU uploads) if (wmoRenderer && assetManager) { wmoRenderer->initialize(nullptr, VK_NULL_HANDLE, assetManager); + // Set pre-decoded BLP cache and defer normal maps during streaming + wmoRenderer->setPredecodedBLPCache(&pending->preloadedWMOTextures); + wmoRenderer->setDeferNormalMaps(true); - constexpr size_t kWmosPerStep = 4; + constexpr size_t kWmosPerStep = 1; size_t uploaded = 0; while (ft.wmoModelIndex < pending->wmoModels.size() && uploaded < kWmosPerStep) { auto& wmoReady = pending->wmoModels[ft.wmoModelIndex]; @@ -843,6 +908,8 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) { uploaded++; } } + wmoRenderer->setDeferNormalMaps(false); + wmoRenderer->setPredecodedBLPCache(nullptr); if (ft.wmoModelIndex < pending->wmoModels.size()) return false; } ft.phase = FinalizationPhase::WMO_INSTANCES; @@ -906,7 +973,9 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) { case FinalizationPhase::WMO_DOODADS: { // Upload multiple WMO doodad M2s per call (batched GPU uploads) if (m2Renderer && ft.wmoDoodadIndex < pending->wmoDoodads.size()) { - constexpr size_t kDoodadsPerStep = 16; + // Set pre-decoded BLP cache for doodad M2 textures + m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures); + constexpr size_t kDoodadsPerStep = 4; size_t uploaded = 0; while (ft.wmoDoodadIndex < pending->wmoDoodads.size() && uploaded < kDoodadsPerStep) { auto& doodad = pending->wmoDoodads[ft.wmoDoodadIndex]; @@ -923,6 +992,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) { ft.wmoDoodadIndex++; uploaded++; } + m2Renderer->setPredecodedBLPCache(nullptr); if (ft.wmoDoodadIndex < pending->wmoDoodads.size()) return false; } ft.phase = FinalizationPhase::WATER; @@ -1080,11 +1150,6 @@ void TerrainManager::workerLoop() { } void TerrainManager::processReadyTiles() { - // Process tiles with time budget to avoid frame spikes - // Taxi mode gets a slightly larger budget to avoid visible late-pop terrain/models. - const float timeBudgetMs = taxiStreamingMode_ ? 8.0f : 3.0f; - auto startTime = std::chrono::high_resolution_clock::now(); - // Move newly ready tiles into the finalizing deque. // Keep them in pendingTiles so streamTiles() won't re-enqueue them. { @@ -1100,28 +1165,32 @@ void TerrainManager::processReadyTiles() { } } - // Outer upload batch: all GPU uploads across all advanceFinalization calls - // this frame share a single command buffer submission + fence wait. VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr; + + // Reclaim completed async uploads from previous frames (non-blocking) + if (vkCtx) vkCtx->pollUploadBatches(); + + // Nothing to finalize — done. + if (finalizingTiles_.empty()) return; + + // Async upload batch: record GPU copies into a command buffer, submit with + // a fence, but DON'T wait. The fence is polled on subsequent frames. + // This eliminates the main-thread stall from vkWaitForFences entirely. + const int maxSteps = taxiStreamingMode_ ? 8 : 2; + int steps = 0; + if (vkCtx) vkCtx->beginUploadBatch(); - // Drive incremental finalization within time budget - while (!finalizingTiles_.empty()) { + while (!finalizingTiles_.empty() && steps < maxSteps) { auto& ft = finalizingTiles_.front(); bool done = advanceFinalization(ft); - if (done) { finalizingTiles_.pop_front(); } - - auto now = std::chrono::high_resolution_clock::now(); - float elapsedMs = std::chrono::duration(now - startTime).count(); - if (elapsedMs >= timeBudgetMs) { - break; - } + steps++; } - if (vkCtx) vkCtx->endUploadBatch(); + if (vkCtx) vkCtx->endUploadBatch(); // Async — submits but doesn't wait } void TerrainManager::processAllReadyTiles() { @@ -1151,7 +1220,7 @@ void TerrainManager::processAllReadyTiles() { finalizingTiles_.pop_front(); } - if (vkCtx) vkCtx->endUploadBatch(); + if (vkCtx) vkCtx->endUploadBatchSync(); // Sync — load screen needs data ready } void TerrainManager::processOneReadyTile() { @@ -1177,7 +1246,7 @@ void TerrainManager::processOneReadyTile() { while (!advanceFinalization(ft)) {} finalizingTiles_.pop_front(); - if (vkCtx) vkCtx->endUploadBatch(); + if (vkCtx) vkCtx->endUploadBatchSync(); // Sync — load screen needs data ready } } diff --git a/src/rendering/vk_context.cpp b/src/rendering/vk_context.cpp index dc73c685..79e7eac3 100644 --- a/src/rendering/vk_context.cpp +++ b/src/rendering/vk_context.cpp @@ -67,6 +67,14 @@ void VkContext::shutdown() { frame = {}; } + // Clean up any in-flight async upload batches (device already idle) + for (auto& batch : inFlightBatches_) { + // Staging buffers: skip destroy — allocator is about to be torn down + vkDestroyFence(device, batch.fence, nullptr); + // Command buffer freed when pool is destroyed below + } + inFlightBatches_.clear(); + if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; } if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; } @@ -1447,17 +1455,94 @@ void VkContext::endUploadBatch() { inUploadBatch_ = false; - // Submit all recorded commands with a single fence wait + if (batchStagingBuffers_.empty()) { + // No GPU copies were recorded — skip the submit entirely. + vkEndCommandBuffer(batchCmd_); + vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_); + batchCmd_ = VK_NULL_HANDLE; + return; + } + + // Submit commands with a NEW fence — don't wait, let GPU work in parallel. + vkEndCommandBuffer(batchCmd_); + + VkFenceCreateInfo fenceInfo{}; + fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + VkFence fence = VK_NULL_HANDLE; + vkCreateFence(device, &fenceInfo, nullptr, &fence); + + VkSubmitInfo submitInfo{}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &batchCmd_; + vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence); + + // Stash everything for later cleanup when fence signals + InFlightBatch batch; + batch.fence = fence; + batch.cmd = batchCmd_; + batch.stagingBuffers = std::move(batchStagingBuffers_); + inFlightBatches_.push_back(std::move(batch)); + + batchCmd_ = VK_NULL_HANDLE; + batchStagingBuffers_.clear(); +} + +void VkContext::endUploadBatchSync() { + if (uploadBatchDepth_ <= 0) return; + uploadBatchDepth_--; + if (uploadBatchDepth_ > 0) return; + + inUploadBatch_ = false; + + if (batchStagingBuffers_.empty()) { + vkEndCommandBuffer(batchCmd_); + vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_); + batchCmd_ = VK_NULL_HANDLE; + return; + } + + // Synchronous path for load screens — submit and wait endSingleTimeCommands(batchCmd_); batchCmd_ = VK_NULL_HANDLE; - // Destroy all deferred staging buffers for (auto& staging : batchStagingBuffers_) { destroyBuffer(allocator, staging); } batchStagingBuffers_.clear(); } +void VkContext::pollUploadBatches() { + if (inFlightBatches_.empty()) return; + + for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ) { + VkResult result = vkGetFenceStatus(device, it->fence); + if (result == VK_SUCCESS) { + // GPU finished — free resources + for (auto& staging : it->stagingBuffers) { + destroyBuffer(allocator, staging); + } + vkFreeCommandBuffers(device, immCommandPool, 1, &it->cmd); + vkDestroyFence(device, it->fence, nullptr); + it = inFlightBatches_.erase(it); + } else { + ++it; + } + } +} + +void VkContext::waitAllUploads() { + for (auto& batch : inFlightBatches_) { + vkWaitForFences(device, 1, &batch.fence, VK_TRUE, UINT64_MAX); + for (auto& staging : batch.stagingBuffers) { + destroyBuffer(allocator, staging); + } + vkFreeCommandBuffers(device, immCommandPool, 1, &batch.cmd); + vkDestroyFence(device, batch.fence, nullptr); + } + inFlightBatches_.clear(); +} + void VkContext::deferStagingCleanup(AllocatedBuffer staging) { batchStagingBuffers_.push_back(staging); } diff --git a/src/rendering/wmo_renderer.cpp b/src/rendering/wmo_renderer.cpp index 691abaa1..5dec0e3e 100644 --- a/src/rendering/wmo_renderer.cpp +++ b/src/rendering/wmo_renderer.cpp @@ -2325,13 +2325,27 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) { const auto& attemptedCandidates = uniqueCandidates; // Try loading all candidates until one succeeds + // Check pre-decoded BLP cache first (populated by background worker threads) pipeline::BLPImage blp; std::string resolvedKey; - for (const auto& c : attemptedCandidates) { - blp = assetManager->loadTexture(c); - if (blp.isValid()) { - resolvedKey = c; - break; + if (predecodedBLPCache_) { + for (const auto& c : uniqueCandidates) { + auto pit = predecodedBLPCache_->find(c); + if (pit != predecodedBLPCache_->end()) { + blp = std::move(pit->second); + predecodedBLPCache_->erase(pit); + resolvedKey = c; + break; + } + } + } + if (!blp.isValid()) { + for (const auto& c : attemptedCandidates) { + blp = assetManager->loadTexture(c); + if (blp.isValid()) { + resolvedKey = c; + break; + } } } if (!blp.isValid()) { @@ -2369,10 +2383,10 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) { texture->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR, VK_SAMPLER_ADDRESS_MODE_REPEAT); - // Generate normal+height map from diffuse pixels + // Generate normal+height map from diffuse pixels (skip during streaming to avoid CPU stalls) float nhVariance = 0.0f; std::unique_ptr nhMap; - if (normalMappingEnabled_ || pomEnabled_) { + if ((normalMappingEnabled_ || pomEnabled_) && !deferNormalMaps_) { nhMap = generateNormalHeightMap(blp.data.data(), blp.width, blp.height, nhVariance); if (nhMap) { approxBytes *= 2; // account for normal map in budget