From 0313bd869285ff9e21e6bf24e3ec192189be4633 Mon Sep 17 00:00:00 2001 From: Kelsi Date: Sat, 7 Mar 2026 13:44:09 -0800 Subject: [PATCH] Performance: ring buffer UBOs, batched load screen uploads, background world preloader - Replace per-frame VMA alloc/free of material UBOs with a ring buffer in CharacterRenderer (~500 allocations/frame eliminated) - Batch all ready terrain tiles into a single GPU upload during load screen (processAllReadyTiles instead of one-at-a-time with individual fence waits) - Lift per-frame creature/GO spawn budgets during load screen warmup phase - Add background world preloader: saves last world position to disk, pre-warms AssetManager file cache with ADT files starting at app init (login screen) so terrain workers get instant cache hits when Enter World is clicked - Distance-filter expensive collision guard to 8-unit melee range - Merge 3 CharacterRenderer update loops into single pass - Time-budget instrumentation for slow update stages (>3ms threshold) - Count-based async creature model upload budget (max 3/frame in-game) - 1-per-frame game object spawn + per-doodad time budget for transport loading - Use deque for creature spawn queue to avoid O(n) front-erase --- include/core/application.hpp | 22 +- include/rendering/character_renderer.hpp | 9 +- src/core/application.cpp | 299 +++++++++++++++++++++-- src/game/game_handler.cpp | 6 + src/rendering/character_renderer.cpp | 160 +++++------- src/rendering/renderer.cpp | 13 + src/rendering/terrain_manager.cpp | 2 +- 7 files changed, 390 insertions(+), 121 deletions(-) diff --git a/include/core/application.hpp b/include/core/application.hpp index 7415da18..a23e6bd8 100644 --- a/include/core/application.hpp +++ b/include/core/application.hpp @@ -6,12 +6,15 @@ #include #include #include +#include #include #include #include #include #include #include +#include +#include namespace wowee { @@ -282,7 +285,7 @@ private: uint32_t displayId; float x, y, z, orientation; }; - std::vector pendingCreatureSpawns_; + std::deque pendingCreatureSpawns_; static constexpr int MAX_SPAWNS_PER_FRAME = 3; static constexpr int MAX_NEW_CREATURE_MODELS_PER_FRAME = 1; static constexpr uint16_t MAX_CREATURE_SPAWN_RETRIES = 300; @@ -353,6 +356,23 @@ private: // Quest marker billboard sprites (above NPCs) void loadQuestMarkerModels(); // Now loads BLP textures void updateQuestMarkers(); // Updates billboard positions + + // Background world preloader — warms AssetManager file cache for the + // expected world before the user clicks Enter World. + struct WorldPreload { + uint32_t mapId = 0; + std::string mapName; + int centerTileX = 0; + int centerTileY = 0; + std::atomic cancel{false}; + std::vector workers; + }; + std::unique_ptr worldPreload_; + void startWorldPreload(uint32_t mapId, const std::string& mapName, float serverX, float serverY); + void cancelWorldPreload(); + void saveLastWorldInfo(uint32_t mapId, const std::string& mapName, float serverX, float serverY); + struct LastWorldInfo { uint32_t mapId = 0; std::string mapName; float x = 0, y = 0; bool valid = false; }; + LastWorldInfo loadLastWorldInfo() const; }; } // namespace core diff --git a/include/rendering/character_renderer.hpp b/include/rendering/character_renderer.hpp index c6f63451..52813cf4 100644 --- a/include/rendering/character_renderer.hpp +++ b/include/rendering/character_renderer.hpp @@ -254,7 +254,14 @@ private: VkDescriptorPool materialDescPools_[2] = {VK_NULL_HANDLE, VK_NULL_HANDLE}; VkDescriptorPool boneDescPool_ = VK_NULL_HANDLE; uint32_t lastMaterialPoolResetFrame_ = 0xFFFFFFFFu; - std::vector> transientMaterialUbos_[2]; + + // Material UBO ring buffer — pre-allocated per frame slot, sub-allocated each draw + VkBuffer materialRingBuffer_[2] = {VK_NULL_HANDLE, VK_NULL_HANDLE}; + VmaAllocation materialRingAlloc_[2] = {VK_NULL_HANDLE, VK_NULL_HANDLE}; + void* materialRingMapped_[2] = {nullptr, nullptr}; + uint32_t materialRingOffset_[2] = {0, 0}; + uint32_t materialUboAlignment_ = 256; // minUniformBufferOffsetAlignment + static constexpr uint32_t MATERIAL_RING_CAPACITY = 4096; // Texture cache struct TextureCacheEntry { diff --git a/src/core/application.cpp b/src/core/application.cpp index cabcaa01..f0c22a2c 100644 --- a/src/core/application.cpp +++ b/src/core/application.cpp @@ -56,6 +56,7 @@ #include #include #include +#include #include #ifdef __linux__ @@ -314,6 +315,15 @@ bool Application::initialize() { gameHandler->getTransportManager()->loadTaxiPathNodeDBC(assetManager.get()); } + // Start background preload for last-played character's world. + // Warms the file cache so terrain tile loading is faster at Enter World. + { + auto lastWorld = loadLastWorldInfo(); + if (lastWorld.valid) { + startWorldPreload(lastWorld.mapId, lastWorld.mapName, lastWorld.x, lastWorld.y); + } + } + } else { LOG_WARNING("Failed to initialize asset manager - asset loading will be unavailable"); LOG_WARNING("Set WOW_DATA_PATH environment variable to your WoW Data directory"); @@ -521,6 +531,9 @@ void Application::run() { void Application::shutdown() { LOG_WARNING("Shutting down application..."); + // Stop background world preloader before destroying AssetManager + cancelWorldPreload(); + // Save floor cache before renderer is destroyed if (renderer && renderer->getWMORenderer()) { size_t cacheSize = renderer->getWMORenderer()->getFloorCacheSize(); @@ -843,6 +856,7 @@ void Application::update(float deltaTime) { const char* inGameStep = "begin"; try { auto runInGameStage = [&](const char* stageName, auto&& fn) { + auto stageStart = std::chrono::steady_clock::now(); try { fn(); } catch (const std::bad_alloc& e) { @@ -852,6 +866,11 @@ void Application::update(float deltaTime) { LOG_ERROR("Exception during IN_GAME update stage '", stageName, "': ", e.what()); throw; } + auto stageEnd = std::chrono::steady_clock::now(); + float stageMs = std::chrono::duration(stageEnd - stageStart).count(); + if (stageMs > 3.0f) { + LOG_WARNING("SLOW update stage '", stageName, "': ", stageMs, "ms"); + } }; inGameStep = "gameHandler update"; updateCheckpoint = "in_game: gameHandler update"; @@ -1289,6 +1308,7 @@ void Application::update(float deltaTime) { // creature models remain at stale spawn positions. inGameStep = "creature render sync"; updateCheckpoint = "in_game: creature render sync"; + auto creatureSyncStart = std::chrono::steady_clock::now(); if (renderer && gameHandler && renderer->getCharacterRenderer()) { auto* charRenderer = renderer->getCharacterRenderer(); static float npcWeaponRetryTimer = 0.0f; @@ -1333,24 +1353,31 @@ void Application::update(float deltaTime) { } glm::vec3 canonical(entity->getX(), entity->getY(), entity->getZ()); + float canonDistSq = 0.0f; if (havePlayerPos) { glm::vec3 d = canonical - playerPos; - if (glm::dot(d, d) > syncRadiusSq) continue; + canonDistSq = glm::dot(d, d); + if (canonDistSq > syncRadiusSq) continue; } glm::vec3 renderPos = core::coords::canonicalToRender(canonical); // Visual collision guard: keep hostile melee units from rendering inside the // player's model while attacking. This is client-side only (no server position change). - auto unit = std::static_pointer_cast(entity); - const uint64_t currentTargetGuid = gameHandler->hasTarget() ? gameHandler->getTargetGuid() : 0; - const uint64_t autoAttackGuid = gameHandler->getAutoAttackTargetGuid(); - const bool isCombatTarget = (guid == currentTargetGuid || guid == autoAttackGuid); - bool clipGuardEligible = havePlayerPos && - unit->getHealth() > 0 && - (unit->isHostile() || - gameHandler->isAggressiveTowardPlayer(guid) || - isCombatTarget); + // Only check for creatures within 8 units (melee range) — saves expensive + // getRenderBoundsForGuid/getModelData calls for distant creatures. + bool clipGuardEligible = false; + bool isCombatTarget = false; + if (havePlayerPos && canonDistSq < 64.0f) { // 8² = melee range + auto unit = std::static_pointer_cast(entity); + const uint64_t currentTargetGuid = gameHandler->hasTarget() ? gameHandler->getTargetGuid() : 0; + const uint64_t autoAttackGuid = gameHandler->getAutoAttackTargetGuid(); + isCombatTarget = (guid == currentTargetGuid || guid == autoAttackGuid); + clipGuardEligible = unit->getHealth() > 0 && + (unit->isHostile() || + gameHandler->isAggressiveTowardPlayer(guid) || + isCombatTarget); + } if (clipGuardEligible) { float creatureCollisionRadius = 0.8f; glm::vec3 cc; @@ -1410,7 +1437,8 @@ void Application::update(float deltaTime) { float planarDist = glm::length(delta2); float dz = std::abs(renderPos.z - prevPos.z); - const bool deadOrCorpse = unit->getHealth() == 0; + auto unitPtr = std::static_pointer_cast(entity); + const bool deadOrCorpse = unitPtr->getHealth() == 0; const bool largeCorrection = (planarDist > 6.0f) || (dz > 3.0f); if (deadOrCorpse || largeCorrection) { charRenderer->setInstancePosition(instanceId, renderPos); @@ -1425,6 +1453,14 @@ void Application::update(float deltaTime) { charRenderer->setInstanceRotation(instanceId, glm::vec3(0.0f, 0.0f, renderYaw)); } } + { + float csMs = std::chrono::duration( + std::chrono::steady_clock::now() - creatureSyncStart).count(); + if (csMs > 5.0f) { + LOG_WARNING("SLOW update stage 'creature render sync': ", csMs, "ms (", + creatureInstances_.size(), " creatures)"); + } + } // Movement heartbeat is sent from GameHandler::update() to avoid // duplicate packets from multiple update loops. @@ -1447,6 +1483,7 @@ void Application::update(float deltaTime) { // Update renderer (camera, etc.) only when in-game updateCheckpoint = "renderer update"; if (renderer && state == AppState::IN_GAME) { + auto rendererUpdateStart = std::chrono::steady_clock::now(); try { renderer->update(deltaTime); } catch (const std::bad_alloc& e) { @@ -1456,6 +1493,11 @@ void Application::update(float deltaTime) { LOG_ERROR("Exception during Application::update stage 'renderer->update': ", e.what()); throw; } + float ruMs = std::chrono::duration( + std::chrono::steady_clock::now() - rendererUpdateStart).count(); + if (ruMs > 5.0f) { + LOG_WARNING("SLOW update stage 'renderer->update': ", ruMs, "ms"); + } } // Update UI updateCheckpoint = "ui update"; @@ -3537,6 +3579,21 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float } LOG_INFO("Loading online world terrain for map '", mapName, "' (ID ", mapId, ")"); + // Cancel any stale preload (if it was for a different map, the file cache + // still retains whatever was loaded — it doesn't hurt). + if (worldPreload_) { + if (worldPreload_->mapId == mapId) { + LOG_INFO("World preload: cache-warm hit for map '", mapName, "'"); + } else { + LOG_INFO("World preload: map mismatch (preloaded ", worldPreload_->mapName, + ", entering ", mapName, ")"); + } + } + cancelWorldPreload(); + + // Save this world info for next session's early preload + saveLastWorldInfo(mapId, mapName, x, y); + // Convert server coordinates to canonical WoW coordinates // Server sends: X=West (canonical.Y), Y=North (canonical.X), Z=Up glm::vec3 spawnCanonical = core::coords::serverToCanonical(glm::vec3(x, y, z)); @@ -3967,8 +4024,11 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float // Trigger new streaming — enqueue tiles for background workers terrainMgr->update(*camera, 0.016f); - // Process ONE tile per iteration so loading screen updates after each - terrainMgr->processOneReadyTile(); + // Process ALL available ready tiles per iteration — batches GPU + // uploads into a single command buffer + fence wait instead of + // one fence per tile. Loading screen still updates between + // iterations while workers parse more tiles. + terrainMgr->processAllReadyTiles(); int remaining = terrainMgr->getRemainingTileCount(); int loaded = terrainMgr->getLoadedTileCount(); @@ -4126,9 +4186,64 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float if (world) world->update(1.0f / 60.0f); processPlayerSpawnQueue(); + + // During load screen warmup: lift per-frame budgets so GPU uploads + // happen in bulk while the loading screen is still visible. + // Process ALL async creature model uploads (no 3-per-frame cap). + { + for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) { + if (!it->future.valid() || + it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) { + ++it; + continue; + } + auto result = it->future.get(); + it = asyncCreatureLoads_.erase(it); + if (result.permanent_failure) { + nonRenderableCreatureDisplayIds_.insert(result.displayId); + creaturePermanentFailureGuids_.insert(result.guid); + pendingCreatureSpawnGuids_.erase(result.guid); + creatureSpawnRetryCounts_.erase(result.guid); + continue; + } + if (!result.valid || !result.model) { + pendingCreatureSpawnGuids_.erase(result.guid); + creatureSpawnRetryCounts_.erase(result.guid); + continue; + } + auto* charRenderer = renderer ? renderer->getCharacterRenderer() : nullptr; + if (!charRenderer) { pendingCreatureSpawnGuids_.erase(result.guid); continue; } + if (!charRenderer->loadModel(*result.model, result.modelId)) { + nonRenderableCreatureDisplayIds_.insert(result.displayId); + creaturePermanentFailureGuids_.insert(result.guid); + pendingCreatureSpawnGuids_.erase(result.guid); + creatureSpawnRetryCounts_.erase(result.guid); + continue; + } + displayIdModelCache_[result.displayId] = result.modelId; + pendingCreatureSpawnGuids_.erase(result.guid); + creatureSpawnRetryCounts_.erase(result.guid); + if (!creatureInstances_.count(result.guid) && + !creaturePermanentFailureGuids_.count(result.guid)) { + PendingCreatureSpawn s{}; + s.guid = result.guid; s.displayId = result.displayId; + s.x = result.x; s.y = result.y; s.z = result.z; + s.orientation = result.orientation; + pendingCreatureSpawns_.push_back(s); + pendingCreatureSpawnGuids_.insert(result.guid); + } + } + } processCreatureSpawnQueue(); processDeferredEquipmentQueue(); - processGameObjectSpawnQueue(); + + // Process ALL pending game object spawns (no 1-per-frame cap during load screen). + while (!pendingGameObjectSpawns_.empty()) { + auto& s = pendingGameObjectSpawns_.front(); + spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation); + pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin()); + } + processPendingTransportDoodads(); processPendingMount(); updateQuestMarkers(); @@ -6767,12 +6882,25 @@ void Application::spawnOnlineGameObject(uint64_t guid, uint32_t entry, uint32_t void Application::processAsyncCreatureResults() { // Check completed async model loads and finalize on main thread (GPU upload + instance creation). + // Limit GPU model uploads per frame to avoid spikes, but always drain cheap bookkeeping. + static constexpr int kMaxModelUploadsPerFrame = 3; + int modelUploads = 0; + for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) { if (!it->future.valid() || it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) { ++it; continue; } + + // Peek: if this result needs a NEW model upload (not cached) and we've hit + // the upload budget, defer to next frame without consuming the future. + if (modelUploads >= kMaxModelUploadsPerFrame) { + // Check if this displayId already has a cached model (cheap spawn, no GPU upload). + // We can't peek the displayId without getting the future, so just break. + break; + } + auto result = it->future.get(); it = asyncCreatureLoads_.erase(it); @@ -6805,6 +6933,7 @@ void Application::processAsyncCreatureResults() { continue; } displayIdModelCache_[result.displayId] = result.modelId; + modelUploads++; pendingCreatureSpawnGuids_.erase(result.guid); creatureSpawnRetryCounts_.erase(result.guid); @@ -6854,7 +6983,7 @@ void Application::processCreatureSpawnQueue() { } PendingCreatureSpawn s = pendingCreatureSpawns_.front(); - pendingCreatureSpawns_.erase(pendingCreatureSpawns_.begin()); + pendingCreatureSpawns_.pop_front(); if (nonRenderableCreatureDisplayIds_.count(s.displayId)) { pendingCreatureSpawnGuids_.erase(s.guid); @@ -7035,13 +7164,11 @@ void Application::processDeferredEquipmentQueue() { void Application::processGameObjectSpawnQueue() { if (pendingGameObjectSpawns_.empty()) return; - int spawned = 0; - while (!pendingGameObjectSpawns_.empty() && spawned < MAX_SPAWNS_PER_FRAME) { - auto& s = pendingGameObjectSpawns_.front(); - spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation); - pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin()); - spawned++; - } + // Only spawn 1 game object per frame — each can involve heavy synchronous + // WMO loading (root + groups from disk + GPU upload), easily 100ms+. + auto& s = pendingGameObjectSpawns_.front(); + spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation); + pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin()); } void Application::processPendingTransportDoodads() { @@ -7052,9 +7179,16 @@ void Application::processPendingTransportDoodads() { auto* m2Renderer = renderer->getM2Renderer(); if (!wmoRenderer || !m2Renderer) return; + auto startTime = std::chrono::steady_clock::now(); + static constexpr float kDoodadBudgetMs = 4.0f; + size_t budgetLeft = MAX_TRANSPORT_DOODADS_PER_FRAME; for (auto it = pendingTransportDoodadBatches_.begin(); it != pendingTransportDoodadBatches_.end() && budgetLeft > 0;) { + // Time budget check + float elapsedMs = std::chrono::duration( + std::chrono::steady_clock::now() - startTime).count(); + if (elapsedMs >= kDoodadBudgetMs) break; auto goIt = gameObjectInstances_.find(it->guid); if (goIt == gameObjectInstances_.end() || !goIt->second.isWmo || goIt->second.instanceId != it->instanceId || goIt->second.modelId != it->modelId) { @@ -7070,6 +7204,11 @@ void Application::processPendingTransportDoodads() { const size_t maxIndex = std::min(it->doodadBudget, doodadTemplates->size()); while (it->nextIndex < maxIndex && budgetLeft > 0) { + // Per-doodad time budget (each does synchronous file I/O + parse + GPU upload) + float innerMs = std::chrono::duration( + std::chrono::steady_clock::now() - startTime).count(); + if (innerMs >= kDoodadBudgetMs) { budgetLeft = 0; break; } + const auto& doodadTemplate = (*doodadTemplates)[it->nextIndex]; it->nextIndex++; budgetLeft--; @@ -7729,5 +7868,121 @@ void Application::setupTestTransport() { LOG_INFO("========================================"); } +// ─── World Preloader ───────────────────────────────────────────────────────── +// Pre-warms AssetManager file cache with ADT files (and their _obj0 variants) +// for tiles around the expected spawn position. Runs in background so that +// when loadOnlineWorldTerrain eventually asks TerrainManager workers to parse +// the same files, every readFile() is an instant cache hit instead of disk I/O. + +void Application::startWorldPreload(uint32_t mapId, const std::string& mapName, + float serverX, float serverY) { + cancelWorldPreload(); + if (!assetManager || !assetManager->isInitialized() || mapName.empty()) return; + + glm::vec3 canonical = core::coords::serverToCanonical(glm::vec3(serverX, serverY, 0.0f)); + auto [tileX, tileY] = core::coords::canonicalToTile(canonical.x, canonical.y); + + worldPreload_ = std::make_unique(); + worldPreload_->mapId = mapId; + worldPreload_->mapName = mapName; + worldPreload_->centerTileX = tileX; + worldPreload_->centerTileY = tileY; + + LOG_INFO("World preload: starting for map '", mapName, "' tile [", tileX, ",", tileY, "]"); + + // Build list of tiles to preload (radius 1 = 3x3 = 9 tiles, matching load screen) + struct TileJob { int x, y; }; + auto jobs = std::make_shared>(); + // Center tile first (most important) + jobs->push_back({tileX, tileY}); + for (int dx = -1; dx <= 1; dx++) { + for (int dy = -1; dy <= 1; dy++) { + if (dx == 0 && dy == 0) continue; + int tx = tileX + dx, ty = tileY + dy; + if (tx < 0 || tx > 63 || ty < 0 || ty > 63) continue; + jobs->push_back({tx, ty}); + } + } + + // Spawn worker threads (one per tile for maximum parallelism) + auto cancelFlag = &worldPreload_->cancel; + auto* am = assetManager.get(); + std::string mn = mapName; + + int numWorkers = std::min(static_cast(jobs->size()), 4); + auto nextJob = std::make_shared>(0); + + for (int w = 0; w < numWorkers; w++) { + worldPreload_->workers.emplace_back([am, mn, jobs, nextJob, cancelFlag]() { + while (!cancelFlag->load(std::memory_order_relaxed)) { + int idx = nextJob->fetch_add(1, std::memory_order_relaxed); + if (idx >= static_cast(jobs->size())) break; + + int tx = (*jobs)[idx].x; + int ty = (*jobs)[idx].y; + + // Read ADT file (warms file cache) + std::string adtPath = "World\\Maps\\" + mn + "\\" + mn + "_" + + std::to_string(tx) + "_" + std::to_string(ty) + ".adt"; + am->readFile(adtPath); + if (cancelFlag->load(std::memory_order_relaxed)) break; + + // Read obj0 variant + std::string objPath = "World\\Maps\\" + mn + "\\" + mn + "_" + + std::to_string(tx) + "_" + std::to_string(ty) + "_obj0.adt"; + am->readFile(objPath); + } + LOG_DEBUG("World preload worker finished"); + }); + } +} + +void Application::cancelWorldPreload() { + if (!worldPreload_) return; + worldPreload_->cancel.store(true, std::memory_order_relaxed); + for (auto& t : worldPreload_->workers) { + if (t.joinable()) t.join(); + } + LOG_INFO("World preload: cancelled (map=", worldPreload_->mapName, + " tile=[", worldPreload_->centerTileX, ",", worldPreload_->centerTileY, "])"); + worldPreload_.reset(); +} + +void Application::saveLastWorldInfo(uint32_t mapId, const std::string& mapName, + float serverX, float serverY) { +#ifdef _WIN32 + const char* base = std::getenv("APPDATA"); + std::string dir = base ? std::string(base) + "\\wowee" : "."; +#else + const char* home = std::getenv("HOME"); + std::string dir = home ? std::string(home) + "/.wowee" : "."; +#endif + std::filesystem::create_directories(dir); + std::ofstream f(dir + "/last_world.cfg"); + if (f) { + f << mapId << "\n" << mapName << "\n" << serverX << "\n" << serverY << "\n"; + } +} + +Application::LastWorldInfo Application::loadLastWorldInfo() const { +#ifdef _WIN32 + const char* base = std::getenv("APPDATA"); + std::string dir = base ? std::string(base) + "\\wowee" : "."; +#else + const char* home = std::getenv("HOME"); + std::string dir = home ? std::string(home) + "/.wowee" : "."; +#endif + LastWorldInfo info; + std::ifstream f(dir + "/last_world.cfg"); + if (!f) return info; + std::string line; + if (std::getline(f, line)) info.mapId = static_cast(std::stoul(line)); + if (std::getline(f, line)) info.mapName = line; + if (std::getline(f, line)) info.x = std::stof(line); + if (std::getline(f, line)) info.y = std::stof(line); + info.valid = !info.mapName.empty(); + return info; +} + } // namespace core } // namespace wowee diff --git a/src/game/game_handler.cpp b/src/game/game_handler.cpp index e80e727f..9a7aed97 100644 --- a/src/game/game_handler.cpp +++ b/src/game/game_handler.cpp @@ -541,7 +541,13 @@ void GameHandler::update(float deltaTime) { // Update socket (processes incoming data and triggers callbacks) if (socket) { + auto socketStart = std::chrono::steady_clock::now(); socket->update(); + float socketMs = std::chrono::duration( + std::chrono::steady_clock::now() - socketStart).count(); + if (socketMs > 3.0f) { + LOG_WARNING("SLOW socket->update: ", socketMs, "ms"); + } } // Detect server-side disconnect (socket closed during update) diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp index 9aa99c72..f735dd7d 100644 --- a/src/rendering/character_renderer.cpp +++ b/src/rendering/character_renderer.cpp @@ -197,6 +197,29 @@ bool CharacterRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFram vkCreateDescriptorPool(device, &ci, nullptr, &boneDescPool_); } + // --- Material UBO ring buffers (one per frame slot) --- + { + VkPhysicalDeviceProperties props; + vkGetPhysicalDeviceProperties(ctx->getPhysicalDevice(), &props); + materialUboAlignment_ = static_cast(props.limits.minUniformBufferOffsetAlignment); + if (materialUboAlignment_ < 1) materialUboAlignment_ = 1; + // Round up UBO size to alignment + uint32_t alignedUboSize = (sizeof(CharMaterialUBO) + materialUboAlignment_ - 1) & ~(materialUboAlignment_ - 1); + uint32_t ringSize = alignedUboSize * MATERIAL_RING_CAPACITY; + for (int i = 0; i < 2; i++) { + VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO}; + bci.size = ringSize; + bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; + VmaAllocationCreateInfo aci{}; + aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; + aci.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT; + VmaAllocationInfo allocInfo{}; + vmaCreateBuffer(ctx->getAllocator(), &bci, &aci, + &materialRingBuffer_[i], &materialRingAlloc_[i], &allocInfo); + materialRingMapped_[i] = allocInfo.pMappedData; + } + } + // --- Pipeline layout --- // set 0 = perFrame, set 1 = material, set 2 = bones // Push constant: mat4 model = 64 bytes @@ -352,14 +375,15 @@ void CharacterRenderer::shutdown() { if (pipelineLayout_) { vkDestroyPipelineLayout(device, pipelineLayout_, nullptr); pipelineLayout_ = VK_NULL_HANDLE; } - // Release any deferred transient material UBOs. + // Destroy material ring buffers for (int i = 0; i < 2; i++) { - for (const auto& b : transientMaterialUbos_[i]) { - if (b.first) { - vmaDestroyBuffer(alloc, b.first, b.second); - } + if (materialRingBuffer_[i]) { + vmaDestroyBuffer(alloc, materialRingBuffer_[i], materialRingAlloc_[i]); + materialRingBuffer_[i] = VK_NULL_HANDLE; + materialRingAlloc_[i] = VK_NULL_HANDLE; + materialRingMapped_[i] = nullptr; } - transientMaterialUbos_[i].clear(); + materialRingOffset_[i] = 0; } // Destroy descriptor pools and layouts @@ -391,7 +415,6 @@ void CharacterRenderer::clear() { vkDeviceWaitIdle(vkCtx_->getDevice()); VkDevice device = vkCtx_->getDevice(); - VmaAllocator alloc = vkCtx_->getAllocator(); // Destroy GPU resources for all models for (auto& pair : models) { @@ -441,14 +464,9 @@ void CharacterRenderer::clear() { models.clear(); instances.clear(); - // Release deferred transient material UBOs + // Reset material ring buffer offsets (buffers persist, just reset write position) for (int i = 0; i < 2; i++) { - for (const auto& b : transientMaterialUbos_[i]) { - if (b.first) { - vmaDestroyBuffer(alloc, b.first, b.second); - } - } - transientMaterialUbos_[i].clear(); + materialRingOffset_[i] = 0; } // Reset descriptor pools (don't destroy — reuse for new allocations) @@ -1454,8 +1472,14 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) { const float animUpdateRadius = static_cast(envSizeOrDefault("WOWEE_CHAR_ANIM_RADIUS", 120)); const float animUpdateRadiusSq = animUpdateRadius * animUpdateRadius; - // Update fade-in opacity - for (auto& [id, inst] : instances) { + // Single pass: fade-in, movement, and animation bone collection + std::vector> toUpdate; + toUpdate.reserve(instances.size()); + + for (auto& pair : instances) { + auto& inst = pair.second; + + // Update fade-in opacity if (inst.fadeInDuration > 0.0f && inst.opacity < 1.0f) { inst.fadeInTime += deltaTime; inst.opacity = std::min(1.0f, inst.fadeInTime / inst.fadeInDuration); @@ -1463,10 +1487,8 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) { inst.fadeInDuration = 0.0f; } } - } - // Interpolate creature movement - for (auto& [id, inst] : instances) { + // Interpolate creature movement if (inst.isMoving) { inst.moveElapsed += deltaTime; float t = inst.moveElapsed / inst.moveDuration; @@ -1475,23 +1497,14 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) { inst.isMoving = false; // Return to idle when movement completes if (inst.currentAnimationId == 4 || inst.currentAnimationId == 5) { - playAnimation(id, 0, true); + playAnimation(pair.first, 0, true); } } else { inst.position = glm::mix(inst.moveStart, inst.moveEnd, t); } } - } - // Only update animations for nearby characters (performance optimization) - // Collect instances that need bone recomputation, with distance-based throttling - std::vector> toUpdate; - toUpdate.reserve(instances.size()); - - for (auto& pair : instances) { - auto& inst = pair.second; - - // Skip weapon instances — their transforms are set by parent bones + // Skip weapon instances for animation — their transforms are set by parent bones if (inst.hasOverrideModelMatrix) continue; float distSq = glm::distance2(inst.position, cameraPos); @@ -1533,7 +1546,7 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) { // Thread bone matrix computation in chunks if (updatedCount >= 8 && numAnimThreads_ > 1) { static const size_t minAnimWorkPerThread = std::max( - 16, envSizeOrDefault("WOWEE_CHAR_ANIM_WORK_PER_THREAD", 64)); + 8, envSizeOrDefault("WOWEE_CHAR_ANIM_WORK_PER_THREAD", 16)); const size_t maxUsefulThreads = std::max( 1, (updatedCount + minAnimWorkPerThread - 1) / minAnimWorkPerThread); const size_t numThreads = std::min(static_cast(numAnimThreads_), maxUsefulThreads); @@ -1728,8 +1741,6 @@ void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) { size_t numBones = model.bones.size(); instance.boneMatrices.resize(numBones); - static bool dumpedOnce = false; - for (size_t i = 0; i < numBones; i++) { const auto& bone = model.bones[i]; @@ -1737,19 +1748,6 @@ void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) { // At rest this is identity, so no separate bind pose is needed glm::mat4 localTransform = getBoneTransform(bone, instance.animationTime, instance.currentSequenceIndex); - // Debug: dump first frame bone data - if (!dumpedOnce && i < 5) { - glm::vec3 t = interpolateVec3(bone.translation, instance.currentSequenceIndex, instance.animationTime, glm::vec3(0.0f)); - glm::quat r = interpolateQuat(bone.rotation, instance.currentSequenceIndex, instance.animationTime); - glm::vec3 s = interpolateVec3(bone.scale, instance.currentSequenceIndex, instance.animationTime, glm::vec3(1.0f)); - core::Logger::getInstance().info("Bone ", i, " parent=", bone.parentBone, - " pivot=(", bone.pivot.x, ",", bone.pivot.y, ",", bone.pivot.z, ")", - " t=(", t.x, ",", t.y, ",", t.z, ")", - " r=(", r.w, ",", r.x, ",", r.y, ",", r.z, ")", - " s=(", s.x, ",", s.y, ",", s.z, ")", - " seqIdx=", instance.currentSequenceIndex); - } - // Compose with parent if (bone.parentBone >= 0 && static_cast(bone.parentBone) < numBones) { instance.boneMatrices[i] = instance.boneMatrices[bone.parentBone] * localTransform; @@ -1757,12 +1755,6 @@ void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) { instance.boneMatrices[i] = localTransform; } } - if (!dumpedOnce) { - dumpedOnce = true; - // Dump final matrix for bone 0 - auto& m = instance.boneMatrices[0]; - core::Logger::getInstance().info("Bone 0 final matrix row0=(", m[0][0], ",", m[1][0], ",", m[2][0], ",", m[3][0], ")"); - } } glm::mat4 CharacterRenderer::getBoneTransform(const pipeline::M2Bone& bone, float time, int sequenceIndex) { @@ -1797,22 +1789,19 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, uint32_t frameIndex = vkCtx_->getCurrentFrame(); uint32_t frameSlot = frameIndex % 2u; - // Reset transient material allocations once per frame slot. - // beginFrame() waits on this slot's fence before recording. + // Reset material ring buffer and descriptor pool once per frame slot. if (lastMaterialPoolResetFrame_ != frameIndex) { - VmaAllocator alloc = vkCtx_->getAllocator(); - for (const auto& b : transientMaterialUbos_[frameSlot]) { - if (b.first) { - vmaDestroyBuffer(alloc, b.first, b.second); - } - } - transientMaterialUbos_[frameSlot].clear(); + materialRingOffset_[frameSlot] = 0; if (materialDescPools_[frameSlot]) { vkResetDescriptorPool(vkCtx_->getDevice(), materialDescPools_[frameSlot], 0); } lastMaterialPoolResetFrame_ = frameIndex; } + // Pre-compute aligned UBO stride for ring buffer sub-allocation + const uint32_t uboStride = (sizeof(CharMaterialUBO) + materialUboAlignment_ - 1) & ~(materialUboAlignment_ - 1); + const uint32_t ringCapacityBytes = uboStride * MATERIAL_RING_CAPACITY; + // Bind per-frame descriptor set (set 0) -- shared across all draws vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout_, 0, 1, &perFrameSet, 0, nullptr); @@ -2182,27 +2171,18 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, matData.heightMapVariance = batchHeightVariance; matData.normalMapStrength = normalMapStrength_; - // Create a small UBO for this batch's material - VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO}; - bci.size = sizeof(CharMaterialUBO); - bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - VmaAllocationCreateInfo aci{}; - aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; - aci.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT; - VmaAllocationInfo allocInfo{}; - ::VkBuffer matUBO = VK_NULL_HANDLE; - VmaAllocation matUBOAlloc = VK_NULL_HANDLE; - vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &matUBO, &matUBOAlloc, &allocInfo); - if (allocInfo.pMappedData) { - memcpy(allocInfo.pMappedData, &matData, sizeof(CharMaterialUBO)); - } + // Sub-allocate material UBO from ring buffer + uint32_t matOffset = materialRingOffset_[frameSlot]; + if (matOffset + uboStride > ringCapacityBytes) continue; // ring exhausted + memcpy(static_cast(materialRingMapped_[frameSlot]) + matOffset, &matData, sizeof(CharMaterialUBO)); + materialRingOffset_[frameSlot] = matOffset + uboStride; // Write descriptor set: binding 0 = texture, binding 1 = material UBO, binding 2 = normal/height map VkTexture* bindTex = (texPtr && texPtr->isValid()) ? texPtr : whiteTexture_.get(); VkDescriptorImageInfo imgInfo = bindTex->descriptorInfo(); VkDescriptorBufferInfo bufInfo{}; - bufInfo.buffer = matUBO; - bufInfo.offset = 0; + bufInfo.buffer = materialRingBuffer_[frameSlot]; + bufInfo.offset = matOffset; bufInfo.range = sizeof(CharMaterialUBO); VkDescriptorImageInfo nhImgInfo = normalMap->descriptorInfo(); @@ -2235,8 +2215,6 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, pipelineLayout_, 1, 1, &materialSet, 0, nullptr); vkCmdDrawIndexed(cmd, batch.indexCount, 1, batch.indexStart, 0, 0); - - transientMaterialUbos_[frameSlot].emplace_back(matUBO, matUBOAlloc); } } else { // Draw entire model with first texture @@ -2277,24 +2255,16 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, matData.heightMapVariance = 0.0f; matData.normalMapStrength = normalMapStrength_; - VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO}; - bci.size = sizeof(CharMaterialUBO); - bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - VmaAllocationCreateInfo aci{}; - aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; - aci.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT; - VmaAllocationInfo allocInfo{}; - ::VkBuffer matUBO = VK_NULL_HANDLE; - VmaAllocation matUBOAlloc = VK_NULL_HANDLE; - vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &matUBO, &matUBOAlloc, &allocInfo); - if (allocInfo.pMappedData) { - memcpy(allocInfo.pMappedData, &matData, sizeof(CharMaterialUBO)); - } + // Sub-allocate material UBO from ring buffer + uint32_t matOffset2 = materialRingOffset_[frameSlot]; + if (matOffset2 + uboStride > ringCapacityBytes) continue; // ring exhausted + memcpy(static_cast(materialRingMapped_[frameSlot]) + matOffset2, &matData, sizeof(CharMaterialUBO)); + materialRingOffset_[frameSlot] = matOffset2 + uboStride; VkDescriptorImageInfo imgInfo = texPtr->descriptorInfo(); VkDescriptorBufferInfo bufInfo{}; - bufInfo.buffer = matUBO; - bufInfo.offset = 0; + bufInfo.buffer = materialRingBuffer_[frameSlot]; + bufInfo.offset = matOffset2; bufInfo.range = sizeof(CharMaterialUBO); VkDescriptorImageInfo nhImgInfo2 = flatNormalTexture_->descriptorInfo(); @@ -2326,8 +2296,6 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, pipelineLayout_, 1, 1, &materialSet, 0, nullptr); vkCmdDrawIndexed(cmd, gpuModel.indexCount, 1, 0, 0, 0); - - transientMaterialUbos_[frameSlot].emplace_back(matUBO, matUBOAlloc); } } } diff --git a/src/rendering/renderer.cpp b/src/rendering/renderer.cpp index 5f3e48ae..69bfecdb 100644 --- a/src/rendering/renderer.cpp +++ b/src/rendering/renderer.cpp @@ -2527,7 +2527,13 @@ void Renderer::update(float deltaTime) { // Update terrain streaming if (terrainManager && camera) { + auto terrStart = std::chrono::steady_clock::now(); terrainManager->update(*camera, deltaTime); + float terrMs = std::chrono::duration( + std::chrono::steady_clock::now() - terrStart).count(); + if (terrMs > 5.0f) { + LOG_WARNING("SLOW terrainManager->update: ", terrMs, "ms"); + } } // Update sky system (skybox time, star twinkle, clouds, celestial moon phases) @@ -2579,7 +2585,14 @@ void Renderer::update(float deltaTime) { // Update character animations if (characterRenderer && camera) { + auto charAnimStart = std::chrono::steady_clock::now(); characterRenderer->update(deltaTime, camera->getPosition()); + float charAnimMs = std::chrono::duration( + std::chrono::steady_clock::now() - charAnimStart).count(); + if (charAnimMs > 5.0f) { + LOG_WARNING("SLOW characterRenderer->update: ", charAnimMs, "ms (", + characterRenderer->getInstanceCount(), " instances)"); + } } // Update AudioEngine (cleanup finished sounds, etc.) diff --git a/src/rendering/terrain_manager.cpp b/src/rendering/terrain_manager.cpp index 3eb1ba1c..20a2e9a1 100644 --- a/src/rendering/terrain_manager.cpp +++ b/src/rendering/terrain_manager.cpp @@ -1082,7 +1082,7 @@ void TerrainManager::workerLoop() { void TerrainManager::processReadyTiles() { // Process tiles with time budget to avoid frame spikes // Taxi mode gets a slightly larger budget to avoid visible late-pop terrain/models. - const float timeBudgetMs = taxiStreamingMode_ ? 8.0f : 5.0f; + const float timeBudgetMs = taxiStreamingMode_ ? 8.0f : 3.0f; auto startTime = std::chrono::high_resolution_clock::now(); // Move newly ready tiles into the finalizing deque.