From 02cf0e4df381e33b2ac04c32a3095219effa4235 Mon Sep 17 00:00:00 2001 From: Kelsi Date: Sat, 7 Mar 2026 18:40:24 -0800 Subject: [PATCH] Background normal map generation, queue-draining load screen warmup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Normal map CPU work (luminance→blur→Sobel) moved to background threads, main thread only does GPU upload (~1-2ms vs 15-22ms per texture) - Load screen warmup now waits until ALL spawn/equipment/gameobject queues are drained before transitioning (prevents naked character, NPC pop-in) - Exit condition: min 2s + 5 consecutive empty iterations, hard cap 15s - Equipment queue processes 8 items per warmup iteration instead of 1 - Added LoadingScreen::renderOverlay() for future world-behind-loading use --- include/rendering/character_renderer.hpp | 20 +++- include/rendering/loading_screen.hpp | 4 + src/core/application.cpp | 64 ++++++++--- src/rendering/character_renderer.cpp | 130 ++++++++++++++++------- src/rendering/loading_screen.cpp | 60 +++++++++++ 5 files changed, 218 insertions(+), 60 deletions(-) diff --git a/include/rendering/character_renderer.hpp b/include/rendering/character_renderer.hpp index 83cb3e7f..c4676008 100644 --- a/include/rendering/character_renderer.hpp +++ b/include/rendering/character_renderer.hpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include namespace wowee { namespace pipeline { class AssetManager; } @@ -304,15 +306,23 @@ private: std::unique_ptr generateNormalHeightMap( const uint8_t* pixels, uint32_t width, uint32_t height, float& outVariance); - // Deferred normal map generation — avoids stalling loadModel - struct PendingNormalMap { + // Background normal map generation — CPU work on detached background threads (one per texture), GPU upload on main thread + struct NormalMapResult { std::string cacheKey; - std::vector pixels; // RGBA pixel data + std::vector pixels; // RGBA normal map output uint32_t width, height; + float variance; }; - std::deque 
pendingNormalMaps_; + // Completed results ready for GPU upload (populated by background threads) + std::mutex normalMapResultsMutex_; + std::deque completedNormalMaps_; + std::atomic pendingNormalMapCount_{0}; // in-flight background tasks + + // Pure CPU normal map generation (thread-safe, no GPU access) + static NormalMapResult generateNormalHeightMapCPU( + std::string cacheKey, std::vector pixels, uint32_t width, uint32_t height); public: - void processPendingNormalMaps(int budget = 2); + void processPendingNormalMaps(int budget = 4); private: // Normal mapping / POM settings diff --git a/include/rendering/loading_screen.hpp b/include/rendering/loading_screen.hpp index 5f119676..afd134b9 100644 --- a/include/rendering/loading_screen.hpp +++ b/include/rendering/loading_screen.hpp @@ -24,6 +24,10 @@ public: // Render the loading screen with progress bar and status text (pure ImGui) void render(); + // Draw loading screen as ImGui overlay (call within an existing ImGui frame). + // Used during warmup to overlay loading screen on top of the rendered world. + void renderOverlay(); + void setProgress(float progress) { loadProgress = progress; } void setStatus(const std::string& status) { statusText = status; } diff --git a/src/core/application.cpp b/src/core/application.cpp index 23b2c15c..300bffc7 100644 --- a/src/core/application.cpp +++ b/src/core/application.cpp @@ -49,9 +49,9 @@ #include // GL/glew.h removed — Vulkan migration Phase 1 #include +#include #include #include -#include #include #include #include @@ -922,9 +922,9 @@ void Application::update(float deltaTime) { auto t3 = std::chrono::steady_clock::now(); processDeferredEquipmentQueue(); auto t4 = std::chrono::steady_clock::now(); - // Process deferred normal maps (2 per frame to spread CPU cost) + // Upload completed normal maps from background threads (~1-2ms each GPU upload) if (auto* cr = renderer ? 
renderer->getCharacterRenderer() : nullptr) { - cr->processPendingNormalMaps(2); + cr->processPendingNormalMaps(4); } auto t5 = std::chrono::steady_clock::now(); float pMs = std::chrono::duration(t1 - t0).count(); @@ -4167,11 +4167,17 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float }); } - // Hide first-login hitch by draining initial world packets/spawn queues before - // dropping the loading screen. Keep this bounded so we don't stall indefinitely. + // Keep the loading screen visible until all spawn/equipment/gameobject queues + // are fully drained. This ensures the player sees a fully populated world + // (character clothed, NPCs placed, game objects loaded) when the screen drops. { - const float kWarmupMaxSeconds = 2.5f; + const float kMinWarmupSeconds = 2.0f; // minimum time to drain network packets + const float kMaxWarmupSeconds = 15.0f; // hard cap to avoid infinite stall const auto warmupStart = std::chrono::high_resolution_clock::now(); + // Track consecutive idle iterations (all queues empty) to detect convergence + int idleIterations = 0; + const int kIdleThreshold = 5; // require 5 consecutive empty loops (~80ms) + while (true) { SDL_Event event; while (SDL_PollEvent(&event)) { @@ -4185,7 +4191,6 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float int w = event.window.data1; int h = event.window.data2; window->setSize(w, h); - // Vulkan viewport set in command buffer if (renderer && renderer->getCamera()) { renderer->getCamera()->setAspectRatio(static_cast(w) / h); } @@ -4208,14 +4213,17 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float // During load screen warmup: lift per-frame budgets so GPU uploads // and spawns happen in bulk while the loading screen is still visible. 
- processCreatureSpawnQueue(true); // unlimited: no model upload cap, no time budget + processCreatureSpawnQueue(true); processAsyncNpcCompositeResults(); - processDeferredEquipmentQueue(); + // Process equipment queue more aggressively during warmup (multiple per iteration) + for (int i = 0; i < 8 && (!deferredEquipmentQueue_.empty() || !asyncEquipmentLoads_.empty()); i++) { + processDeferredEquipmentQueue(); + } if (auto* cr = renderer ? renderer->getCharacterRenderer() : nullptr) { - cr->processPendingNormalMaps(10); // higher budget during load screen + cr->processPendingNormalMaps(INT_MAX); } - // Process ALL pending game object spawns (no 1-per-frame cap during load screen). + // Process ALL pending game object spawns. while (!pendingGameObjectSpawns_.empty()) { auto& s = pendingGameObjectSpawns_.front(); spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation); @@ -4226,14 +4234,42 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float processPendingMount(); updateQuestMarkers(); + // Update renderer (terrain streaming, animations) + if (renderer) { + renderer->update(1.0f / 60.0f); + } + const auto now = std::chrono::high_resolution_clock::now(); const float elapsed = std::chrono::duration(now - warmupStart).count(); - const float t = std::clamp(elapsed / kWarmupMaxSeconds, 0.0f, 1.0f); - showProgress("Finalizing world sync...", 0.97f + t * 0.025f); - if (elapsed >= kWarmupMaxSeconds) { + // Check if all queues are drained + bool queuesEmpty = + pendingCreatureSpawns_.empty() && + asyncCreatureLoads_.empty() && + asyncNpcCompositeLoads_.empty() && + deferredEquipmentQueue_.empty() && + asyncEquipmentLoads_.empty() && + pendingGameObjectSpawns_.empty() && + asyncGameObjectLoads_.empty() && + pendingPlayerSpawns_.empty(); + + if (queuesEmpty) { + idleIterations++; + } else { + idleIterations = 0; + } + + // Exit when: (min time passed AND queues drained for several iterations) OR hard cap + bool 
readyToExit = (elapsed >= kMinWarmupSeconds && idleIterations >= kIdleThreshold); + if (readyToExit || elapsed >= kMaxWarmupSeconds) { + if (elapsed >= kMaxWarmupSeconds) { + LOG_WARNING("Warmup hit hard cap (", kMaxWarmupSeconds, "s), entering world with pending work"); + } break; } + + const float t = std::clamp(elapsed / kMaxWarmupSeconds, 0.0f, 1.0f); + showProgress("Finalizing world sync...", 0.97f + t * 0.025f); SDL_Delay(16); } } diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp index baaaf3e6..9607f755 100644 --- a/src/rendering/character_renderer.cpp +++ b/src/rendering/character_renderer.cpp @@ -332,6 +332,11 @@ void CharacterRenderer::shutdown() { LOG_INFO("CharacterRenderer::shutdown instances=", instances.size(), " models=", models.size(), " override=", (void*)renderPassOverride_); + // Wait for any in-flight background normal map generation threads + while (pendingNormalMapCount_.load(std::memory_order_relaxed) > 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + vkDeviceWaitIdle(vkCtx_->getDevice()); VkDevice device = vkCtx_->getDevice(); VmaAllocator alloc = vkCtx_->getAllocator(); @@ -413,6 +418,16 @@ void CharacterRenderer::clear() { LOG_INFO("CharacterRenderer::clear instances=", instances.size(), " models=", models.size()); + // Wait for any in-flight background normal map generation threads + while (pendingNormalMapCount_.load(std::memory_order_relaxed) > 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + // Discard any completed results that haven't been uploaded + { + std::lock_guard lock(normalMapResultsMutex_); + completedNormalMaps_.clear(); + } + vkDeviceWaitIdle(vkCtx_->getDevice()); VkDevice device = vkCtx_->getDevice(); @@ -509,7 +524,32 @@ std::unique_ptr CharacterRenderer::generateNormalHeightMap( const uint8_t* pixels, uint32_t width, uint32_t height, float& outVariance) { if (!vkCtx_ || width == 0 || height == 0) return nullptr; + // Use the CPU-only 
static method, then upload to GPU + std::vector dummy(width * height * 4); + std::memcpy(dummy.data(), pixels, dummy.size()); + auto result = generateNormalHeightMapCPU("", std::move(dummy), width, height); + outVariance = result.variance; + + auto tex = std::make_unique(); + if (!tex->upload(*vkCtx_, result.pixels.data(), width, height, VK_FORMAT_R8G8B8A8_UNORM, true)) { + return nullptr; + } + tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR, + VK_SAMPLER_ADDRESS_MODE_REPEAT); + return tex; +} + +// Static, thread-safe CPU-only normal map generation (no GPU access) +CharacterRenderer::NormalMapResult CharacterRenderer::generateNormalHeightMapCPU( + std::string cacheKey, std::vector srcPixels, uint32_t width, uint32_t height) { + NormalMapResult result; + result.cacheKey = std::move(cacheKey); + result.width = width; + result.height = height; + result.variance = 0.0f; + const uint32_t totalPixels = width * height; + const uint8_t* pixels = srcPixels.data(); // Step 1: Compute height from luminance std::vector heightMap(totalPixels); @@ -524,7 +564,7 @@ std::unique_ptr CharacterRenderer::generateNormalHeightMap( sumH2 += h * h; } double mean = sumH / totalPixels; - outVariance = static_cast(sumH2 / totalPixels - mean * mean); + result.variance = static_cast(sumH2 / totalPixels - mean * mean); // Step 1.5: Box blur the height map to reduce noise from diffuse textures auto wrapSample = [&](const std::vector& map, int x, int y) -> float { @@ -545,11 +585,9 @@ std::unique_ptr CharacterRenderer::generateNormalHeightMap( } } - // Step 2: Sobel 3x3 → normal map (crisp detail from original, blurred for POM alpha) - // Higher strength than WMO (2.0) because character/weapon textures are hand-painted - // with baked-in lighting that produces low-contrast gradients in the Sobel filter. 
+ // Step 2: Sobel 3x3 → normal map const float strength = 5.0f; - std::vector output(totalPixels * 4); + result.pixels.resize(totalPixels * 4); auto sampleH = [&](int x, int y) -> float { x = ((x % (int)width) + (int)width) % (int)width; @@ -573,20 +611,14 @@ std::unique_ptr CharacterRenderer::generateNormalHeightMap( if (len > 0.0f) { nx /= len; ny /= len; nz /= len; } uint32_t idx = (y * width + x) * 4; - output[idx + 0] = static_cast(std::clamp((nx * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f)); - output[idx + 1] = static_cast(std::clamp((ny * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f)); - output[idx + 2] = static_cast(std::clamp((nz * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f)); - output[idx + 3] = static_cast(std::clamp(blurredHeight[y * width + x] * 255.0f, 0.0f, 255.0f)); + result.pixels[idx + 0] = static_cast(std::clamp((nx * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f)); + result.pixels[idx + 1] = static_cast(std::clamp((ny * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f)); + result.pixels[idx + 2] = static_cast(std::clamp((nz * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f)); + result.pixels[idx + 3] = static_cast(std::clamp(blurredHeight[y * width + x] * 255.0f, 0.0f, 255.0f)); } } - auto tex = std::make_unique(); - if (!tex->upload(*vkCtx_, output.data(), width, height, VK_FORMAT_R8G8B8A8_UNORM, true)) { - return nullptr; - } - tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR, - VK_SAMPLER_ADDRESS_MODE_REPEAT); - return tex; + return result; } VkTexture* CharacterRenderer::loadTexture(const std::string& path) { @@ -687,15 +719,22 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) { e.hasAlpha = hasAlpha; e.colorKeyBlack = colorKeyBlackHint; - // Defer normal/height map generation to avoid stalling loadModel. - // Normal maps are generated in processPendingNormalMaps() at a per-frame budget. 
+ // Launch normal map generation on background thread — CPU work is pure compute, + // only the GPU upload (in processPendingNormalMaps) needs the main thread (~1-2ms). if (blpImage.width >= 32 && blpImage.height >= 32) { - PendingNormalMap pending; - pending.cacheKey = key; - pending.pixels.assign(blpImage.data.begin(), blpImage.data.end()); - pending.width = blpImage.width; - pending.height = blpImage.height; - pendingNormalMaps_.push_back(std::move(pending)); + uint32_t w = blpImage.width, h = blpImage.height; + std::string ck = key; + std::vector px(blpImage.data.begin(), blpImage.data.end()); + pendingNormalMapCount_.fetch_add(1, std::memory_order_relaxed); + auto* self = this; + std::thread([self, ck = std::move(ck), px = std::move(px), w, h]() mutable { + auto result = generateNormalHeightMapCPU(std::move(ck), std::move(px), w, h); + { + std::lock_guard lock(self->normalMapResultsMutex_); + self->completedNormalMaps_.push_back(std::move(result)); + } + self->pendingNormalMapCount_.fetch_sub(1, std::memory_order_relaxed); + }).detach(); e.normalMapPending = true; } @@ -709,30 +748,39 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) { } void CharacterRenderer::processPendingNormalMaps(int budget) { - if (pendingNormalMaps_.empty() || !vkCtx_) return; + if (!vkCtx_) return; - int processed = 0; - while (!pendingNormalMaps_.empty() && processed < budget) { - auto pending = std::move(pendingNormalMaps_.front()); - pendingNormalMaps_.pop_front(); + // Collect completed results from background threads + std::deque ready; + { + std::lock_guard lock(normalMapResultsMutex_); + if (completedNormalMaps_.empty()) return; + int count = std::min(budget, static_cast(completedNormalMaps_.size())); + for (int i = 0; i < count; i++) { + ready.push_back(std::move(completedNormalMaps_.front())); + completedNormalMaps_.pop_front(); + } + } - auto it = textureCache.find(pending.cacheKey); + // GPU upload only (~1-2ms each) — CPU work already done on 
background thread + for (auto& result : ready) { + auto it = textureCache.find(result.cacheKey); if (it == textureCache.end()) continue; // texture was evicted - float nhVariance = 0.0f; vkCtx_->beginUploadBatch(); - auto nhMap = generateNormalHeightMap(pending.pixels.data(), - pending.width, pending.height, nhVariance); - vkCtx_->endUploadBatch(); - - if (nhMap) { - it->second.heightMapVariance = nhVariance; - it->second.approxBytes += approxTextureBytesWithMips(pending.width, pending.height); - textureCacheBytes_ += approxTextureBytesWithMips(pending.width, pending.height); - it->second.normalHeightMap = std::move(nhMap); + auto tex = std::make_unique(); + bool ok = tex->upload(*vkCtx_, result.pixels.data(), result.width, result.height, + VK_FORMAT_R8G8B8A8_UNORM, true); + if (ok) { + tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR, + VK_SAMPLER_ADDRESS_MODE_REPEAT); + it->second.heightMapVariance = result.variance; + it->second.approxBytes += approxTextureBytesWithMips(result.width, result.height); + textureCacheBytes_ += approxTextureBytesWithMips(result.width, result.height); + it->second.normalHeightMap = std::move(tex); } + vkCtx_->endUploadBatch(); it->second.normalMapPending = false; - processed++; } } diff --git a/src/rendering/loading_screen.cpp b/src/rendering/loading_screen.cpp index 34ad1aa6..a2e83a2b 100644 --- a/src/rendering/loading_screen.cpp +++ b/src/rendering/loading_screen.cpp @@ -240,6 +240,66 @@ bool LoadingScreen::loadImage(const std::string& path) { return true; } +void LoadingScreen::renderOverlay() { + // Draw loading screen content as ImGui overlay within an existing ImGui frame. + // Caller is responsible for ImGui NewFrame/Render and Vulkan frame management. 
+ ImGuiIO& io = ImGui::GetIO(); + float screenW = io.DisplaySize.x; + float screenH = io.DisplaySize.y; + + ImGui::SetNextWindowPos(ImVec2(0, 0)); + ImGui::SetNextWindowSize(ImVec2(screenW, screenH)); + ImGui::Begin("##LoadingScreenOverlay", nullptr, + ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoResize | + ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoScrollbar | + ImGuiWindowFlags_NoInputs | ImGuiWindowFlags_NoBackground | + ImGuiWindowFlags_NoBringToFrontOnFocus); + + if (bgDescriptorSet) { + ImGui::GetWindowDrawList()->AddImage( + reinterpret_cast(bgDescriptorSet), + ImVec2(0, 0), ImVec2(screenW, screenH)); + } + + // Progress bar + { + const float barWidthFrac = 0.6f; + const float barHeight = 6.0f; + const float barY = screenH * 0.06f; + float barX = screenW * (0.5f - barWidthFrac * 0.5f); + float barW = screenW * barWidthFrac; + ImDrawList* drawList = ImGui::GetWindowDrawList(); + drawList->AddRectFilled(ImVec2(barX, barY), ImVec2(barX + barW, barY + barHeight), + IM_COL32(25, 25, 25, 200), 2.0f); + if (loadProgress > 0.001f) { + drawList->AddRectFilled(ImVec2(barX, barY), ImVec2(barX + barW * loadProgress, barY + barHeight), + IM_COL32(199, 156, 33, 255), 2.0f); + } + drawList->AddRect(ImVec2(barX - 1, barY - 1), ImVec2(barX + barW + 1, barY + barHeight + 1), + IM_COL32(140, 110, 25, 255), 2.0f); + } + + // Percentage text + { + char pctBuf[32]; + snprintf(pctBuf, sizeof(pctBuf), "%d%%", static_cast(loadProgress * 100.0f)); + float textY = screenH * 0.06f - 20.0f; + ImVec2 pctSize = ImGui::CalcTextSize(pctBuf); + ImGui::SetCursorPos(ImVec2((screenW - pctSize.x) * 0.5f, textY)); + ImGui::TextColored(ImVec4(0.0f, 0.0f, 0.0f, 1.0f), "%s", pctBuf); + } + + // Status text + { + float statusY = screenH * 0.06f + 14.0f; + ImVec2 statusSize = ImGui::CalcTextSize(statusText.c_str()); + ImGui::SetCursorPos(ImVec2((screenW - statusSize.x) * 0.5f, statusY)); + ImGui::TextColored(ImVec4(0.0f, 0.0f, 0.0f, 1.0f), "%s", statusText.c_str()); + } + + ImGui::End(); +} + 
void LoadingScreen::render() { // If a frame is already in progress (e.g. called from a UI callback), // end it before starting our own