Background normal map generation, queue-draining load screen warmup

- Normal map CPU work (luminance→blur→Sobel) moved to background threads,
  main thread only does GPU upload (~1-2ms vs 15-22ms per texture)
- Load screen warmup now waits until ALL spawn/equipment/gameobject queues
  are drained before transitioning (prevents naked character, NPC pop-in)
- Exit condition: at least 2s elapsed AND 5 consecutive iterations with all queues empty; hard cap of 15s
- Equipment queue processes 8 items per warmup iteration instead of 1
- Added LoadingScreen::renderOverlay() for future use when the world is rendered behind the loading screen
This commit is contained in:
Kelsi 2026-03-07 18:40:24 -08:00
parent 63efac9fa6
commit 02cf0e4df3
5 changed files with 218 additions and 60 deletions

View file

@ -13,6 +13,8 @@
#include <utility>
#include <future>
#include <deque>
#include <mutex>
#include <atomic>
namespace wowee {
namespace pipeline { class AssetManager; }
@ -304,15 +306,23 @@ private:
std::unique_ptr<VkTexture> generateNormalHeightMap(
const uint8_t* pixels, uint32_t width, uint32_t height, float& outVariance);
// Deferred normal map generation — avoids stalling loadModel
struct PendingNormalMap {
// Background normal map generation — CPU work on thread pool, GPU upload on main thread
struct NormalMapResult {
std::string cacheKey;
std::vector<uint8_t> pixels; // RGBA pixel data
std::vector<uint8_t> pixels; // RGBA normal map output
uint32_t width, height;
float variance;
};
std::deque<PendingNormalMap> pendingNormalMaps_;
// Completed results ready for GPU upload (populated by background threads)
std::mutex normalMapResultsMutex_;
std::deque<NormalMapResult> completedNormalMaps_;
std::atomic<int> pendingNormalMapCount_{0}; // in-flight background tasks
// Pure CPU normal map generation (thread-safe, no GPU access)
static NormalMapResult generateNormalHeightMapCPU(
std::string cacheKey, std::vector<uint8_t> pixels, uint32_t width, uint32_t height);
public:
void processPendingNormalMaps(int budget = 2);
void processPendingNormalMaps(int budget = 4);
private:
// Normal mapping / POM settings

View file

@ -24,6 +24,10 @@ public:
// Render the loading screen with progress bar and status text (pure ImGui)
void render();
// Draw loading screen as ImGui overlay (call within an existing ImGui frame).
// Used during warmup to overlay loading screen on top of the rendered world.
void renderOverlay();
void setProgress(float progress) { loadProgress = progress; }
void setStatus(const std::string& status) { statusText = status; }

View file

@ -49,9 +49,9 @@
#include <SDL2/SDL.h>
// GL/glew.h removed — Vulkan migration Phase 1
#include <cstdlib>
#include <climits>
#include <algorithm>
#include <cctype>
#include <cctype>
#include <optional>
#include <sstream>
#include <set>
@ -922,9 +922,9 @@ void Application::update(float deltaTime) {
auto t3 = std::chrono::steady_clock::now();
processDeferredEquipmentQueue();
auto t4 = std::chrono::steady_clock::now();
// Process deferred normal maps (2 per frame to spread CPU cost)
// Upload completed normal maps from background threads (~1-2ms each GPU upload)
if (auto* cr = renderer ? renderer->getCharacterRenderer() : nullptr) {
cr->processPendingNormalMaps(2);
cr->processPendingNormalMaps(4);
}
auto t5 = std::chrono::steady_clock::now();
float pMs = std::chrono::duration<float, std::milli>(t1 - t0).count();
@ -4167,11 +4167,17 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
});
}
// Hide first-login hitch by draining initial world packets/spawn queues before
// dropping the loading screen. Keep this bounded so we don't stall indefinitely.
// Keep the loading screen visible until all spawn/equipment/gameobject queues
// are fully drained. This ensures the player sees a fully populated world
// (character clothed, NPCs placed, game objects loaded) when the screen drops.
{
const float kWarmupMaxSeconds = 2.5f;
const float kMinWarmupSeconds = 2.0f; // minimum time to drain network packets
const float kMaxWarmupSeconds = 15.0f; // hard cap to avoid infinite stall
const auto warmupStart = std::chrono::high_resolution_clock::now();
// Track consecutive idle iterations (all queues empty) to detect convergence
int idleIterations = 0;
const int kIdleThreshold = 5; // require 5 consecutive empty loops (~80ms)
while (true) {
SDL_Event event;
while (SDL_PollEvent(&event)) {
@ -4185,7 +4191,6 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
int w = event.window.data1;
int h = event.window.data2;
window->setSize(w, h);
// Vulkan viewport set in command buffer
if (renderer && renderer->getCamera()) {
renderer->getCamera()->setAspectRatio(static_cast<float>(w) / h);
}
@ -4208,14 +4213,17 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
// During load screen warmup: lift per-frame budgets so GPU uploads
// and spawns happen in bulk while the loading screen is still visible.
processCreatureSpawnQueue(true); // unlimited: no model upload cap, no time budget
processCreatureSpawnQueue(true);
processAsyncNpcCompositeResults();
processDeferredEquipmentQueue();
// Process equipment queue more aggressively during warmup (multiple per iteration)
for (int i = 0; i < 8 && (!deferredEquipmentQueue_.empty() || !asyncEquipmentLoads_.empty()); i++) {
processDeferredEquipmentQueue();
}
if (auto* cr = renderer ? renderer->getCharacterRenderer() : nullptr) {
cr->processPendingNormalMaps(10); // higher budget during load screen
cr->processPendingNormalMaps(INT_MAX);
}
// Process ALL pending game object spawns (no 1-per-frame cap during load screen).
// Process ALL pending game object spawns.
while (!pendingGameObjectSpawns_.empty()) {
auto& s = pendingGameObjectSpawns_.front();
spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
@ -4226,14 +4234,42 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
processPendingMount();
updateQuestMarkers();
// Update renderer (terrain streaming, animations)
if (renderer) {
renderer->update(1.0f / 60.0f);
}
const auto now = std::chrono::high_resolution_clock::now();
const float elapsed = std::chrono::duration<float>(now - warmupStart).count();
const float t = std::clamp(elapsed / kWarmupMaxSeconds, 0.0f, 1.0f);
showProgress("Finalizing world sync...", 0.97f + t * 0.025f);
if (elapsed >= kWarmupMaxSeconds) {
// Check if all queues are drained
bool queuesEmpty =
pendingCreatureSpawns_.empty() &&
asyncCreatureLoads_.empty() &&
asyncNpcCompositeLoads_.empty() &&
deferredEquipmentQueue_.empty() &&
asyncEquipmentLoads_.empty() &&
pendingGameObjectSpawns_.empty() &&
asyncGameObjectLoads_.empty() &&
pendingPlayerSpawns_.empty();
if (queuesEmpty) {
idleIterations++;
} else {
idleIterations = 0;
}
// Exit when: (min time passed AND queues drained for several iterations) OR hard cap
bool readyToExit = (elapsed >= kMinWarmupSeconds && idleIterations >= kIdleThreshold);
if (readyToExit || elapsed >= kMaxWarmupSeconds) {
if (elapsed >= kMaxWarmupSeconds) {
LOG_WARNING("Warmup hit hard cap (", kMaxWarmupSeconds, "s), entering world with pending work");
}
break;
}
const float t = std::clamp(elapsed / kMaxWarmupSeconds, 0.0f, 1.0f);
showProgress("Finalizing world sync...", 0.97f + t * 0.025f);
SDL_Delay(16);
}
}

View file

@ -332,6 +332,11 @@ void CharacterRenderer::shutdown() {
LOG_INFO("CharacterRenderer::shutdown instances=", instances.size(),
" models=", models.size(), " override=", (void*)renderPassOverride_);
// Wait for any in-flight background normal map generation threads
while (pendingNormalMapCount_.load(std::memory_order_relaxed) > 0) {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
vkDeviceWaitIdle(vkCtx_->getDevice());
VkDevice device = vkCtx_->getDevice();
VmaAllocator alloc = vkCtx_->getAllocator();
@ -413,6 +418,16 @@ void CharacterRenderer::clear() {
LOG_INFO("CharacterRenderer::clear instances=", instances.size(),
" models=", models.size());
// Wait for any in-flight background normal map generation threads
while (pendingNormalMapCount_.load(std::memory_order_relaxed) > 0) {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
// Discard any completed results that haven't been uploaded
{
std::lock_guard<std::mutex> lock(normalMapResultsMutex_);
completedNormalMaps_.clear();
}
vkDeviceWaitIdle(vkCtx_->getDevice());
VkDevice device = vkCtx_->getDevice();
@ -509,7 +524,32 @@ std::unique_ptr<VkTexture> CharacterRenderer::generateNormalHeightMap(
const uint8_t* pixels, uint32_t width, uint32_t height, float& outVariance) {
    // Synchronous normal/height map generation: runs the CPU stage via
    // generateNormalHeightMapCPU() and uploads the result to the GPU right away.
    //
    // pixels      - source image, read as width * height * 4 bytes (RGBA8)
    // width/height- source dimensions in pixels; zero in either yields nullptr
    // outVariance - receives the variance reported by the CPU stage; left
    //               untouched when the early-out below is taken
    // Returns the uploaded texture with a linear/repeat sampler, or nullptr if
    // there is no Vulkan context, the input is unusable, or the upload fails.
    if (!vkCtx_ || !pixels || width == 0 || height == 0) return nullptr;

    // Compute the byte count in size_t so that width * height * 4 cannot wrap
    // in 32-bit arithmetic, and copy the source once instead of zero-filling a
    // buffer and then overwriting it.
    const size_t byteCount = static_cast<size_t>(width) * height * 4;
    std::vector<uint8_t> source(pixels, pixels + byteCount);

    // The cache key is only meaningful on the background path, so pass none.
    auto result = generateNormalHeightMapCPU("", std::move(source), width, height);
    outVariance = result.variance;

    auto tex = std::make_unique<VkTexture>();
    if (!tex->upload(*vkCtx_, result.pixels.data(), width, height, VK_FORMAT_R8G8B8A8_UNORM, true)) {
        return nullptr;
    }
    tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
                       VK_SAMPLER_ADDRESS_MODE_REPEAT);
    return tex;
}
// Static, thread-safe CPU-only normal map generation (no GPU access)
CharacterRenderer::NormalMapResult CharacterRenderer::generateNormalHeightMapCPU(
std::string cacheKey, std::vector<uint8_t> srcPixels, uint32_t width, uint32_t height) {
NormalMapResult result;
result.cacheKey = std::move(cacheKey);
result.width = width;
result.height = height;
result.variance = 0.0f;
const uint32_t totalPixels = width * height;
const uint8_t* pixels = srcPixels.data();
// Step 1: Compute height from luminance
std::vector<float> heightMap(totalPixels);
@ -524,7 +564,7 @@ std::unique_ptr<VkTexture> CharacterRenderer::generateNormalHeightMap(
sumH2 += h * h;
}
double mean = sumH / totalPixels;
outVariance = static_cast<float>(sumH2 / totalPixels - mean * mean);
result.variance = static_cast<float>(sumH2 / totalPixels - mean * mean);
// Step 1.5: Box blur the height map to reduce noise from diffuse textures
auto wrapSample = [&](const std::vector<float>& map, int x, int y) -> float {
@ -545,11 +585,9 @@ std::unique_ptr<VkTexture> CharacterRenderer::generateNormalHeightMap(
}
}
// Step 2: Sobel 3x3 → normal map (crisp detail from original, blurred for POM alpha)
// Higher strength than WMO (2.0) because character/weapon textures are hand-painted
// with baked-in lighting that produces low-contrast gradients in the Sobel filter.
// Step 2: Sobel 3x3 → normal map
const float strength = 5.0f;
std::vector<uint8_t> output(totalPixels * 4);
result.pixels.resize(totalPixels * 4);
auto sampleH = [&](int x, int y) -> float {
x = ((x % (int)width) + (int)width) % (int)width;
@ -573,20 +611,14 @@ std::unique_ptr<VkTexture> CharacterRenderer::generateNormalHeightMap(
if (len > 0.0f) { nx /= len; ny /= len; nz /= len; }
uint32_t idx = (y * width + x) * 4;
output[idx + 0] = static_cast<uint8_t>(std::clamp((nx * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
output[idx + 1] = static_cast<uint8_t>(std::clamp((ny * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
output[idx + 2] = static_cast<uint8_t>(std::clamp((nz * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
output[idx + 3] = static_cast<uint8_t>(std::clamp(blurredHeight[y * width + x] * 255.0f, 0.0f, 255.0f));
result.pixels[idx + 0] = static_cast<uint8_t>(std::clamp((nx * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
result.pixels[idx + 1] = static_cast<uint8_t>(std::clamp((ny * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
result.pixels[idx + 2] = static_cast<uint8_t>(std::clamp((nz * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
result.pixels[idx + 3] = static_cast<uint8_t>(std::clamp(blurredHeight[y * width + x] * 255.0f, 0.0f, 255.0f));
}
}
auto tex = std::make_unique<VkTexture>();
if (!tex->upload(*vkCtx_, output.data(), width, height, VK_FORMAT_R8G8B8A8_UNORM, true)) {
return nullptr;
}
tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
VK_SAMPLER_ADDRESS_MODE_REPEAT);
return tex;
return result;
}
VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
@ -687,15 +719,22 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
e.hasAlpha = hasAlpha;
e.colorKeyBlack = colorKeyBlackHint;
// Defer normal/height map generation to avoid stalling loadModel.
// Normal maps are generated in processPendingNormalMaps() at a per-frame budget.
// Launch normal map generation on background thread — CPU work is pure compute,
// only the GPU upload (in processPendingNormalMaps) needs the main thread (~1-2ms).
if (blpImage.width >= 32 && blpImage.height >= 32) {
PendingNormalMap pending;
pending.cacheKey = key;
pending.pixels.assign(blpImage.data.begin(), blpImage.data.end());
pending.width = blpImage.width;
pending.height = blpImage.height;
pendingNormalMaps_.push_back(std::move(pending));
uint32_t w = blpImage.width, h = blpImage.height;
std::string ck = key;
std::vector<uint8_t> px(blpImage.data.begin(), blpImage.data.end());
pendingNormalMapCount_.fetch_add(1, std::memory_order_relaxed);
auto* self = this;
std::thread([self, ck = std::move(ck), px = std::move(px), w, h]() mutable {
auto result = generateNormalHeightMapCPU(std::move(ck), std::move(px), w, h);
{
std::lock_guard<std::mutex> lock(self->normalMapResultsMutex_);
self->completedNormalMaps_.push_back(std::move(result));
}
self->pendingNormalMapCount_.fetch_sub(1, std::memory_order_relaxed);
}).detach();
e.normalMapPending = true;
}
@ -709,30 +748,39 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
}
void CharacterRenderer::processPendingNormalMaps(int budget) {
if (pendingNormalMaps_.empty() || !vkCtx_) return;
if (!vkCtx_) return;
int processed = 0;
while (!pendingNormalMaps_.empty() && processed < budget) {
auto pending = std::move(pendingNormalMaps_.front());
pendingNormalMaps_.pop_front();
// Collect completed results from background threads
std::deque<NormalMapResult> ready;
{
std::lock_guard<std::mutex> lock(normalMapResultsMutex_);
if (completedNormalMaps_.empty()) return;
int count = std::min(budget, static_cast<int>(completedNormalMaps_.size()));
for (int i = 0; i < count; i++) {
ready.push_back(std::move(completedNormalMaps_.front()));
completedNormalMaps_.pop_front();
}
}
auto it = textureCache.find(pending.cacheKey);
// GPU upload only (~1-2ms each) — CPU work already done on background thread
for (auto& result : ready) {
auto it = textureCache.find(result.cacheKey);
if (it == textureCache.end()) continue; // texture was evicted
float nhVariance = 0.0f;
vkCtx_->beginUploadBatch();
auto nhMap = generateNormalHeightMap(pending.pixels.data(),
pending.width, pending.height, nhVariance);
vkCtx_->endUploadBatch();
if (nhMap) {
it->second.heightMapVariance = nhVariance;
it->second.approxBytes += approxTextureBytesWithMips(pending.width, pending.height);
textureCacheBytes_ += approxTextureBytesWithMips(pending.width, pending.height);
it->second.normalHeightMap = std::move(nhMap);
auto tex = std::make_unique<VkTexture>();
bool ok = tex->upload(*vkCtx_, result.pixels.data(), result.width, result.height,
VK_FORMAT_R8G8B8A8_UNORM, true);
if (ok) {
tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
VK_SAMPLER_ADDRESS_MODE_REPEAT);
it->second.heightMapVariance = result.variance;
it->second.approxBytes += approxTextureBytesWithMips(result.width, result.height);
textureCacheBytes_ += approxTextureBytesWithMips(result.width, result.height);
it->second.normalHeightMap = std::move(tex);
}
vkCtx_->endUploadBatch();
it->second.normalMapPending = false;
processed++;
}
}

View file

@ -240,6 +240,66 @@ bool LoadingScreen::loadImage(const std::string& path) {
return true;
}
void LoadingScreen::renderOverlay() {
// Draw loading screen content as ImGui overlay within an existing ImGui frame.
// Caller is responsible for ImGui NewFrame/Render and Vulkan frame management.
ImGuiIO& io = ImGui::GetIO();
float screenW = io.DisplaySize.x;
float screenH = io.DisplaySize.y;
ImGui::SetNextWindowPos(ImVec2(0, 0));
ImGui::SetNextWindowSize(ImVec2(screenW, screenH));
ImGui::Begin("##LoadingScreenOverlay", nullptr,
ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoResize |
ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoScrollbar |
ImGuiWindowFlags_NoInputs | ImGuiWindowFlags_NoBackground |
ImGuiWindowFlags_NoBringToFrontOnFocus);
if (bgDescriptorSet) {
ImGui::GetWindowDrawList()->AddImage(
reinterpret_cast<ImTextureID>(bgDescriptorSet),
ImVec2(0, 0), ImVec2(screenW, screenH));
}
// Progress bar
{
const float barWidthFrac = 0.6f;
const float barHeight = 6.0f;
const float barY = screenH * 0.06f;
float barX = screenW * (0.5f - barWidthFrac * 0.5f);
float barW = screenW * barWidthFrac;
ImDrawList* drawList = ImGui::GetWindowDrawList();
drawList->AddRectFilled(ImVec2(barX, barY), ImVec2(barX + barW, barY + barHeight),
IM_COL32(25, 25, 25, 200), 2.0f);
if (loadProgress > 0.001f) {
drawList->AddRectFilled(ImVec2(barX, barY), ImVec2(barX + barW * loadProgress, barY + barHeight),
IM_COL32(199, 156, 33, 255), 2.0f);
}
drawList->AddRect(ImVec2(barX - 1, barY - 1), ImVec2(barX + barW + 1, barY + barHeight + 1),
IM_COL32(140, 110, 25, 255), 2.0f);
}
// Percentage text
{
char pctBuf[32];
snprintf(pctBuf, sizeof(pctBuf), "%d%%", static_cast<int>(loadProgress * 100.0f));
float textY = screenH * 0.06f - 20.0f;
ImVec2 pctSize = ImGui::CalcTextSize(pctBuf);
ImGui::SetCursorPos(ImVec2((screenW - pctSize.x) * 0.5f, textY));
ImGui::TextColored(ImVec4(0.0f, 0.0f, 0.0f, 1.0f), "%s", pctBuf);
}
// Status text
{
float statusY = screenH * 0.06f + 14.0f;
ImVec2 statusSize = ImGui::CalcTextSize(statusText.c_str());
ImGui::SetCursorPos(ImVec2((screenW - statusSize.x) * 0.5f, statusY));
ImGui::TextColored(ImVec4(0.0f, 0.0f, 0.0f, 1.0f), "%s", statusText.c_str());
}
ImGui::End();
}
void LoadingScreen::render() {
// If a frame is already in progress (e.g. called from a UI callback),
// end it before starting our own