Background BLP texture pre-decoding + deferred WMO normal maps (12x streaming perf)

Move CPU-heavy BLP texture decoding from main thread to background worker
threads for all hot paths: terrain M2 models, WMO doodad M2s, WMO textures,
creature models, and gameobject WMOs. Each renderer (M2, WMO, Character) now
accepts a pre-decoded BLP cache that loadTexture() checks before falling back
to synchronous decode.

Defer WMO normal/height map generation (3 per-pixel passes: luminance, box
blur, Sobel) during terrain streaming finalization — this was the dominant
remaining bottleneck after BLP pre-decoding.

Terrain streaming stalls: 1576ms → 124ms worst case.
This commit is contained in:
Kelsi 2026-03-07 15:46:56 -08:00
parent 0313bd8692
commit 7ac990cff4
13 changed files with 573 additions and 109 deletions

View file

@ -6883,7 +6883,7 @@ void Application::spawnOnlineGameObject(uint64_t guid, uint32_t entry, uint32_t
void Application::processAsyncCreatureResults() {
// Check completed async model loads and finalize on main thread (GPU upload + instance creation).
// Limit GPU model uploads per frame to avoid spikes, but always drain cheap bookkeeping.
static constexpr int kMaxModelUploadsPerFrame = 3;
static constexpr int kMaxModelUploadsPerFrame = 1;
int modelUploads = 0;
for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) {
@ -6925,13 +6925,17 @@ void Application::processAsyncCreatureResults() {
}
// Upload model to GPU (must happen on main thread)
// Use pre-decoded BLP cache to skip main-thread texture decode
charRenderer->setPredecodedBLPCache(&result.predecodedTextures);
if (!charRenderer->loadModel(*result.model, result.modelId)) {
charRenderer->setPredecodedBLPCache(nullptr);
nonRenderableCreatureDisplayIds_.insert(result.displayId);
creaturePermanentFailureGuids_.insert(result.guid);
pendingCreatureSpawnGuids_.erase(result.guid);
creatureSpawnRetryCounts_.erase(result.guid);
continue;
}
charRenderer->setPredecodedBLPCache(nullptr);
displayIdModelCache_[result.displayId] = result.modelId;
modelUploads++;
@ -6956,6 +6960,10 @@ void Application::processAsyncCreatureResults() {
}
void Application::processCreatureSpawnQueue() {
auto startTime = std::chrono::steady_clock::now();
// Budget: max 2ms per frame for creature spawning to prevent stutter.
static constexpr float kSpawnBudgetMs = 2.0f;
// First, finalize any async model loads that completed on background threads.
processAsyncCreatureResults();
@ -6965,18 +6973,15 @@ void Application::processCreatureSpawnQueue() {
if (!creatureLookupsBuilt_) return;
}
auto startTime = std::chrono::steady_clock::now();
// Budget: max 4ms per frame for creature spawning to prevent stutter.
static constexpr float kSpawnBudgetMs = 4.0f;
int processed = 0;
int asyncLaunched = 0;
size_t rotationsLeft = pendingCreatureSpawns_.size();
while (!pendingCreatureSpawns_.empty() &&
processed < MAX_SPAWNS_PER_FRAME &&
rotationsLeft > 0) {
// Check time budget after each spawn (not for the first one, always process at least 1)
if (processed > 0) {
// Check time budget every iteration (including first — async results may
// have already consumed the budget via GPU model uploads).
{
auto now = std::chrono::steady_clock::now();
float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
if (elapsedMs >= kSpawnBudgetMs) break;
@ -7081,6 +7086,20 @@ void Application::processCreatureSpawnQueue() {
}
}
// Pre-decode model textures on background thread
for (const auto& tex : model->textures) {
if (tex.filename.empty()) continue;
std::string texKey = tex.filename;
std::replace(texKey.begin(), texKey.end(), '/', '\\');
std::transform(texKey.begin(), texKey.end(), texKey.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
if (result.predecodedTextures.find(texKey) != result.predecodedTextures.end()) continue;
auto blp = am->loadTexture(texKey);
if (blp.isValid()) {
result.predecodedTextures[texKey] = std::move(blp);
}
}
result.model = std::move(model);
result.valid = true;
return result;
@ -7161,14 +7180,202 @@ void Application::processDeferredEquipmentQueue() {
setOnlinePlayerEquipment(guid, equipData.first, equipData.second);
}
void Application::processAsyncGameObjectResults() {
for (auto it = asyncGameObjectLoads_.begin(); it != asyncGameObjectLoads_.end(); ) {
if (!it->future.valid() ||
it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
++it;
continue;
}
auto result = it->future.get();
it = asyncGameObjectLoads_.erase(it);
if (!result.valid || !result.isWmo || !result.wmoModel) {
// Fallback: spawn via sync path (likely an M2 or failed WMO)
spawnOnlineGameObject(result.guid, result.entry, result.displayId,
result.x, result.y, result.z, result.orientation);
continue;
}
// WMO parsed on background thread — do GPU upload + instance creation on main thread
auto* wmoRenderer = renderer ? renderer->getWMORenderer() : nullptr;
if (!wmoRenderer) continue;
uint32_t modelId = 0;
auto itCache = gameObjectDisplayIdWmoCache_.find(result.displayId);
if (itCache != gameObjectDisplayIdWmoCache_.end()) {
modelId = itCache->second;
} else {
modelId = nextGameObjectWmoModelId_++;
wmoRenderer->setPredecodedBLPCache(&result.predecodedTextures);
if (!wmoRenderer->loadModel(*result.wmoModel, modelId)) {
wmoRenderer->setPredecodedBLPCache(nullptr);
LOG_WARNING("Failed to load async gameobject WMO: ", result.modelPath);
continue;
}
wmoRenderer->setPredecodedBLPCache(nullptr);
gameObjectDisplayIdWmoCache_[result.displayId] = modelId;
}
glm::vec3 renderPos = core::coords::canonicalToRender(
glm::vec3(result.x, result.y, result.z));
uint32_t instanceId = wmoRenderer->createInstance(
modelId, renderPos, glm::vec3(0.0f, 0.0f, result.orientation), 1.0f);
if (instanceId == 0) continue;
gameObjectInstances_[result.guid] = {modelId, instanceId, true};
// Queue transport doodad loading if applicable
std::string lowerPath = result.modelPath;
std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
if (lowerPath.find("transport") != std::string::npos) {
const auto* doodadTemplates = wmoRenderer->getDoodadTemplates(modelId);
if (doodadTemplates && !doodadTemplates->empty()) {
PendingTransportDoodadBatch batch;
batch.guid = result.guid;
batch.modelId = modelId;
batch.instanceId = instanceId;
batch.x = result.x;
batch.y = result.y;
batch.z = result.z;
batch.orientation = result.orientation;
batch.doodadBudget = doodadTemplates->size();
pendingTransportDoodadBatches_.push_back(batch);
}
}
}
}
/// Drains pendingGameObjectSpawns_ under a small per-frame time budget.
///
/// Completed background WMO loads are finalized first. Each queued spawn is
/// then classified by its resolved model path:
///   - an uncached ".wmo" model is read, parsed and texture-decoded on a
///     std::async worker (at most kMaxAsyncLoads in flight);
///   - anything else (M2, WMO whose display id is already cached, or an
///     unresolved path) is spawned synchronously via spawnOnlineGameObject().
/// An uncached WMO that finds every async slot busy stays at the front of the
/// queue and is retried on the next call rather than being loaded synchronously.
void Application::processGameObjectSpawnQueue() {
    // Finalize any completed async WMO loads first
    processAsyncGameObjectResults();

    if (pendingGameObjectSpawns_.empty()) return;

    // Process spawns: cached WMOs and M2s go sync (cheap), uncached WMOs go async
    const auto startTime = std::chrono::steady_clock::now();
    static constexpr float kBudgetMs = 2.0f;
    static constexpr int kMaxAsyncLoads = 2;

    while (!pendingGameObjectSpawns_.empty()) {
        const float elapsedMs = std::chrono::duration<float, std::milli>(
            std::chrono::steady_clock::now() - startTime).count();
        if (elapsedMs >= kBudgetMs) break;

        auto& s = pendingGameObjectSpawns_.front();

        // Resolve the model path so we can tell whether this is an uncached WMO.
        std::string modelPath;
        if (gameObjectLookupsBuilt_) {
            // Check transport overrides first
            const bool isTransport = gameHandler && gameHandler->isTransportGuid(s.guid);
            if (isTransport) {
                if (s.entry == 20808 || s.entry == 176231 || s.entry == 176310)
                    modelPath = "World\\wmo\\transports\\transport_ship\\transportship.wmo";
                else if (s.displayId == 807 || s.displayId == 808 || s.displayId == 175080 ||
                         s.displayId == 176495 || s.displayId == 164871)
                    modelPath = "World\\wmo\\transports\\transport_zeppelin\\transport_zeppelin.wmo";
                else if (s.displayId == 1587)
                    modelPath = "World\\wmo\\transports\\transport_horde_zeppelin\\Transport_Horde_Zeppelin.wmo";
                else if (s.displayId == 2454 || s.displayId == 181688 || s.displayId == 190536)
                    modelPath = "World\\wmo\\transports\\icebreaker\\Transport_Icebreaker_ship.wmo";
            }
            if (modelPath.empty())
                modelPath = getGameObjectModelPathForDisplayId(s.displayId);
        }

        std::string lowerPath = modelPath;
        std::transform(lowerPath.begin(), lowerPath.end(), lowerPath.begin(),
                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
        // isWmo implies a non-empty path (length >= 4).
        const bool isWmo = lowerPath.size() >= 4 &&
                           lowerPath.compare(lowerPath.size() - 4, 4, ".wmo") == 0;
        const bool isCached = isWmo && gameObjectDisplayIdWmoCache_.count(s.displayId) != 0;

        if (isWmo && !isCached) {
            if (static_cast<int>(asyncGameObjectLoads_.size()) >= kMaxAsyncLoads) {
                // Every async slot is busy. Loading this WMO synchronously would
                // reintroduce the main-thread stall the async path exists to avoid,
                // so leave it queued; slots free up as results are finalized.
                break;
            }

            // Launch async WMO load — file I/O + parse on background thread.
            // The task works on copies of the spawn data because the queue entry
            // is erased below. NOTE(review): `am` is a raw pointer; this assumes
            // the asset manager outlives the task — confirm.
            auto* am = assetManager.get();
            AsyncGameObjectLoad load;
            load.future = std::async(std::launch::async,
                [am, capture = s, capturePath = modelPath]() -> PreparedGameObjectWMO {
                    PreparedGameObjectWMO result;
                    result.guid = capture.guid;
                    result.entry = capture.entry;
                    result.displayId = capture.displayId;
                    result.x = capture.x;
                    result.y = capture.y;
                    result.z = capture.z;
                    result.orientation = capture.orientation;
                    result.modelPath = capturePath;
                    result.isWmo = true;

                    // Returned without setting `valid`, so the consumer takes its
                    // fallback path for this object.
                    auto wmoData = am->readFile(capturePath);
                    if (wmoData.empty()) return result;

                    auto wmo = std::make_shared<pipeline::WMOModel>(
                        pipeline::WMOLoader::load(wmoData));

                    // Load groups: "<base>_NNN<ext>", retrying with a lowercase
                    // ".wmo" extension when the first read comes back empty.
                    if (wmo->nGroups > 0) {
                        std::string basePath = capturePath;
                        std::string ext;
                        if (basePath.size() > 4) {
                            ext = basePath.substr(basePath.size() - 4);
                            basePath = basePath.substr(0, basePath.size() - 4);
                        }
                        for (uint32_t gi = 0; gi < wmo->nGroups; gi++) {
                            char suffix[16];
                            snprintf(suffix, sizeof(suffix), "_%03u%s", gi, ext.c_str());
                            auto groupData = am->readFile(basePath + suffix);
                            if (groupData.empty()) {
                                snprintf(suffix, sizeof(suffix), "_%03u.wmo", gi);
                                groupData = am->readFile(basePath + suffix);
                            }
                            if (!groupData.empty()) {
                                pipeline::WMOLoader::loadGroup(groupData, *wmo, gi);
                            }
                        }
                    }

                    // Pre-decode WMO textures on background thread
                    for (const auto& texPath : wmo->textures) {
                        if (texPath.empty()) continue;
                        std::string texKey = texPath;
                        // Cut at the first embedded NUL, if any.
                        const size_t nul = texKey.find('\0');
                        if (nul != std::string::npos) texKey.resize(nul);
                        std::replace(texKey.begin(), texKey.end(), '/', '\\');
                        std::transform(texKey.begin(), texKey.end(), texKey.begin(),
                                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
                        if (texKey.empty()) continue;
                        // Convert to .blp extension
                        if (texKey.size() >= 4) {
                            const std::string texExt = texKey.substr(texKey.size() - 4);
                            if (texExt == ".tga" || texExt == ".dds") {
                                texKey = texKey.substr(0, texKey.size() - 4) + ".blp";
                            }
                        }
                        if (result.predecodedTextures.find(texKey) != result.predecodedTextures.end()) continue;
                        auto blp = am->loadTexture(texKey);
                        if (blp.isValid()) {
                            result.predecodedTextures[texKey] = std::move(blp);
                        }
                    }

                    result.wmoModel = wmo;
                    result.valid = true;
                    return result;
                });
            asyncGameObjectLoads_.push_back(std::move(load));
            pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
            continue;
        }

        // Cached WMO or M2 — spawn synchronously (cheap)
        spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
        pendingGameObjectSpawns_.erase(pendingGameObjectSpawns_.begin());
    }
}
void Application::processPendingTransportDoodads() {

View file

@ -625,7 +625,18 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
return whiteTexture_.get();
}
auto blpImage = assetManager->loadTexture(key);
// Check pre-decoded BLP cache first (populated by background threads)
pipeline::BLPImage blpImage;
if (predecodedBLPCache_) {
auto pit = predecodedBLPCache_->find(key);
if (pit != predecodedBLPCache_->end()) {
blpImage = std::move(pit->second);
predecodedBLPCache_->erase(pit);
}
}
if (!blpImage.isValid()) {
blpImage = assetManager->loadTexture(key);
}
if (!blpImage.isValid()) {
// Return white fallback but don't cache the failure — allow retry
// on next character load in case the asset becomes available.
@ -1412,8 +1423,9 @@ uint32_t CharacterRenderer::createInstance(uint32_t modelId, const glm::vec3& po
instance.scale = scale;
// Initialize bone matrices to identity
auto& model = models[modelId].data;
instance.boneMatrices.resize(std::max(static_cast<size_t>(1), model.bones.size()), glm::mat4(1.0f));
auto& gpuRef = models[modelId];
instance.boneMatrices.resize(std::max(static_cast<size_t>(1), gpuRef.data.bones.size()), glm::mat4(1.0f));
instance.cachedModel = &gpuRef;
uint32_t id = instance.id;
instances[id] = std::move(instance);
@ -1511,13 +1523,12 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
if (distSq >= animUpdateRadiusSq) continue;
// Always advance animation time (cheap)
auto modelIt = models.find(inst.modelId);
if (modelIt != models.end() && !modelIt->second.data.sequences.empty()) {
if (inst.cachedModel && !inst.cachedModel->data.sequences.empty()) {
if (inst.currentSequenceIndex < 0) {
inst.currentSequenceIndex = 0;
inst.currentAnimationId = modelIt->second.data.sequences[0].id;
inst.currentAnimationId = inst.cachedModel->data.sequences[0].id;
}
const auto& seq = modelIt->second.data.sequences[inst.currentSequenceIndex];
const auto& seq = inst.cachedModel->data.sequences[inst.currentSequenceIndex];
inst.animationTime += deltaTime * 1000.0f;
if (seq.duration > 0 && inst.animationTime >= static_cast<float>(seq.duration)) {
if (inst.animationLoop) {
@ -1528,10 +1539,11 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
}
}
// Distance-tiered bone throttling: near=every frame, mid=every 3rd, far=every 6th
// Distance-tiered bone throttling (distance from camera):
// <=10 = every frame, >10 = every 2nd, >20 = every 4th, >40 = every 8th frame
uint32_t boneInterval = 1;
if (distSq > 60.0f * 60.0f) boneInterval = 6;
else if (distSq > 30.0f * 30.0f) boneInterval = 3;
if (distSq > 40.0f * 40.0f) boneInterval = 8;
else if (distSq > 20.0f * 20.0f) boneInterval = 4;
else if (distSq > 10.0f * 10.0f) boneInterval = 2;
inst.boneUpdateCounter++;
bool needsBones = (inst.boneUpdateCounter >= boneInterval) || inst.boneMatrices.empty();
@ -1615,11 +1627,8 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
}
void CharacterRenderer::updateAnimation(CharacterInstance& instance, float deltaTime) {
auto modelIt = models.find(instance.modelId);
if (modelIt == models.end()) {
return;
}
const auto& model = modelIt->second.data;
if (!instance.cachedModel) return;
const auto& model = instance.cachedModel->data;
if (model.sequences.empty()) {
return;
@ -1732,7 +1741,8 @@ glm::quat CharacterRenderer::interpolateQuat(const pipeline::M2AnimationTrack& t
// --- Bone transform calculation ---
void CharacterRenderer::calculateBoneMatrices(CharacterInstance& instance) {
auto& model = models[instance.modelId].data;
if (!instance.cachedModel) return;
auto& model = instance.cachedModel->data;
if (model.bones.empty()) {
return;
@ -1833,9 +1843,8 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet,
}
}
auto modelIt = models.find(instance.modelId);
if (modelIt == models.end()) continue;
const auto& gpuModel = modelIt->second;
if (!instance.cachedModel) continue;
const auto& gpuModel = *instance.cachedModel;
// Skip models without GPU buffers
if (!gpuModel.vertexBuffer) continue;
@ -2487,9 +2496,8 @@ void CharacterRenderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& light
glm::vec3 diff = inst.position - shadowCenter;
if (glm::dot(diff, diff) > shadowRadiusSq) continue;
auto modelIt = models.find(inst.modelId);
if (modelIt == models.end()) continue;
const M2ModelGPU& gpuModel = modelIt->second;
if (!inst.cachedModel) continue;
const M2ModelGPU& gpuModel = *inst.cachedModel;
if (!gpuModel.vertexBuffer) continue;
glm::mat4 modelMat = inst.hasOverrideModelMatrix

View file

@ -1657,6 +1657,7 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
instance.cachedIsInvisibleTrap = mdlRef.isInvisibleTrap;
instance.cachedIsInstancePortal = mdlRef.isInstancePortal;
instance.cachedIsValid = mdlRef.isValid();
instance.cachedModel = &mdlRef;
// Initialize animation: play first sequence (usually Stand/Idle)
const auto& mdl = mdlRef;
@ -1748,6 +1749,7 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
instance.cachedIsGroundDetail = mdl2.isGroundDetail;
instance.cachedIsInvisibleTrap = mdl2.isInvisibleTrap;
instance.cachedIsValid = mdl2.isValid();
instance.cachedModel = &mdl2;
// Initialize animation
if (mdl2.hasAnimation && !mdl2.disableAnimation && !mdl2.sequences.empty()) {
@ -2026,9 +2028,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
instance.animTime += dtMs * (instance.animSpeed - 1.0f);
// For animation looping/variation, we need the actual model data.
auto it = models.find(instance.modelId);
if (it == models.end()) continue;
const M2ModelGPU& model = it->second;
if (!instance.cachedModel) continue;
const M2ModelGPU& model = *instance.cachedModel;
// Validate sequence index
if (instance.currentSequenceIndex < 0 ||
@ -2084,6 +2085,14 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f);
if (cullRadius > 0.0f && !updateFrustum.intersectsSphere(instance.position, paddedRadius)) continue;
// Distance-based frame skipping: update distant bones less frequently
uint32_t boneInterval = 1;
if (distSq > 200.0f * 200.0f) boneInterval = 8;
else if (distSq > 100.0f * 100.0f) boneInterval = 4;
else if (distSq > 50.0f * 50.0f) boneInterval = 2;
instance.frameSkipCounter++;
if ((instance.frameSkipCounter % boneInterval) != 0) continue;
boneWorkIndices_.push_back(idx);
}
@ -2097,9 +2106,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
for (size_t i : boneWorkIndices_) {
if (i >= instances.size()) continue;
auto& inst = instances[i];
auto mdlIt = models.find(inst.modelId);
if (mdlIt == models.end()) continue;
computeBoneMatrices(mdlIt->second, inst);
if (!inst.cachedModel) continue;
computeBoneMatrices(*inst.cachedModel, inst);
}
} else {
// Parallel — dispatch across worker threads
@ -2112,9 +2120,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
for (size_t i : boneWorkIndices_) {
if (i >= instances.size()) continue;
auto& inst = instances[i];
auto mdlIt = models.find(inst.modelId);
if (mdlIt == models.end()) continue;
computeBoneMatrices(mdlIt->second, inst);
if (!inst.cachedModel) continue;
computeBoneMatrices(*inst.cachedModel, inst);
}
} else {
const size_t chunkSize = animCount / numThreads;
@ -2135,9 +2142,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
size_t idx = boneWorkIndices_[j];
if (idx >= instances.size()) continue;
auto& inst = instances[idx];
auto mdlIt = models.find(inst.modelId);
if (mdlIt == models.end()) continue;
computeBoneMatrices(mdlIt->second, inst);
if (!inst.cachedModel) continue;
computeBoneMatrices(*inst.cachedModel, inst);
}
}));
start = end;
@ -2159,9 +2165,8 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
glm::vec3 toCam = instance.position - cachedCamPos_;
float distSq = glm::dot(toCam, toCam);
if (distSq > cachedMaxRenderDistSq_) continue;
auto mdlIt = models.find(instance.modelId);
if (mdlIt == models.end()) continue;
emitParticles(instance, mdlIt->second, deltaTime);
if (!instance.cachedModel) continue;
emitParticles(instance, *instance.cachedModel, deltaTime);
updateParticles(instance, deltaTime);
}
@ -2865,9 +2870,8 @@ void M2Renderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& lightSpaceMa
glm::vec3 diff = instance.position - shadowCenter;
if (glm::dot(diff, diff) > shadowRadiusSq) continue;
auto modelIt = models.find(instance.modelId);
if (modelIt == models.end()) continue;
const M2ModelGPU& model = modelIt->second;
if (!instance.cachedModel) continue;
const M2ModelGPU& model = *instance.cachedModel;
// Filter: only draw foliage models in foliage pass, non-foliage in non-foliage pass
if (model.shadowWindFoliage != foliagePass) continue;
@ -2973,8 +2977,7 @@ std::vector<glm::vec3> M2Renderer::getWaterVegetationPositions(const glm::vec3&
std::vector<glm::vec3> result;
float maxDistSq = maxDist * maxDist;
for (const auto& inst : instances) {
auto it = models.find(inst.modelId);
if (it == models.end() || !it->second.isWaterVegetation) continue;
if (!inst.cachedModel || !inst.cachedModel->isWaterVegetation) continue;
glm::vec3 diff = inst.position - camPos;
if (glm::dot(diff, diff) <= maxDistSq) {
result.push_back(inst.position);
@ -3085,9 +3088,8 @@ void M2Renderer::emitParticles(M2Instance& inst, const M2ModelGPU& gpu, float dt
}
void M2Renderer::updateParticles(M2Instance& inst, float dt) {
auto it = models.find(inst.modelId);
if (it == models.end()) return;
const auto& gpu = it->second;
if (!inst.cachedModel) return;
const auto& gpu = *inst.cachedModel;
for (size_t i = 0; i < inst.particles.size(); ) {
auto& p = inst.particles[i];
@ -3162,9 +3164,8 @@ void M2Renderer::renderM2Particles(VkCommandBuffer cmd, VkDescriptorSet perFrame
for (auto& inst : instances) {
if (inst.particles.empty()) continue;
auto it = models.find(inst.modelId);
if (it == models.end()) continue;
const auto& gpu = it->second;
if (!inst.cachedModel) continue;
const auto& gpu = *inst.cachedModel;
for (const auto& p : inst.particles) {
if (p.emitterIndex < 0 || p.emitterIndex >= static_cast<int>(gpu.particleEmitters.size())) continue;
@ -3549,9 +3550,13 @@ void M2Renderer::rebuildSpatialIndex() {
particleInstanceIndices_.clear();
for (size_t i = 0; i < instances.size(); i++) {
const auto& inst = instances[i];
auto& inst = instances[i];
instanceIndexById[inst.id] = i;
// Re-cache model pointer (may have changed after model map modifications)
auto mdlIt = models.find(inst.modelId);
inst.cachedModel = (mdlIt != models.end()) ? &mdlIt->second : nullptr;
// Rebuild dedup map (skip ground detail)
if (!inst.cachedIsGroundDetail) {
DedupKey dk{inst.modelId,
@ -3684,8 +3689,18 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) {
containsToken(key, "campfire") ||
containsToken(key, "bonfire");
// Load BLP texture
pipeline::BLPImage blp = assetManager->loadTexture(key);
// Check pre-decoded BLP cache first (populated by background worker threads)
pipeline::BLPImage blp;
if (predecodedBLPCache_) {
auto pit = predecodedBLPCache_->find(key);
if (pit != predecodedBLPCache_->end()) {
blp = std::move(pit->second);
predecodedBLPCache_->erase(pit);
}
}
if (!blp.isValid()) {
blp = assetManager->loadTexture(key);
}
if (!blp.isValid()) {
// Return white fallback but don't cache the failure — MPQ reads can
// fail transiently during streaming; allow retry on next model load.
@ -3751,9 +3766,8 @@ VkTexture* M2Renderer::loadTexture(const std::string& path, uint32_t texFlags) {
uint32_t M2Renderer::getTotalTriangleCount() const {
uint32_t total = 0;
for (const auto& instance : instances) {
auto it = models.find(instance.modelId);
if (it != models.end()) {
total += it->second.indexCount / 3;
if (instance.cachedModel) {
total += instance.cachedModel->indexCount / 3;
}
}
return total;
@ -3775,11 +3789,10 @@ std::optional<float> M2Renderer::getFloorHeight(float glX, float glY, float glZ,
continue;
}
auto it = models.find(instance.modelId);
if (it == models.end()) continue;
if (!instance.cachedModel) continue;
if (instance.scale <= 0.001f) continue;
const M2ModelGPU& model = it->second;
const M2ModelGPU& model = *instance.cachedModel;
if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
if (instance.skipCollision) continue;
@ -3931,10 +3944,9 @@ bool M2Renderer::checkCollision(const glm::vec3& from, const glm::vec3& to,
if (from.z > instance.worldBoundsMax.z + 2.5f && adjustedPos.z > instance.worldBoundsMax.z + 2.5f) continue;
if (from.z + 2.5f < instance.worldBoundsMin.z && adjustedPos.z + 2.5f < instance.worldBoundsMin.z) continue;
auto it = models.find(instance.modelId);
if (it == models.end()) continue;
if (!instance.cachedModel) continue;
const M2ModelGPU& model = it->second;
const M2ModelGPU& model = *instance.cachedModel;
if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
if (instance.skipCollision) continue;
if (instance.scale <= 0.001f) continue;
@ -4172,10 +4184,9 @@ float M2Renderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3&
continue;
}
auto it = models.find(instance.modelId);
if (it == models.end()) continue;
if (!instance.cachedModel) continue;
const M2ModelGPU& model = it->second;
const M2ModelGPU& model = *instance.cachedModel;
if (model.collisionNoBlock || model.isInvisibleTrap || model.isSpellEffect) continue;
glm::vec3 localMin, localMax;
getTightCollisionBounds(model, localMin, localMax);

View file

@ -2434,6 +2434,9 @@ void Renderer::update(float deltaTime) {
cameraController->update(deltaTime);
auto cameraEnd = std::chrono::steady_clock::now();
lastCameraUpdateMs = std::chrono::duration<double, std::milli>(cameraEnd - cameraStart).count();
if (lastCameraUpdateMs > 3.0) {
LOG_WARNING("SLOW cameraController->update: ", lastCameraUpdateMs, "ms");
}
// Update 3D audio listener position/orientation to match camera
if (camera) {
@ -2779,8 +2782,15 @@ void Renderer::update(float deltaTime) {
// Update M2 doodad animations (pass camera for frustum-culling bone computation)
if (m2Renderer && camera) {
auto m2Start = std::chrono::steady_clock::now();
m2Renderer->update(deltaTime, camera->getPosition(),
camera->getProjectionMatrix() * camera->getViewMatrix());
float m2Ms = std::chrono::duration<float, std::milli>(
std::chrono::steady_clock::now() - m2Start).count();
if (m2Ms > 3.0f) {
LOG_WARNING("SLOW m2Renderer->update: ", m2Ms, "ms (",
m2Renderer->getInstanceCount(), " instances)");
}
}
// Helper: play zone music, dispatching local files (file: prefix) vs MPQ paths

View file

@ -231,9 +231,14 @@ bool TerrainManager::loadTile(int x, int y) {
return false;
}
VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
if (vkCtx) vkCtx->beginUploadBatch();
FinalizingTile ft;
ft.pending = std::move(pending);
while (!advanceFinalization(ft)) {}
if (vkCtx) vkCtx->endUploadBatchSync(); // Sync — caller expects tile ready
return true;
}
@ -407,6 +412,20 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
return false;
}
// Pre-decode M2 model textures on background thread
for (const auto& tex : m2Model.textures) {
if (tex.filename.empty()) continue;
std::string texKey = tex.filename;
std::replace(texKey.begin(), texKey.end(), '/', '\\');
std::transform(texKey.begin(), texKey.end(), texKey.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue;
auto blp = assetManager->loadTexture(texKey);
if (blp.isValid()) {
pending->preloadedM2Textures[texKey] = std::move(blp);
}
}
PendingTile::M2Ready ready;
ready.modelId = modelId;
ready.model = std::move(m2Model);
@ -584,6 +603,20 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
pipeline::M2Loader::loadSkin(skinData, m2Model);
}
if (!m2Model.isValid()) continue;
// Pre-decode doodad M2 textures on background thread
for (const auto& tex : m2Model.textures) {
if (tex.filename.empty()) continue;
std::string texKey = tex.filename;
std::replace(texKey.begin(), texKey.end(), '/', '\\');
std::transform(texKey.begin(), texKey.end(), texKey.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
if (pending->preloadedM2Textures.find(texKey) != pending->preloadedM2Textures.end()) continue;
auto blp = assetManager->loadTexture(texKey);
if (blp.isValid()) {
pending->preloadedM2Textures[texKey] = std::move(blp);
}
}
}
// Build doodad's local transform (WoW coordinates)
@ -654,6 +687,32 @@ std::shared_ptr<PendingTile> TerrainManager::prepareTile(int x, int y) {
}
}
// Pre-decode WMO textures on background thread
for (const auto& texPath : wmoModel.textures) {
if (texPath.empty()) continue;
std::string texKey = texPath;
// Truncate at NUL (WMO paths can have stray bytes)
size_t nul = texKey.find('\0');
if (nul != std::string::npos) texKey.resize(nul);
std::replace(texKey.begin(), texKey.end(), '/', '\\');
std::transform(texKey.begin(), texKey.end(), texKey.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
if (texKey.empty()) continue;
if (pending->preloadedWMOTextures.find(texKey) != pending->preloadedWMOTextures.end()) continue;
// Try .blp variant
std::string blpKey = texKey;
if (blpKey.size() >= 4) {
std::string ext = blpKey.substr(blpKey.size() - 4);
if (ext == ".tga" || ext == ".dds") {
blpKey = blpKey.substr(0, blpKey.size() - 4) + ".blp";
}
}
auto blp = assetManager->loadTexture(blpKey);
if (blp.isValid()) {
pending->preloadedWMOTextures[blpKey] = std::move(blp);
}
}
PendingTile::WMOReady ready;
// Cache WMO model uploads by path; placement dedup uses uniqueId separately.
ready.modelId = static_cast<uint32_t>(std::hash<std::string>{}(wmoPath));
@ -741,7 +800,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
}
bool allDone = terrainRenderer->loadTerrainIncremental(
pending->mesh, pending->terrain.textures, x, y,
ft.terrainChunkNext, 64);
ft.terrainChunkNext, 32);
if (!allDone) {
return false; // More chunks remain — yield to time budget
}
@ -773,7 +832,9 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
case FinalizationPhase::M2_MODELS: {
// Upload multiple M2 models per call (batched GPU uploads)
if (m2Renderer && ft.m2ModelIndex < pending->m2Models.size()) {
constexpr size_t kModelsPerStep = 8;
// Set pre-decoded BLP cache so loadTexture() skips main-thread BLP decode
m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
constexpr size_t kModelsPerStep = 4;
size_t uploaded = 0;
while (ft.m2ModelIndex < pending->m2Models.size() && uploaded < kModelsPerStep) {
auto& m2Ready = pending->m2Models[ft.m2ModelIndex];
@ -786,6 +847,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
ft.m2ModelIndex++;
uploaded++;
}
m2Renderer->setPredecodedBLPCache(nullptr);
// Stay in this phase until all models uploaded
if (ft.m2ModelIndex < pending->m2Models.size()) {
return false;
@ -830,8 +892,11 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
// Upload multiple WMO models per call (batched GPU uploads)
if (wmoRenderer && assetManager) {
wmoRenderer->initialize(nullptr, VK_NULL_HANDLE, assetManager);
// Set pre-decoded BLP cache and defer normal maps during streaming
wmoRenderer->setPredecodedBLPCache(&pending->preloadedWMOTextures);
wmoRenderer->setDeferNormalMaps(true);
constexpr size_t kWmosPerStep = 4;
constexpr size_t kWmosPerStep = 1;
size_t uploaded = 0;
while (ft.wmoModelIndex < pending->wmoModels.size() && uploaded < kWmosPerStep) {
auto& wmoReady = pending->wmoModels[ft.wmoModelIndex];
@ -843,6 +908,8 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
uploaded++;
}
}
wmoRenderer->setDeferNormalMaps(false);
wmoRenderer->setPredecodedBLPCache(nullptr);
if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
}
ft.phase = FinalizationPhase::WMO_INSTANCES;
@ -906,7 +973,9 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
case FinalizationPhase::WMO_DOODADS: {
// Upload multiple WMO doodad M2s per call (batched GPU uploads)
if (m2Renderer && ft.wmoDoodadIndex < pending->wmoDoodads.size()) {
constexpr size_t kDoodadsPerStep = 16;
// Set pre-decoded BLP cache for doodad M2 textures
m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
constexpr size_t kDoodadsPerStep = 4;
size_t uploaded = 0;
while (ft.wmoDoodadIndex < pending->wmoDoodads.size() && uploaded < kDoodadsPerStep) {
auto& doodad = pending->wmoDoodads[ft.wmoDoodadIndex];
@ -923,6 +992,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
ft.wmoDoodadIndex++;
uploaded++;
}
m2Renderer->setPredecodedBLPCache(nullptr);
if (ft.wmoDoodadIndex < pending->wmoDoodads.size()) return false;
}
ft.phase = FinalizationPhase::WATER;
@ -1080,11 +1150,6 @@ void TerrainManager::workerLoop() {
}
void TerrainManager::processReadyTiles() {
// Process tiles with time budget to avoid frame spikes
// Taxi mode gets a slightly larger budget to avoid visible late-pop terrain/models.
const float timeBudgetMs = taxiStreamingMode_ ? 8.0f : 3.0f;
auto startTime = std::chrono::high_resolution_clock::now();
// Move newly ready tiles into the finalizing deque.
// Keep them in pendingTiles so streamTiles() won't re-enqueue them.
{
@ -1100,28 +1165,32 @@ void TerrainManager::processReadyTiles() {
}
}
// Outer upload batch: all GPU uploads across all advanceFinalization calls
// this frame share a single command buffer submission + fence wait.
VkContext* vkCtx = terrainRenderer ? terrainRenderer->getVkContext() : nullptr;
// Reclaim completed async uploads from previous frames (non-blocking)
if (vkCtx) vkCtx->pollUploadBatches();
// Nothing to finalize — done.
if (finalizingTiles_.empty()) return;
// Async upload batch: record GPU copies into a command buffer, submit with
// a fence, but DON'T wait. The fence is polled on subsequent frames.
// This eliminates the main-thread stall from vkWaitForFences entirely.
const int maxSteps = taxiStreamingMode_ ? 8 : 2;
int steps = 0;
if (vkCtx) vkCtx->beginUploadBatch();
// Drive incremental finalization within time budget
while (!finalizingTiles_.empty()) {
while (!finalizingTiles_.empty() && steps < maxSteps) {
auto& ft = finalizingTiles_.front();
bool done = advanceFinalization(ft);
if (done) {
finalizingTiles_.pop_front();
}
auto now = std::chrono::high_resolution_clock::now();
float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
if (elapsedMs >= timeBudgetMs) {
break;
}
steps++;
}
if (vkCtx) vkCtx->endUploadBatch();
if (vkCtx) vkCtx->endUploadBatch(); // Async — submits but doesn't wait
}
void TerrainManager::processAllReadyTiles() {
@ -1151,7 +1220,7 @@ void TerrainManager::processAllReadyTiles() {
finalizingTiles_.pop_front();
}
if (vkCtx) vkCtx->endUploadBatch();
if (vkCtx) vkCtx->endUploadBatchSync(); // Sync — load screen needs data ready
}
void TerrainManager::processOneReadyTile() {
@ -1177,7 +1246,7 @@ void TerrainManager::processOneReadyTile() {
while (!advanceFinalization(ft)) {}
finalizingTiles_.pop_front();
if (vkCtx) vkCtx->endUploadBatch();
if (vkCtx) vkCtx->endUploadBatchSync(); // Sync — load screen needs data ready
}
}

View file

@ -67,6 +67,14 @@ void VkContext::shutdown() {
frame = {};
}
// Clean up any in-flight async upload batches (device already idle)
for (auto& batch : inFlightBatches_) {
// Staging buffers: skip destroy — allocator is about to be torn down
vkDestroyFence(device, batch.fence, nullptr);
// Command buffer freed when pool is destroyed below
}
inFlightBatches_.clear();
if (immFence) { vkDestroyFence(device, immFence, nullptr); immFence = VK_NULL_HANDLE; }
if (immCommandPool) { vkDestroyCommandPool(device, immCommandPool, nullptr); immCommandPool = VK_NULL_HANDLE; }
@ -1447,17 +1455,94 @@ void VkContext::endUploadBatch() {
inUploadBatch_ = false;
// Submit all recorded commands with a single fence wait
if (batchStagingBuffers_.empty()) {
// No GPU copies were recorded — skip the submit entirely.
vkEndCommandBuffer(batchCmd_);
vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
batchCmd_ = VK_NULL_HANDLE;
return;
}
// Submit commands with a NEW fence — don't wait, let GPU work in parallel.
vkEndCommandBuffer(batchCmd_);
VkFenceCreateInfo fenceInfo{};
fenceInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
VkFence fence = VK_NULL_HANDLE;
vkCreateFence(device, &fenceInfo, nullptr, &fence);
VkSubmitInfo submitInfo{};
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &batchCmd_;
vkQueueSubmit(graphicsQueue, 1, &submitInfo, fence);
// Stash everything for later cleanup when fence signals
InFlightBatch batch;
batch.fence = fence;
batch.cmd = batchCmd_;
batch.stagingBuffers = std::move(batchStagingBuffers_);
inFlightBatches_.push_back(std::move(batch));
batchCmd_ = VK_NULL_HANDLE;
batchStagingBuffers_.clear();
}
/// Closes one level of upload batching using the synchronous path.
///
/// Nested begin/end pairs are tracked via uploadBatchDepth_; only the call
/// that brings the depth back to zero finishes the batch. An unbalanced call
/// (depth already <= 0) is ignored.
///
/// When no staging buffers were queued, the command buffer is ended and freed
/// without being handed to endSingleTimeCommands(). Otherwise the command
/// buffer goes through endSingleTimeCommands() and every deferred staging
/// buffer is destroyed afterwards.
void VkContext::endUploadBatchSync() {
    // Unbalanced end: there is no open batch to close.
    if (uploadBatchDepth_ <= 0) {
        return;
    }

    // Inner (nested) end: the outermost caller finishes the batch.
    uploadBatchDepth_--;
    if (uploadBatchDepth_ > 0) {
        return;
    }

    inUploadBatch_ = false;

    const bool hasStagedUploads = !batchStagingBuffers_.empty();
    if (!hasStagedUploads) {
        // Nothing was staged, so release the command buffer without submitting.
        vkEndCommandBuffer(batchCmd_);
        vkFreeCommandBuffers(device, immCommandPool, 1, &batchCmd_);
        batchCmd_ = VK_NULL_HANDLE;
        return;
    }

    // Blocking path used by load screens.
    // NOTE(review): endSingleTimeCommands() is presumed to submit, wait and
    // release the command buffer — confirm against its definition.
    endSingleTimeCommands(batchCmd_);
    batchCmd_ = VK_NULL_HANDLE;

    // Release the staging buffers now that the batch has been handed off.
    for (auto& stagingBuffer : batchStagingBuffers_) {
        destroyBuffer(allocator, stagingBuffer);
    }
    batchStagingBuffers_.clear();
}
void VkContext::pollUploadBatches() {
if (inFlightBatches_.empty()) return;
for (auto it = inFlightBatches_.begin(); it != inFlightBatches_.end(); ) {
VkResult result = vkGetFenceStatus(device, it->fence);
if (result == VK_SUCCESS) {
// GPU finished — free resources
for (auto& staging : it->stagingBuffers) {
destroyBuffer(allocator, staging);
}
vkFreeCommandBuffers(device, immCommandPool, 1, &it->cmd);
vkDestroyFence(device, it->fence, nullptr);
it = inFlightBatches_.erase(it);
} else {
++it;
}
}
}
/// Blocks until every in-flight upload batch has completed, then releases
/// all of their resources.
///
/// Batches are handled in list order: wait on the batch fence with no
/// timeout (UINT64_MAX), destroy its staging buffers, free its command
/// buffer, destroy the fence. The in-flight list is emptied afterwards.
void VkContext::waitAllUploads() {
    for (auto& pending : inFlightBatches_) {
        // Wait without a timeout for this batch's fence.
        vkWaitForFences(device, 1, &pending.fence, VK_TRUE, UINT64_MAX);

        for (auto& stagingBuffer : pending.stagingBuffers) {
            destroyBuffer(allocator, stagingBuffer);
        }

        vkFreeCommandBuffers(device, immCommandPool, 1, &pending.cmd);
        vkDestroyFence(device, pending.fence, nullptr);
    }

    inFlightBatches_.clear();
}
/// Queues a staging buffer for destruction once the current upload batch is
/// finished.
///
/// The buffer is appended to batchStagingBuffers_; the batch-ending functions
/// either destroy the queued buffers or move them into an in-flight batch
/// record.
///
/// @param staging Staging buffer to release later.
void VkContext::deferStagingCleanup(AllocatedBuffer staging) {
    batchStagingBuffers_.emplace_back(staging);
}

View file

@ -2325,13 +2325,27 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
const auto& attemptedCandidates = uniqueCandidates;
// Try loading all candidates until one succeeds
// Check pre-decoded BLP cache first (populated by background worker threads)
pipeline::BLPImage blp;
std::string resolvedKey;
for (const auto& c : attemptedCandidates) {
blp = assetManager->loadTexture(c);
if (blp.isValid()) {
resolvedKey = c;
break;
if (predecodedBLPCache_) {
for (const auto& c : uniqueCandidates) {
auto pit = predecodedBLPCache_->find(c);
if (pit != predecodedBLPCache_->end()) {
blp = std::move(pit->second);
predecodedBLPCache_->erase(pit);
resolvedKey = c;
break;
}
}
}
if (!blp.isValid()) {
for (const auto& c : attemptedCandidates) {
blp = assetManager->loadTexture(c);
if (blp.isValid()) {
resolvedKey = c;
break;
}
}
}
if (!blp.isValid()) {
@ -2369,10 +2383,10 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
texture->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
VK_SAMPLER_ADDRESS_MODE_REPEAT);
// Generate normal+height map from diffuse pixels
// Generate normal+height map from diffuse pixels (skip during streaming to avoid CPU stalls)
float nhVariance = 0.0f;
std::unique_ptr<VkTexture> nhMap;
if (normalMappingEnabled_ || pomEnabled_) {
if ((normalMappingEnabled_ || pomEnabled_) && !deferNormalMaps_) {
nhMap = generateNormalHeightMap(blp.data.data(), blp.width, blp.height, nhVariance);
if (nhMap) {
approxBytes *= 2; // account for normal map in budget