Background BLP texture pre-decoding + deferred WMO normal maps (12x streaming perf)

Move CPU-heavy BLP texture decoding from main thread to background worker
threads for all hot paths: terrain M2 models, WMO doodad M2s, WMO textures,
creature models, and gameobject WMOs. Each renderer (M2, WMO, Character) now
accepts a pre-decoded BLP cache that loadTexture() checks before falling back
to synchronous decode.

Defer WMO normal/height map generation (3 per-pixel passes: luminance, box
blur, Sobel) during terrain streaming finalization — this was the dominant
remaining bottleneck after BLP pre-decoding.

Terrain streaming stalls: 1576ms → 124ms worst case.
This commit is contained in:
Kelsi 2026-03-07 15:46:56 -08:00
parent 0313bd8692
commit 7ac990cff4
13 changed files with 573 additions and 109 deletions

View file

@ -1,6 +1,7 @@
#pragma once
#include "pipeline/m2_loader.hpp"
#include "pipeline/blp_loader.hpp"
#include <vulkan/vulkan.h>
#include <vk_mem_alloc.h>
#include <glm/glm.hpp>
@ -114,7 +115,11 @@ public:
// Shadow-map hooks. Both bodies are empty: the texture and matrix passed to
// setShadowMap() are ignored, and clearShadowMap() does nothing.
// NOTE(review): presumably kept so this class matches the interface of the
// other renderers — confirm against the callers.
void setShadowMap(VkTexture*, const glm::mat4&) {}
void clearShadowMap() {}
// Pre-decoded BLP cache hook.
// Call this before loadModel() so BLP decoding can be skipped on the main thread.
// Only the pointer is stored in predecodedBLPCache_; the map is not copied.
// NOTE(review): presumably the caller must keep the map alive for as long as
// it is installed here — confirm against the call sites.
void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache)
{
    predecodedBLPCache_ = cache;
}
private:
std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
// GPU representation of M2 model
struct M2ModelGPU {
VkBuffer vertexBuffer = VK_NULL_HANDLE;
@ -180,6 +185,7 @@ private:
// Bone update throttling (skip frames for distant characters)
uint32_t boneUpdateCounter = 0;
const M2ModelGPU* cachedModel = nullptr; // Avoid per-frame hash lookups
// Per-instance bone SSBO (double-buffered per frame)
VkBuffer boneBuffer[2] = {};

View file

@ -1,6 +1,7 @@
#pragma once
#include "pipeline/m2_loader.hpp"
#include "pipeline/blp_loader.hpp"
#include <vulkan/vulkan.h>
#include <vk_mem_alloc.h>
#include <glm/glm.hpp>
@ -188,6 +189,7 @@ struct M2Instance {
bool skipCollision = false; // WMO interior doodads — skip player wall collision
float cachedBoundRadius = 0.0f;
float portalSpinAngle = 0.0f; // Accumulated spin angle for portal rotation
const M2ModelGPU* cachedModel = nullptr; // Avoid per-frame hash lookups
// Frame-skip optimization (update distant animations less frequently)
uint8_t frameSkipCounter = 0;
@ -328,6 +330,10 @@ public:
std::vector<glm::vec3> getWaterVegetationPositions(const glm::vec3& camPos, float maxDist) const;
// Pre-decoded BLP cache hook.
// The terrain manager installs the cache before it calls loadModel(), which
// lets loadTexture() avoid the expensive assetManager->loadTexture() call.
// Only the pointer is stored in predecodedBLPCache_; the map is not copied.
// NOTE(review): presumably the caller must keep the map alive for as long as
// it is installed here — confirm against the terrain manager.
void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache)
{
    predecodedBLPCache_ = cache;
}
private:
bool initialized_ = false;
bool insideInterior = false;
@ -414,6 +420,8 @@ private:
uint32_t modelLimitRejectWarnings_ = 0;
VkTexture* loadTexture(const std::string& path, uint32_t texFlags = 0);
std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
struct TextureCacheEntry {
std::unique_ptr<VkTexture> texture;
size_t approxBytes = 0;

View file

@ -121,6 +121,12 @@ struct PendingTile {
// Pre-loaded terrain texture BLP data (loaded on background thread to avoid
// blocking file I/O on the main thread during finalizeTile)
std::unordered_map<std::string, pipeline::BLPImage> preloadedTextures;
// Pre-decoded M2 model textures (decoded on background thread)
std::unordered_map<std::string, pipeline::BLPImage> preloadedM2Textures;
// Pre-decoded WMO textures (decoded on background thread)
std::unordered_map<std::string, pipeline::BLPImage> preloadedWMOTextures;
};
/**

View file

@ -50,9 +50,12 @@ public:
// Batch upload mode: records multiple upload commands into a single
// command buffer, then submits with ONE fence wait instead of one per upload.
// Starts a batch; pair with one of the endUploadBatch*() calls below.
void beginUploadBatch();
// NOTE(review): endUploadBatch() is declared on the next two lines. This looks
// like the before/after pair of a diff hunk — confirm that only the commented
// (async) declaration exists in the real header.
void endUploadBatch();
void endUploadBatch(); // Async: submits but does NOT wait for fence
void endUploadBatchSync(); // Sync: submits and waits (for load screens)
// Reports the current value of inUploadBatch_.
bool isInUploadBatch() const { return inUploadBatch_; }
// Takes a staging buffer by value.
// NOTE(review): presumably queues it for release once the batch that uses it
// has completed — confirm in the implementation.
void deferStagingCleanup(AllocatedBuffer staging);
void pollUploadBatches(); // Check completed async uploads, free staging buffers
void waitAllUploads(); // Block until all in-flight uploads complete
// Accessors
VkInstance getInstance() const { return instance; }
@ -157,6 +160,14 @@ private:
VkCommandBuffer batchCmd_ = VK_NULL_HANDLE;
std::vector<AllocatedBuffer> batchStagingBuffers_;
// Async upload bookkeeping: one record per submitted batch that is still
// awaiting GPU completion.
struct InFlightBatch {
    // NOTE(review): presumably signaled when this batch's commands have
    // finished executing — confirm in the implementation.
    VkFence fence{VK_NULL_HANDLE};
    // Command buffer associated with this batch.
    VkCommandBuffer cmd{VK_NULL_HANDLE};
    // Staging buffers that belong to this batch.
    std::vector<AllocatedBuffer> stagingBuffers{};
};
// All batches currently in flight.
std::vector<InFlightBatch> inFlightBatches_;
// Depth buffer (shared across all framebuffers)
VkImage depthImage = VK_NULL_HANDLE;
VkImageView depthImageView = VK_NULL_HANDLE;

View file

@ -1,5 +1,6 @@
#pragma once
#include "pipeline/blp_loader.hpp"
#include <vulkan/vulkan.h>
#include <vk_mem_alloc.h>
#include <glm/glm.hpp>
@ -325,6 +326,12 @@ public:
// Pre-compute floor cache for all loaded WMO instances
void precomputeFloorCache();
// Pre-decoded BLP cache hook.
// Call this before loadModel() so BLP decoding can be skipped on the main thread.
// Only the pointer is stored in predecodedBLPCache_; the map is not copied.
// NOTE(review): presumably the caller must keep the map alive for as long as
// it is installed here — confirm against the call sites.
void setPredecodedBLPCache(std::unordered_map<std::string, pipeline::BLPImage>* cache)
{
    predecodedBLPCache_ = cache;
}
// Switches deferral of normal/height map generation on or off.
// Meant to be enabled while streaming so that generation does not stall the CPU.
// This call only records the flag in deferNormalMaps_.
void setDeferNormalMaps(bool defer)
{
    deferNormalMaps_ = defer;
}
private:
// WMO material UBO — matches WMOMaterial in wmo.frag.glsl
struct WMOMaterialUBO {
@ -558,6 +565,7 @@ private:
* Load a texture from path
*/
VkTexture* loadTexture(const std::string& path);
std::unordered_map<std::string, pipeline::BLPImage>* predecodedBLPCache_ = nullptr;
/**
* Generate normal+height map from diffuse RGBA8 pixels
@ -670,6 +678,7 @@ private:
// Normal mapping / POM settings
bool normalMappingEnabled_ = true; // on by default
bool deferNormalMaps_ = false; // skip normal map gen during streaming
float normalMapStrength_ = 0.8f; // 0.0 = flat, 1.0 = full, 2.0 = exaggerated
bool pomEnabled_ = true; // on by default
int pomQuality_ = 1; // 0=Low(16), 1=Medium(32), 2=High(64)