Optimize threading and texture fallback stability

2026-04-17 09:33:51 +00:00 · 2026-02-22 08:12:08 -08:00 · 2026-02-22 08:12:08 -08:00 · 9c8cd44803
commit 9c8cd44803
parent f4d947fab1
8 changed files with 251 additions and 141 deletions
--- a/include/rendering/character_renderer.hpp
+++ b/include/rendering/character_renderer.hpp
@ -260,6 +260,7 @@ private:
    size_t textureCacheBytes_ = 0;
    uint64_t textureCacheCounter_ = 0;
    size_t textureCacheBudgetBytes_ = 1024ull * 1024 * 1024;
    uint32_t textureBudgetRejectWarnings_ = 0;
    std::unique_ptr<VkTexture> whiteTexture_;
    std::unique_ptr<VkTexture> transparentTexture_;
--- a/include/rendering/wmo_renderer.hpp
+++ b/include/rendering/wmo_renderer.hpp
@ -639,7 +639,7 @@ private:
        uint32_t portalCulled = 0;
        uint32_t distanceCulled = 0;
    };
-    std::vector<std::future<std::vector<InstanceDrawList>>> cullFutures_;
+    std::vector<std::future<void>> cullFutures_;
    // Collision query profiling (per frame).
    mutable double queryTimeMs = 0.0;
--- a/src/network/world_socket.cpp
+++ b/src/network/world_socket.cpp
@ -69,7 +69,7 @@ WorldSocket::WorldSocket() {
    net::ensureInit();
    // Always reserve baseline receive capacity (safe, behavior-preserving).
    receiveBuffer.reserve(64 * 1024);
-    useFastRecvAppend_ = envFlagEnabled("WOWEE_NET_FAST_RECV_APPEND", false);
+    useFastRecvAppend_ = envFlagEnabled("WOWEE_NET_FAST_RECV_APPEND", true);
    useParseScratchQueue_ = envFlagEnabled("WOWEE_NET_PARSE_SCRATCH", false);
    if (useParseScratchQueue_) {
        LOG_WARNING("WOWEE_NET_PARSE_SCRATCH is temporarily disabled (known unstable); forcing off");
@ -304,8 +304,21 @@ void WorldSocket::update() {
                    disconnect();
                    return;
                }
-                receiveBuffer.resize(oldSize + receivedSize);
+                const size_t needed = oldSize + receivedSize;
-                std::memcpy(receiveBuffer.data() + oldSize, buffer, receivedSize);
+                if (receiveBuffer.capacity() < needed) {
                    size_t newCap = receiveBuffer.capacity() ? receiveBuffer.capacity() : 64 * 1024;
                    while (newCap < needed && newCap < kMaxReceiveBufferBytes) {
                        newCap = std::min(kMaxReceiveBufferBytes, newCap * 2);
                    }
                    if (newCap < needed) {
                        LOG_ERROR("World socket receive buffer capacity growth failed (needed=", needed,
                                  " max=", kMaxReceiveBufferBytes, "). Disconnecting to recover framing.");
                        disconnect();
                        return;
                    }
                    receiveBuffer.reserve(newCap);
                }
                receiveBuffer.insert(receiveBuffer.end(), buffer, buffer + receivedSize);
            } else {
                receiveBuffer.insert(receiveBuffer.end(), buffer, buffer + received);
            }
@ -334,10 +347,13 @@ void WorldSocket::update() {
    }
    if (receivedAny) {
-        LOG_DEBUG("World socket read ", bytesReadThisTick, " bytes in ", readOps,
+        const bool debugLog = core::Logger::getInstance().shouldLog(core::LogLevel::DEBUG);
-                 " recv call(s), buffered=", receiveBuffer.size());
+        if (debugLog) {
-        // Hex dump received bytes for auth debugging
+            LOG_DEBUG("World socket read ", bytesReadThisTick, " bytes in ", readOps,
-        if (bytesReadThisTick <= 128) {
+                      " recv call(s), buffered=", receiveBuffer.size());
        }
        // Hex dump received bytes for auth debugging (debug-only to avoid per-frame string work)
        if (debugLog && bytesReadThisTick <= 128) {
            std::string hex;
            for (size_t i = 0; i < receiveBuffer.size(); ++i) {
                char buf[4]; snprintf(buf, sizeof(buf), "%02x ", receiveBuffer[i]); hex += buf;
@ -345,7 +361,7 @@ void WorldSocket::update() {
            LOG_DEBUG("World socket raw bytes: ", hex);
        }
        tryParsePackets();
-        if (connected && !receiveBuffer.empty()) {
+        if (debugLog && connected && !receiveBuffer.empty()) {
            LOG_DEBUG("World socket parse left ", receiveBuffer.size(),
                     " bytes buffered (awaiting complete packet)");
        }
--- a/src/pipeline/asset_manager.cpp
+++ b/src/pipeline/asset_manager.cpp
@ -29,6 +29,19 @@ size_t parseEnvSizeMB(const char* name) {
    }
    return static_cast<size_t>(mb);
 }
 // Reads a positive count from the environment variable `name`.
 // Returns `defValue` when the variable is unset, empty, non-numeric, zero,
 // negative, or too large to represent in a size_t.
 size_t parseEnvCount(const char* name, size_t defValue) {
    const char* v = std::getenv(name);
    if (!v || !*v) {
        return defValue;
    }
    // strtoull accepts a leading '-' and returns the negated value wrapped to
    // unsigned, so "-1" would become a gigantic count. Reject it up front.
    const char* p = v;
    while (*p == ' ' || (*p >= '\t' && *p <= '\r')) {
        ++p;  // the whitespace strtoull skips in the "C" locale
    }
    if (*p == '-') {
        return defValue;
    }
    char* end = nullptr;
    const unsigned long long n = std::strtoull(v, &end, 10);
    if (end == v || n == 0) {
        return defValue;
    }
    // ULLONG_MAX is what strtoull returns on overflow; also refuse anything
    // that would be truncated by the conversion to size_t.
    const size_t count = static_cast<size_t>(n);
    if (n == ~0ull || static_cast<unsigned long long>(count) != n) {
        return defValue;
    }
    return count;
 }
 } // namespace
 AssetManager::AssetManager() = default;
@ -148,7 +161,8 @@ BLPImage AssetManager::loadTexture(const std::string& path) {
    if (blpData.empty()) {
        static std::unordered_set<std::string> loggedMissingTextures;
        static bool missingTextureLogSuppressed = false;
-        static constexpr size_t kMaxMissingTextureLogKeys = 20000;
+        static const size_t kMaxMissingTextureLogKeys =
            parseEnvCount("WOWEE_TEXTURE_MISS_LOG_KEYS", 400);
        if (loggedMissingTextures.size() < kMaxMissingTextureLogKeys &&
            loggedMissingTextures.insert(normalizedPath).second) {
            LOG_WARNING("Texture not found: ", normalizedPath);
@ -164,7 +178,8 @@ BLPImage AssetManager::loadTexture(const std::string& path) {
    if (!image.isValid()) {
        static std::unordered_set<std::string> loggedDecodeFails;
        static bool decodeFailLogSuppressed = false;
-        static constexpr size_t kMaxDecodeFailLogKeys = 8000;
+        static const size_t kMaxDecodeFailLogKeys =
            parseEnvCount("WOWEE_TEXTURE_DECODE_LOG_KEYS", 200);
        if (loggedDecodeFails.size() < kMaxDecodeFailLogKeys &&
            loggedDecodeFails.insert(normalizedPath).second) {
            LOG_ERROR("Failed to load texture: ", normalizedPath);
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@ -56,6 +56,15 @@ size_t envSizeMBOrDefault(const char* name, size_t defMb) {
    return static_cast<size_t>(mb);
 }
 // Reads a positive integer from the environment variable `name`.
 // Returns `defValue` when the variable is unset, empty, non-numeric, zero,
 // negative, or too large to represent in a size_t.
 size_t envSizeOrDefault(const char* name, size_t defValue) {
    const char* v = std::getenv(name);
    if (!v || !*v) return defValue;
    // strtoull accepts a leading '-' and wraps the negated value to unsigned, so
    // "-1" would come back as a huge number. Reject negative input explicitly.
    const char* p = v;
    while (*p == ' ' || (*p >= '\t' && *p <= '\r')) ++p;  // whitespace strtoull skips ("C" locale)
    if (*p == '-') return defValue;
    char* end = nullptr;
    const unsigned long long n = std::strtoull(v, &end, 10);
    if (end == v || n == 0) return defValue;
    // ULLONG_MAX is strtoull's overflow result; also refuse values that would be
    // truncated by the conversion to size_t.
    const size_t value = static_cast<size_t>(n);
    if (n == ~0ull || static_cast<unsigned long long>(value) != n) return defValue;
    return value;
 }
 size_t approxTextureBytesWithMips(int w, int h) {
    if (w <= 0 || h <= 0) return 0;
    size_t base = static_cast<size_t>(w) * static_cast<size_t>(h) * 4ull;
@ -95,7 +104,13 @@ bool CharacterRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFram
    assetManager = am;
    perFrameLayout_ = perFrameLayout;
    renderPassOverride_ = renderPassOverride;
-    numAnimThreads_ = std::max(1u, std::min(8u, std::thread::hardware_concurrency()));
+    const unsigned hc = std::thread::hardware_concurrency();
    const size_t availableCores = (hc > 1u) ? static_cast<size_t>(hc - 1u) : 1ull;
    // Character updates run alongside M2/WMO work; default to a smaller share.
    const size_t defaultAnimThreads = std::max<size_t>(1, availableCores / 4);
    numAnimThreads_ = static_cast<uint32_t>(std::max<size_t>(
        1, envSizeOrDefault("WOWEE_CHAR_ANIM_THREADS", defaultAnimThreads)));
    core::Logger::getInstance().info("Character anim threads: ", numAnimThreads_);
    VkDevice device = vkCtx_->getDevice();
@ -250,7 +265,8 @@ bool CharacterRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFram
    }
    // Diagnostics-only: cache lifetime is currently tied to renderer lifetime.
-    textureCacheBudgetBytes_ = envSizeMBOrDefault("WOWEE_CHARACTER_TEX_CACHE_MB", 512) * 1024ull * 1024ull;
+    textureCacheBudgetBytes_ = envSizeMBOrDefault("WOWEE_CHARACTER_TEX_CACHE_MB", 1024) * 1024ull * 1024ull;
    LOG_INFO("Character texture cache budget: ", textureCacheBudgetBytes_ / (1024 * 1024), " MB");
    core::Logger::getInstance().info("Character renderer initialized (Vulkan)");
    return true;
@ -403,8 +419,29 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
    auto blpImage = assetManager->loadTexture(key);
    if (!blpImage.isValid()) {
        static constexpr size_t kMaxFailedTextureCache = 200000;
        core::Logger::getInstance().warning("Failed to load texture: ", path);
-        failedTextureCache_.insert(key);
+        if (failedTextureCache_.size() < kMaxFailedTextureCache) {
            failedTextureCache_.insert(key);
        }
        return whiteTexture_.get();
    }
    size_t approxBytes = approxTextureBytesWithMips(blpImage.width, blpImage.height);
    if (textureCacheBytes_ + approxBytes > textureCacheBudgetBytes_) {
        static constexpr size_t kMaxFailedTextureCache = 200000;
        if (failedTextureCache_.size() < kMaxFailedTextureCache) {
            // Budget is saturated; avoid repeatedly decoding/uploading this texture.
            failedTextureCache_.insert(key);
        }
        if (textureBudgetRejectWarnings_ < 8 || (textureBudgetRejectWarnings_ % 120) == 0) {
            core::Logger::getInstance().warning(
                "Character texture cache full (",
                textureCacheBytes_ / (1024 * 1024), " MB / ",
                textureCacheBudgetBytes_ / (1024 * 1024), " MB), rejecting texture: ",
                path);
        }
        ++textureBudgetRejectWarnings_;
        return whiteTexture_.get();
    }
@ -426,7 +463,7 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
    TextureCacheEntry e;
    e.texture = std::move(tex);
-    e.approxBytes = approxTextureBytesWithMips(blpImage.width, blpImage.height);
+    e.approxBytes = approxBytes;
    e.lastUse = ++textureCacheCounter_;
    e.hasAlpha = hasAlpha;
    e.colorKeyBlack = colorKeyBlackHint;
@ -435,12 +472,6 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
    textureColorKeyBlackByPtr_[texPtr] = colorKeyBlackHint;
    textureCache[key] = std::move(e);
    if (textureCacheBytes_ > textureCacheBudgetBytes_) {
        core::Logger::getInstance().warning(
            "Character texture cache over budget: ",
            textureCacheBytes_ / (1024 * 1024), " MB > ",
            textureCacheBudgetBytes_ / (1024 * 1024), " MB (textures=", textureCache.size(), ")");
    }
    core::Logger::getInstance().debug("Loaded character texture: ", path, " (", blpImage.width, "x", blpImage.height, ")");
    return texPtr;
 }
@ -1144,29 +1175,40 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
    // Thread animation updates in chunks to avoid spawning one task per instance.
    if (updatedCount >= 8 && numAnimThreads_ > 1) {
-        const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), updatedCount);
+        static const size_t minAnimWorkPerThread = std::max<size_t>(
-        const size_t chunkSize = updatedCount / numThreads;
+            16, envSizeOrDefault("WOWEE_CHAR_ANIM_WORK_PER_THREAD", 64));
-        const size_t remainder = updatedCount % numThreads;
+        const size_t maxUsefulThreads = std::max<size_t>(
            1, (updatedCount + minAnimWorkPerThread - 1) / minAnimWorkPerThread);
        const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), maxUsefulThreads);
-        animFutures_.clear();
+        if (numThreads <= 1) {
-        if (animFutures_.capacity() < numThreads) {
+            for (auto& instRef : toUpdate) {
-            animFutures_.reserve(numThreads);
+                updateAnimation(instRef.get(), deltaTime);
-        }
+            }
        } else {
            const size_t chunkSize = updatedCount / numThreads;
            const size_t remainder = updatedCount % numThreads;
-        size_t start = 0;
+            animFutures_.clear();
-        for (size_t t = 0; t < numThreads; t++) {
+            if (animFutures_.capacity() < numThreads) {
-            size_t end = start + chunkSize + (t < remainder ? 1 : 0);
+                animFutures_.reserve(numThreads);
-            animFutures_.push_back(std::async(std::launch::async,
+            }
                [this, &toUpdate, start, end, deltaTime]() {
                    for (size_t i = start; i < end; i++) {
                        updateAnimation(toUpdate[i].get(), deltaTime);
                    }
                }));
            start = end;
        }
-        for (auto& f : animFutures_) {
+            size_t start = 0;
-            f.get();
+            for (size_t t = 0; t < numThreads; t++) {
                size_t end = start + chunkSize + (t < remainder ? 1 : 0);
                animFutures_.push_back(std::async(std::launch::async,
                    [this, &toUpdate, start, end, deltaTime]() {
                        for (size_t i = start; i < end; i++) {
                            updateAnimation(toUpdate[i].get(), deltaTime);
                        }
                    }));
                start = end;
            }
            for (auto& f : animFutures_) {
                f.get();
            }
        }
    } else {
        // Sequential for small counts (avoid thread overhead)
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@ -49,6 +49,15 @@ size_t envSizeMBOrDefault(const char* name, size_t defMb) {
    return static_cast<size_t>(mb);
 }
 // Reads a positive integer from the environment variable `name`.
 // Returns `defValue` when the variable is unset, empty, non-numeric, zero,
 // negative, or too large to represent in a size_t.
 size_t envSizeOrDefault(const char* name, size_t defValue) {
    const char* raw = std::getenv(name);
    if (!raw || !*raw) return defValue;
    // strtoull accepts a leading '-' and wraps the negated value to unsigned, so
    // "-1" would come back as a huge number. Reject negative input explicitly.
    const char* p = raw;
    while (*p == ' ' || (*p >= '\t' && *p <= '\r')) ++p;  // whitespace strtoull skips ("C" locale)
    if (*p == '-') return defValue;
    char* end = nullptr;
    const unsigned long long v = std::strtoull(raw, &end, 10);
    if (end == raw || v == 0) return defValue;
    // ULLONG_MAX is strtoull's overflow result; also refuse values that would be
    // truncated by the conversion to size_t.
    const size_t value = static_cast<size_t>(v);
    if (v == ~0ull || static_cast<unsigned long long>(value) != v) return defValue;
    return value;
 }
 static constexpr uint32_t kParticleFlagRandomized = 0x40;
 static constexpr uint32_t kParticleFlagTiled = 0x80;
@ -299,7 +308,12 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout
    vkCtx_ = ctx;
    assetManager = assets;
-    numAnimThreads_ = std::min(4u, std::max(1u, std::thread::hardware_concurrency() - 1));
+    const unsigned hc = std::thread::hardware_concurrency();
    const size_t availableCores = (hc > 1u) ? static_cast<size_t>(hc - 1u) : 1ull;
    // Keep headroom for other frame tasks: M2 gets about half of non-main cores by default.
    const size_t defaultAnimThreads = std::max<size_t>(1, availableCores / 2);
    numAnimThreads_ = static_cast<uint32_t>(std::max<size_t>(
        1, envSizeOrDefault("WOWEE_M2_ANIM_THREADS", defaultAnimThreads)));
    LOG_INFO("Initializing M2 renderer (Vulkan, ", numAnimThreads_, " anim threads)...");
    VkDevice device = vkCtx_->getDevice();
@ -1915,7 +1929,9 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
    // Phase 2: Compute bone matrices (expensive, parallel if enough work)
    const size_t animCount = boneWorkIndices_.size();
    if (animCount > 0) {
-        if (animCount < 6 || numAnimThreads_ <= 1) {
+        static const size_t minParallelAnimInstances = std::max<size_t>(
            8, envSizeOrDefault("WOWEE_M2_ANIM_MT_MIN", 96));
        if (animCount < minParallelAnimInstances || numAnimThreads_ <= 1) {
            // Sequential — not enough work to justify thread overhead
            for (size_t i : boneWorkIndices_) {
                if (i >= instances.size()) continue;
@ -1926,35 +1942,49 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
            }
        } else {
            // Parallel — dispatch across worker threads
-            const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), animCount);
+            static const size_t minAnimWorkPerThread = std::max<size_t>(
-            const size_t chunkSize = animCount / numThreads;
+                16, envSizeOrDefault("WOWEE_M2_ANIM_WORK_PER_THREAD", 64));
-            const size_t remainder = animCount % numThreads;
+            const size_t maxUsefulThreads = std::max<size_t>(
                1, (animCount + minAnimWorkPerThread - 1) / minAnimWorkPerThread);
            const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), maxUsefulThreads);
            if (numThreads <= 1) {
                for (size_t i : boneWorkIndices_) {
                    if (i >= instances.size()) continue;
                    auto& inst = instances[i];
                    auto mdlIt = models.find(inst.modelId);
                    if (mdlIt == models.end()) continue;
                    computeBoneMatrices(mdlIt->second, inst);
                }
            } else {
                const size_t chunkSize = animCount / numThreads;
                const size_t remainder = animCount % numThreads;
-            // Reuse persistent futures vector to avoid allocation
+                // Reuse persistent futures vector to avoid allocation
-            animFutures_.clear();
+                animFutures_.clear();
-            if (animFutures_.capacity() < numThreads) {
+                if (animFutures_.capacity() < numThreads) {
-                animFutures_.reserve(numThreads);
+                    animFutures_.reserve(numThreads);
-            }
+                }
-            size_t start = 0;
+                size_t start = 0;
-            for (size_t t = 0; t < numThreads; ++t) {
+                for (size_t t = 0; t < numThreads; ++t) {
-                size_t end = start + chunkSize + (t < remainder ? 1 : 0);
+                    size_t end = start + chunkSize + (t < remainder ? 1 : 0);
-                animFutures_.push_back(std::async(std::launch::async,
+                    animFutures_.push_back(std::async(std::launch::async,
-                    [this, start, end]() {
+                        [this, start, end]() {
-                        for (size_t j = start; j < end; ++j) {
+                            for (size_t j = start; j < end; ++j) {
-                            size_t idx = boneWorkIndices_[j];
+                                size_t idx = boneWorkIndices_[j];
-                            if (idx >= instances.size()) continue;
+                                if (idx >= instances.size()) continue;
-                            auto& inst = instances[idx];
+                                auto& inst = instances[idx];
-                            auto mdlIt = models.find(inst.modelId);
+                                auto mdlIt = models.find(inst.modelId);
-                            if (mdlIt == models.end()) continue;
+                                if (mdlIt == models.end()) continue;
-                            computeBoneMatrices(mdlIt->second, inst);
+                                computeBoneMatrices(mdlIt->second, inst);
-                        }
+                            }
-                    }));
+                        }));
-                start = end;
+                    start = end;
-            }
+                }
-            for (auto& f : animFutures_) {
+                for (auto& f : animFutures_) {
-                f.get();
+                    f.get();
                }
            }
        }
    }
--- a/src/rendering/terrain_manager.cpp
+++ b/src/rendering/terrain_manager.cpp
@ -18,6 +18,7 @@
 #include <glm/gtx/euler_angles.hpp>
 #include <cmath>
 #include <cctype>
 #include <cstdlib>
 #include <functional>
 #include <unordered_set>
@ -26,6 +27,26 @@ namespace rendering {
 namespace {
 // Picks the number of terrain streaming worker threads.
 // WOWEE_TERRAIN_WORKERS, when set to a positive integer that fits in an int,
 // overrides the automatic choice. Otherwise half of the cores left after
 // reserving one is used, with a minimum of 2; 2 is also the fallback when
 // hardware_concurrency() reports 0.
 int computeTerrainWorkerCount() {
    const char* raw = std::getenv("WOWEE_TERRAIN_WORKERS");
    if (raw && *raw) {
        char* end = nullptr;
        const unsigned long long forced = std::strtoull(raw, &end, 10);
        // Round-trip through int: negative input (which strtoull wraps to a huge
        // unsigned value) and oversized input fail the check and fall through to
        // the automatic choice instead of yielding a negative or truncated count.
        const int workers = static_cast<int>(forced);
        if (end != raw && workers > 0 &&
            static_cast<unsigned long long>(workers) == forced) {
            return workers;
        }
    }
    const unsigned hc = std::thread::hardware_concurrency();
    if (hc > 0) {
        // Terrain streaming should leave CPU room for render/update threads.
        const unsigned availableCores = (hc > 1u) ? (hc - 1u) : 1u;
        const unsigned targetWorkers = std::max(2u, availableCores / 2u);
        return static_cast<int>(targetWorkers);
    }
    return 2;  // Fallback
 }
 bool decodeLayerAlpha(const pipeline::MapChunk& chunk, size_t layerIdx, std::vector<uint8_t>& outAlpha) {
    if (layerIdx >= chunk.layers.size()) return false;
    const auto& layer = chunk.layers[layerIdx];
@ -128,15 +149,9 @@ bool TerrainManager::initialize(pipeline::AssetManager* assets, TerrainRenderer*
    LOG_INFO("Terrain tile cache budget: ", tileCacheBudgetBytes_ / (1024 * 1024), " MB (dynamic)");
    // Start background worker pool (dynamic: scales with available cores)
-    // Use 75% of logical cores for decompression, leaving headroom for render/OS
+    // Keep defaults moderate; env override can increase if streaming is bottlenecked.
    workerRunning.store(true);
-    unsigned hc = std::thread::hardware_concurrency();
+    workerCount = computeTerrainWorkerCount();
    if (hc > 0) {
        unsigned targetWorkers = std::max(6u, (hc * 3) / 4);  // 75% of cores, minimum 6
        workerCount = static_cast<int>(targetWorkers);
    } else {
        workerCount = 6;  // Fallback
    }
    workerThreads.reserve(workerCount);
    for (int i = 0; i < workerCount; i++) {
        workerThreads.emplace_back(&TerrainManager::workerLoop, this);
@ -926,12 +941,10 @@ void TerrainManager::processReadyTiles() {
        if (pending) {
            TileCoord coord = pending->coord;
            auto tileStart = std::chrono::high_resolution_clock::now();
            finalizeTile(pending);
-            auto tileEnd = std::chrono::high_resolution_clock::now();
+            auto now = std::chrono::high_resolution_clock::now();
            float tileTimeMs = std::chrono::duration<float, std::milli>(tileEnd - tileStart).count();
            {
                std::lock_guard<std::mutex> lock(queueMutex);
@ -940,7 +953,7 @@ void TerrainManager::processReadyTiles() {
            processed++;
            // Check if we've exceeded time budget
-            float elapsedMs = std::chrono::duration<float, std::milli>(tileEnd - startTime).count();
+            float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
            if (elapsedMs >= timeBudgetMs) {
                if (processed > 1) {
                    LOG_DEBUG("Processed ", processed, " tiles in ", elapsedMs, "ms (budget: ", timeBudgetMs, "ms)");
@ -1183,13 +1196,7 @@ void TerrainManager::unloadAll() {
    // Restart worker threads so streaming can resume (dynamic: scales with available cores)
    // Worker count comes from computeTerrainWorkerCount() (env override or a moderate share of cores)
    workerRunning.store(true);
-    unsigned hc = std::thread::hardware_concurrency();
+    workerCount = computeTerrainWorkerCount();
    if (hc > 0) {
        unsigned targetWorkers = std::max(6u, (hc * 3) / 4);  // 75% of cores, minimum 6
        workerCount = static_cast<int>(targetWorkers);
    } else {
        workerCount = 6;  // Fallback
    }
    workerThreads.reserve(workerCount);
    for (int i = 0; i < workerCount; i++) {
        workerThreads.emplace_back(&TerrainManager::workerLoop, this);
--- a/src/rendering/wmo_renderer.cpp
+++ b/src/rendering/wmo_renderer.cpp
@ -37,6 +37,15 @@ size_t envSizeMBOrDefault(const char* name, size_t defMb) {
    if (end == raw || mb == 0) return defMb;
    return static_cast<size_t>(mb);
 }
 // Reads a positive integer from the environment variable `name`.
 // Returns `defValue` when the variable is unset, empty, non-numeric, zero,
 // negative, or too large to represent in a size_t.
 size_t envSizeOrDefault(const char* name, size_t defValue) {
    const char* raw = std::getenv(name);
    if (!raw || !*raw) return defValue;
    // strtoull accepts a leading '-' and wraps the negated value to unsigned, so
    // "-1" would come back as a huge number. Reject negative input explicitly.
    const char* p = raw;
    while (*p == ' ' || (*p >= '\t' && *p <= '\r')) ++p;  // whitespace strtoull skips ("C" locale)
    if (*p == '-') return defValue;
    char* end = nullptr;
    const unsigned long long v = std::strtoull(raw, &end, 10);
    if (end == raw || v == 0) return defValue;
    // ULLONG_MAX is strtoull's overflow result; also refuse values that would be
    // truncated by the conversion to size_t.
    const size_t value = static_cast<size_t>(v);
    if (v == ~0ull || static_cast<unsigned long long>(value) != v) return defValue;
    return value;
 }
 } // namespace
 static void transformAABB(const glm::mat4& modelMatrix,
@ -65,7 +74,13 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou
        return false;
    }
-    numCullThreads_ = std::min(4u, std::max(1u, std::thread::hardware_concurrency() - 1));
+    const unsigned hc = std::thread::hardware_concurrency();
    const size_t availableCores = (hc > 1u) ? static_cast<size_t>(hc - 1u) : 1ull;
    // WMO culling is lighter than animation; keep defaults conservative to reduce spikes.
    const size_t defaultCullThreads = std::max<size_t>(1, availableCores / 4);
    numCullThreads_ = static_cast<uint32_t>(std::max<size_t>(
        1, envSizeOrDefault("WOWEE_WMO_CULL_THREADS", defaultCullThreads)));
    core::Logger::getInstance().info("WMO cull threads: ", numCullThreads_);
    VkDevice device = vkCtx_->getDevice();
@ -1208,35 +1223,44 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
    std::vector<InstanceDrawList> drawLists;
    drawLists.reserve(visibleInstances.size());
-    if (visibleInstances.size() >= 4 && numCullThreads_ > 1) {
+    static const size_t minParallelCullInstances = std::max<size_t>(
-        const size_t numThreads = std::min(static_cast<size_t>(numCullThreads_),
+        4, envSizeOrDefault("WOWEE_WMO_CULL_MT_MIN", 128));
-                                           visibleInstances.size());
+    if (visibleInstances.size() >= minParallelCullInstances && numCullThreads_ > 1) {
-        const size_t chunkSize = visibleInstances.size() / numThreads;
+        static const size_t minCullWorkPerThread = std::max<size_t>(
-        const size_t remainder = visibleInstances.size() % numThreads;
+            16, envSizeOrDefault("WOWEE_WMO_CULL_WORK_PER_THREAD", 64));
        const size_t maxUsefulThreads = std::max<size_t>(
            1, (visibleInstances.size() + minCullWorkPerThread - 1) / minCullWorkPerThread);
        const size_t numThreads = std::min(static_cast<size_t>(numCullThreads_), maxUsefulThreads);
        if (numThreads <= 1) {
            for (size_t idx : visibleInstances) {
                drawLists.push_back(cullInstance(idx));
            }
        } else {
            const size_t chunkSize = visibleInstances.size() / numThreads;
            const size_t remainder = visibleInstances.size() % numThreads;
-        cullFutures_.clear();
+            drawLists.resize(visibleInstances.size());
        if (cullFutures_.capacity() < numThreads) {
            cullFutures_.reserve(numThreads);
        }
-        size_t start = 0;
+            cullFutures_.clear();
-        for (size_t t = 0; t < numThreads; ++t) {
+            if (cullFutures_.capacity() < numThreads) {
-            size_t end = start + chunkSize + (t < remainder ? 1 : 0);
+                cullFutures_.reserve(numThreads);
-            cullFutures_.push_back(std::async(std::launch::async,
+            }
                [&, start, end]() {
                    std::vector<InstanceDrawList> chunk;
                    chunk.reserve(end - start);
                    for (size_t j = start; j < end; ++j)
                        chunk.push_back(cullInstance(visibleInstances[j]));
                    return chunk;
                }));
            start = end;
        }
-        for (auto& f : cullFutures_) {
+            size_t start = 0;
-            auto chunk = f.get();
+            for (size_t t = 0; t < numThreads; ++t) {
-            for (auto& dl : chunk)
+                const size_t end = start + chunkSize + (t < remainder ? 1 : 0);
-                drawLists.push_back(std::move(dl));
+                cullFutures_.push_back(std::async(std::launch::async,
                    [&, start, end]() {
                        for (size_t j = start; j < end; ++j) {
                            drawLists[j] = cullInstance(visibleInstances[j]);
                        }
                    }));
                start = end;
            }
            for (auto& f : cullFutures_) {
                f.get();
            }
        }
    } else {
        for (size_t idx : visibleInstances)
@ -1901,16 +1925,7 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
        }
    }
-    std::vector<std::string> attemptedCandidates;
+    const auto& attemptedCandidates = uniqueCandidates;
    attemptedCandidates.reserve(uniqueCandidates.size());
    for (const auto& c : uniqueCandidates) {
        if (!failedTextureCache_.count(c)) {
            attemptedCandidates.push_back(c);
        }
    }
    if (attemptedCandidates.empty()) {
        return whiteTexture_.get();
    }
    // Try loading all candidates until one succeeds
    pipeline::BLPImage blp;
@ -1923,12 +1938,6 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
        }
    }
    if (!blp.isValid()) {
        static constexpr size_t kMaxFailedTextureCache = 200000;
        for (const auto& c : attemptedCandidates) {
            if (failedTextureCache_.size() < kMaxFailedTextureCache) {
                failedTextureCache_.insert(c);
            }
        }
        if (loggedTextureLoadFails_.insert(key).second) {
            core::Logger::getInstance().warning("WMO: Failed to load texture: ", path);
        }
@ -1943,16 +1952,6 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
    size_t base = static_cast<size_t>(blp.width) * static_cast<size_t>(blp.height) * 4ull;
    size_t approxBytes = base + (base / 3);
    if (textureCacheBytes_ + approxBytes > textureCacheBudgetBytes_) {
        static constexpr size_t kMaxFailedTextureCache = 200000;
        if (failedTextureCache_.size() < kMaxFailedTextureCache) {
            // Cache budget-rejected keys too; once saturated, repeated attempts
            // cause pointless decode churn and transient allocations.
            if (!resolvedKey.empty()) {
                failedTextureCache_.insert(resolvedKey);
            } else {
                failedTextureCache_.insert(key);
            }
        }
        if (textureBudgetRejectWarnings_ < 8 || (textureBudgetRejectWarnings_ % 120) == 0) {
            core::Logger::getInstance().warning(
                "WMO texture cache full (", textureCacheBytes_ / (1024 * 1024),