From 9c8cd44803ed002c8cb57d09a9631bd12c8ca6a8 Mon Sep 17 00:00:00 2001
From: Kelsi <kelsihates2fa@gmail.com>
Date: Sun, 22 Feb 2026 08:12:08 -0800
Subject: [PATCH] Optimize threading and texture fallback stability

---
 include/rendering/character_renderer.hpp |   1 +
 include/rendering/wmo_renderer.hpp       |   2 +-
 src/network/world_socket.cpp             |  32 +++++--
 src/pipeline/asset_manager.cpp           |  19 +++-
 src/rendering/character_renderer.cpp     | 102 +++++++++++++++-------
 src/rendering/m2_renderer.cpp            |  86 +++++++++++++------
 src/rendering/terrain_manager.cpp        |  45 ++++++----
 src/rendering/wmo_renderer.cpp           | 105 +++++++++++------------
 8 files changed, 251 insertions(+), 141 deletions(-)
diff --git a/include/rendering/character_renderer.hpp b/include/rendering/character_renderer.hpp
index 24e0696f..6505ac76 100644
--- a/include/rendering/character_renderer.hpp
+++ b/include/rendering/character_renderer.hpp
@@ -260,6 +260,7 @@ private:
     size_t textureCacheBytes_ = 0;
     uint64_t textureCacheCounter_ = 0;
     size_t textureCacheBudgetBytes_ = 1024ull * 1024 * 1024;
+    uint32_t textureBudgetRejectWarnings_ = 0;
     std::unique_ptr<VkTexture> whiteTexture_;
     std::unique_ptr<VkTexture> transparentTexture_;
 
diff --git a/include/rendering/wmo_renderer.hpp b/include/rendering/wmo_renderer.hpp
index 21cbda9f..9deefaa2 100644
--- a/include/rendering/wmo_renderer.hpp
+++ b/include/rendering/wmo_renderer.hpp
@@ -639,7 +639,7 @@ private:
         uint32_t portalCulled = 0;
         uint32_t distanceCulled = 0;
     };
-    std::vector<std::future<std::vector<InstanceDrawList>>> cullFutures_;
+    std::vector<std::future<void>> cullFutures_;
 
     // Collision query profiling (per frame).
     mutable double queryTimeMs = 0.0;
diff --git a/src/network/world_socket.cpp b/src/network/world_socket.cpp
index 10d7f950..ce5939ec 100644
--- a/src/network/world_socket.cpp
+++ b/src/network/world_socket.cpp
@@ -69,7 +69,7 @@ WorldSocket::WorldSocket() {
     net::ensureInit();
     // Always reserve baseline receive capacity (safe, behavior-preserving).
     receiveBuffer.reserve(64 * 1024);
-    useFastRecvAppend_ = envFlagEnabled("WOWEE_NET_FAST_RECV_APPEND", false);
+    useFastRecvAppend_ = envFlagEnabled("WOWEE_NET_FAST_RECV_APPEND", true);
     useParseScratchQueue_ = envFlagEnabled("WOWEE_NET_PARSE_SCRATCH", false);
     if (useParseScratchQueue_) {
         LOG_WARNING("WOWEE_NET_PARSE_SCRATCH is temporarily disabled (known unstable); forcing off");
@@ -304,8 +304,21 @@ void WorldSocket::update() {
                     disconnect();
                     return;
                 }
-                receiveBuffer.resize(oldSize + receivedSize);
-                std::memcpy(receiveBuffer.data() + oldSize, buffer, receivedSize);
+                const size_t needed = oldSize + receivedSize;
+                if (receiveBuffer.capacity() < needed) {
+                    size_t newCap = receiveBuffer.capacity() ? receiveBuffer.capacity() : 64 * 1024;
+                    while (newCap < needed && newCap < kMaxReceiveBufferBytes) {
+                        newCap = std::min(kMaxReceiveBufferBytes, newCap * 2);
+                    }
+                    if (newCap < needed) {
+                        LOG_ERROR("World socket receive buffer capacity growth failed (needed=", needed,
+                                  " max=", kMaxReceiveBufferBytes, "). Disconnecting to recover framing.");
+                        disconnect();
+                        return;
+                    }
+                    receiveBuffer.reserve(newCap);
+                }
+                receiveBuffer.insert(receiveBuffer.end(), buffer, buffer + receivedSize);
             } else {
                 receiveBuffer.insert(receiveBuffer.end(), buffer, buffer + received);
             }
@@ -334,10 +347,13 @@ void WorldSocket::update() {
     }
 
     if (receivedAny) {
-        LOG_DEBUG("World socket read ", bytesReadThisTick, " bytes in ", readOps,
-                 " recv call(s), buffered=", receiveBuffer.size());
-        // Hex dump received bytes for auth debugging
-        if (bytesReadThisTick <= 128) {
+        const bool debugLog = core::Logger::getInstance().shouldLog(core::LogLevel::DEBUG);
+        if (debugLog) {
+            LOG_DEBUG("World socket read ", bytesReadThisTick, " bytes in ", readOps,
+                      " recv call(s), buffered=", receiveBuffer.size());
+        }
+        // Hex dump received bytes for auth debugging (debug-only to avoid per-frame string work)
+        if (debugLog && bytesReadThisTick <= 128) {
             std::string hex;
             for (size_t i = 0; i < receiveBuffer.size(); ++i) {
                 char buf[4]; snprintf(buf, sizeof(buf), "%02x ", receiveBuffer[i]); hex += buf;
@@ -345,7 +361,7 @@ void WorldSocket::update() {
             LOG_DEBUG("World socket raw bytes: ", hex);
         }
         tryParsePackets();
-        if (connected && !receiveBuffer.empty()) {
+        if (debugLog && connected && !receiveBuffer.empty()) {
             LOG_DEBUG("World socket parse left ", receiveBuffer.size(),
                      " bytes buffered (awaiting complete packet)");
         }
diff --git a/src/pipeline/asset_manager.cpp b/src/pipeline/asset_manager.cpp
index f3ebacfc..bacb3aa5 100644
--- a/src/pipeline/asset_manager.cpp
+++ b/src/pipeline/asset_manager.cpp
@@ -29,6 +29,19 @@ size_t parseEnvSizeMB(const char* name) {
     }
     return static_cast<size_t>(mb);
 }
+
+// Reads env var `name` as a base-10 count.
+// Returns defValue when the variable is unset, empty, non-numeric, zero or negative.
+size_t parseEnvCount(const char* name, size_t defValue) {
+    const char* v = std::getenv(name);
+    if (!v || !*v) {
+        return defValue;
+    }
+    char* end = nullptr;
+    // Signed parse on purpose: strtoull would silently wrap "-1" to a huge count.
+    const long long n = std::strtoll(v, &end, 10);
+    return (end == v || n <= 0) ? defValue : static_cast<size_t>(n);
+}
 } // namespace
 
 AssetManager::AssetManager() = default;
@@ -148,7 +161,8 @@ BLPImage AssetManager::loadTexture(const std::string& path) {
     if (blpData.empty()) {
         static std::unordered_set<std::string> loggedMissingTextures;
         static bool missingTextureLogSuppressed = false;
-        static constexpr size_t kMaxMissingTextureLogKeys = 20000;
+        static const size_t kMaxMissingTextureLogKeys =
+            parseEnvCount("WOWEE_TEXTURE_MISS_LOG_KEYS", 400);
         if (loggedMissingTextures.size() < kMaxMissingTextureLogKeys &&
             loggedMissingTextures.insert(normalizedPath).second) {
             LOG_WARNING("Texture not found: ", normalizedPath);
@@ -164,7 +178,8 @@ BLPImage AssetManager::loadTexture(const std::string& path) {
     if (!image.isValid()) {
         static std::unordered_set<std::string> loggedDecodeFails;
         static bool decodeFailLogSuppressed = false;
-        static constexpr size_t kMaxDecodeFailLogKeys = 8000;
+        static const size_t kMaxDecodeFailLogKeys =
+            parseEnvCount("WOWEE_TEXTURE_DECODE_LOG_KEYS", 200);
         if (loggedDecodeFails.size() < kMaxDecodeFailLogKeys &&
             loggedDecodeFails.insert(normalizedPath).second) {
             LOG_ERROR("Failed to load texture: ", normalizedPath);
diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp
index 99d85814..56265fca 100644
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@@ -56,6 +56,15 @@ size_t envSizeMBOrDefault(const char* name, size_t defMb) {
     return static_cast<size_t>(mb);
 }
 
+// Env var `name` as a base-10 count; unset, empty, non-numeric, zero or negative yields defValue.
+size_t envSizeOrDefault(const char* name, size_t defValue) {
+    const char* v = std::getenv(name);
+    if (!v || !*v) return defValue;
+    char* end = nullptr;
+    const long long n = std::strtoll(v, &end, 10);  // signed: strtoull would wrap "-1" to a huge count
+    return (end == v || n <= 0) ? defValue : static_cast<size_t>(n);
+}
+
 size_t approxTextureBytesWithMips(int w, int h) {
     if (w <= 0 || h <= 0) return 0;
     size_t base = static_cast<size_t>(w) * static_cast<size_t>(h) * 4ull;
@@ -95,7 +104,13 @@ bool CharacterRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFram
     assetManager = am;
     perFrameLayout_ = perFrameLayout;
     renderPassOverride_ = renderPassOverride;
-    numAnimThreads_ = std::max(1u, std::min(8u, std::thread::hardware_concurrency()));
+    const unsigned hc = std::thread::hardware_concurrency();
+    const size_t availableCores = (hc > 1u) ? static_cast<size_t>(hc - 1u) : 1ull;
+    // Character updates run alongside M2/WMO work; default to a smaller share.
+    const size_t defaultAnimThreads = std::max<size_t>(1, availableCores / 4);
+    numAnimThreads_ = static_cast<uint32_t>(std::max<size_t>(
+        1, envSizeOrDefault("WOWEE_CHAR_ANIM_THREADS", defaultAnimThreads)));
+    core::Logger::getInstance().info("Character anim threads: ", numAnimThreads_);
 
     VkDevice device = vkCtx_->getDevice();
 
@@ -250,7 +265,8 @@ bool CharacterRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFram
     }
 
     // Diagnostics-only: cache lifetime is currently tied to renderer lifetime.
-    textureCacheBudgetBytes_ = envSizeMBOrDefault("WOWEE_CHARACTER_TEX_CACHE_MB", 512) * 1024ull * 1024ull;
+    textureCacheBudgetBytes_ = envSizeMBOrDefault("WOWEE_CHARACTER_TEX_CACHE_MB", 1024) * 1024ull * 1024ull;
+    LOG_INFO("Character texture cache budget: ", textureCacheBudgetBytes_ / (1024 * 1024), " MB");
 
     core::Logger::getInstance().info("Character renderer initialized (Vulkan)");
     return true;
@@ -403,8 +419,29 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
 
     auto blpImage = assetManager->loadTexture(key);
     if (!blpImage.isValid()) {
+        static constexpr size_t kMaxFailedTextureCache = 200000;
         core::Logger::getInstance().warning("Failed to load texture: ", path);
-        failedTextureCache_.insert(key);
+        if (failedTextureCache_.size() < kMaxFailedTextureCache) {
+            failedTextureCache_.insert(key);
+        }
+        return whiteTexture_.get();
+    }
+
+    size_t approxBytes = approxTextureBytesWithMips(blpImage.width, blpImage.height);
+    if (textureCacheBytes_ + approxBytes > textureCacheBudgetBytes_) {
+        static constexpr size_t kMaxFailedTextureCache = 200000;
+        if (failedTextureCache_.size() < kMaxFailedTextureCache) {
+            // Budget is saturated; avoid repeatedly decoding/uploading this texture.
+            failedTextureCache_.insert(key);
+        }
+        if (textureBudgetRejectWarnings_ < 8 || (textureBudgetRejectWarnings_ % 120) == 0) {
+            core::Logger::getInstance().warning(
+                "Character texture cache full (",
+                textureCacheBytes_ / (1024 * 1024), " MB / ",
+                textureCacheBudgetBytes_ / (1024 * 1024), " MB), rejecting texture: ",
+                path);
+        }
+        ++textureBudgetRejectWarnings_;
         return whiteTexture_.get();
     }
 
@@ -426,7 +463,7 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
 
     TextureCacheEntry e;
     e.texture = std::move(tex);
-    e.approxBytes = approxTextureBytesWithMips(blpImage.width, blpImage.height);
+    e.approxBytes = approxBytes;
     e.lastUse = ++textureCacheCounter_;
     e.hasAlpha = hasAlpha;
     e.colorKeyBlack = colorKeyBlackHint;
@@ -435,12 +472,6 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
     textureColorKeyBlackByPtr_[texPtr] = colorKeyBlackHint;
     textureCache[key] = std::move(e);
 
-    if (textureCacheBytes_ > textureCacheBudgetBytes_) {
-        core::Logger::getInstance().warning(
-            "Character texture cache over budget: ",
-            textureCacheBytes_ / (1024 * 1024), " MB > ",
-            textureCacheBudgetBytes_ / (1024 * 1024), " MB (textures=", textureCache.size(), ")");
-    }
     core::Logger::getInstance().debug("Loaded character texture: ", path, " (", blpImage.width, "x", blpImage.height, ")");
     return texPtr;
 }
@@ -1144,29 +1175,40 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
 
     // Thread animation updates in chunks to avoid spawning one task per instance.
     if (updatedCount >= 8 && numAnimThreads_ > 1) {
-        const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), updatedCount);
-        const size_t chunkSize = updatedCount / numThreads;
-        const size_t remainder = updatedCount % numThreads;
+        static const size_t minAnimWorkPerThread = std::max<size_t>(
+            16, envSizeOrDefault("WOWEE_CHAR_ANIM_WORK_PER_THREAD", 64));
+        const size_t maxUsefulThreads = std::max<size_t>(
+            1, (updatedCount + minAnimWorkPerThread - 1) / minAnimWorkPerThread);
+        const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), maxUsefulThreads);
 
-        animFutures_.clear();
-        if (animFutures_.capacity() < numThreads) {
-            animFutures_.reserve(numThreads);
-        }
+        if (numThreads <= 1) {
+            for (auto& instRef : toUpdate) {
+                updateAnimation(instRef.get(), deltaTime);
+            }
+        } else {
+            const size_t chunkSize = updatedCount / numThreads;
+            const size_t remainder = updatedCount % numThreads;
 
-        size_t start = 0;
-        for (size_t t = 0; t < numThreads; t++) {
-            size_t end = start + chunkSize + (t < remainder ? 1 : 0);
-            animFutures_.push_back(std::async(std::launch::async,
-                [this, &toUpdate, start, end, deltaTime]() {
-                    for (size_t i = start; i < end; i++) {
-                        updateAnimation(toUpdate[i].get(), deltaTime);
-                    }
-                }));
-            start = end;
-        }
+            animFutures_.clear();
+            if (animFutures_.capacity() < numThreads) {
+                animFutures_.reserve(numThreads);
+            }
 
-        for (auto& f : animFutures_) {
-            f.get();
+            size_t start = 0;
+            for (size_t t = 0; t < numThreads; t++) {
+                size_t end = start + chunkSize + (t < remainder ? 1 : 0);
+                animFutures_.push_back(std::async(std::launch::async,
+                    [this, &toUpdate, start, end, deltaTime]() {
+                        for (size_t i = start; i < end; i++) {
+                            updateAnimation(toUpdate[i].get(), deltaTime);
+                        }
+                    }));
+                start = end;
+            }
+
+            for (auto& f : animFutures_) {
+                f.get();
+            }
         }
     } else {
         // Sequential for small counts (avoid thread overhead)
diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp
index 5f9aafda..3f670fdc 100644
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@@ -49,6 +49,15 @@ size_t envSizeMBOrDefault(const char* name, size_t defMb) {
     return static_cast<size_t>(mb);
 }
 
+// Env var `name` as a base-10 count; unset, empty, non-numeric, zero or negative yields defValue.
+size_t envSizeOrDefault(const char* name, size_t defValue) {
+    const char* raw = std::getenv(name);
+    if (!raw || !*raw) return defValue;
+    char* end = nullptr;
+    const long long v = std::strtoll(raw, &end, 10);  // signed: strtoull would wrap "-1" to a huge count
+    return (end == raw || v <= 0) ? defValue : static_cast<size_t>(v);
+}
+
 static constexpr uint32_t kParticleFlagRandomized = 0x40;
 static constexpr uint32_t kParticleFlagTiled = 0x80;
 
@@ -299,7 +308,12 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout
     vkCtx_ = ctx;
     assetManager = assets;
 
-    numAnimThreads_ = std::min(4u, std::max(1u, std::thread::hardware_concurrency() - 1));
+    const unsigned hc = std::thread::hardware_concurrency();
+    const size_t availableCores = (hc > 1u) ? static_cast<size_t>(hc - 1u) : 1ull;
+    // Keep headroom for other frame tasks: M2 gets about half of non-main cores by default.
+    const size_t defaultAnimThreads = std::max<size_t>(1, availableCores / 2);
+    numAnimThreads_ = static_cast<uint32_t>(std::max<size_t>(
+        1, envSizeOrDefault("WOWEE_M2_ANIM_THREADS", defaultAnimThreads)));
     LOG_INFO("Initializing M2 renderer (Vulkan, ", numAnimThreads_, " anim threads)...");
 
     VkDevice device = vkCtx_->getDevice();
@@ -1915,7 +1929,9 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
     // Phase 2: Compute bone matrices (expensive, parallel if enough work)
     const size_t animCount = boneWorkIndices_.size();
     if (animCount > 0) {
-        if (animCount < 6 || numAnimThreads_ <= 1) {
+        static const size_t minParallelAnimInstances = std::max<size_t>(
+            8, envSizeOrDefault("WOWEE_M2_ANIM_MT_MIN", 96));
+        if (animCount < minParallelAnimInstances || numAnimThreads_ <= 1) {
             // Sequential — not enough work to justify thread overhead
             for (size_t i : boneWorkIndices_) {
                 if (i >= instances.size()) continue;
@@ -1926,35 +1942,49 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
             }
         } else {
             // Parallel — dispatch across worker threads
-            const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), animCount);
-            const size_t chunkSize = animCount / numThreads;
-            const size_t remainder = animCount % numThreads;
+            static const size_t minAnimWorkPerThread = std::max<size_t>(
+                16, envSizeOrDefault("WOWEE_M2_ANIM_WORK_PER_THREAD", 64));
+            const size_t maxUsefulThreads = std::max<size_t>(
+                1, (animCount + minAnimWorkPerThread - 1) / minAnimWorkPerThread);
+            const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), maxUsefulThreads);
+            if (numThreads <= 1) {
+                for (size_t i : boneWorkIndices_) {
+                    if (i >= instances.size()) continue;
+                    auto& inst = instances[i];
+                    auto mdlIt = models.find(inst.modelId);
+                    if (mdlIt == models.end()) continue;
+                    computeBoneMatrices(mdlIt->second, inst);
+                }
+            } else {
+                const size_t chunkSize = animCount / numThreads;
+                const size_t remainder = animCount % numThreads;
 
-            // Reuse persistent futures vector to avoid allocation
-            animFutures_.clear();
-            if (animFutures_.capacity() < numThreads) {
-                animFutures_.reserve(numThreads);
-            }
+                // Reuse persistent futures vector to avoid allocation
+                animFutures_.clear();
+                if (animFutures_.capacity() < numThreads) {
+                    animFutures_.reserve(numThreads);
+                }
 
-            size_t start = 0;
-            for (size_t t = 0; t < numThreads; ++t) {
-                size_t end = start + chunkSize + (t < remainder ? 1 : 0);
-                animFutures_.push_back(std::async(std::launch::async,
-                    [this, start, end]() {
-                        for (size_t j = start; j < end; ++j) {
-                            size_t idx = boneWorkIndices_[j];
-                            if (idx >= instances.size()) continue;
-                            auto& inst = instances[idx];
-                            auto mdlIt = models.find(inst.modelId);
-                            if (mdlIt == models.end()) continue;
-                            computeBoneMatrices(mdlIt->second, inst);
-                        }
-                    }));
-                start = end;
-            }
+                size_t start = 0;
+                for (size_t t = 0; t < numThreads; ++t) {
+                    size_t end = start + chunkSize + (t < remainder ? 1 : 0);
+                    animFutures_.push_back(std::async(std::launch::async,
+                        [this, start, end]() {
+                            for (size_t j = start; j < end; ++j) {
+                                size_t idx = boneWorkIndices_[j];
+                                if (idx >= instances.size()) continue;
+                                auto& inst = instances[idx];
+                                auto mdlIt = models.find(inst.modelId);
+                                if (mdlIt == models.end()) continue;
+                                computeBoneMatrices(mdlIt->second, inst);
+                            }
+                        }));
+                    start = end;
+                }
 
-            for (auto& f : animFutures_) {
-                f.get();
+                for (auto& f : animFutures_) {
+                    f.get();
+                }
             }
         }
     }
diff --git a/src/rendering/terrain_manager.cpp b/src/rendering/terrain_manager.cpp
index d50e2c1e..4b5c0b7c 100644
--- a/src/rendering/terrain_manager.cpp
+++ b/src/rendering/terrain_manager.cpp
@@ -18,6 +18,7 @@
 #include <glm/gtx/euler_angles.hpp>
 #include <cmath>
 #include <cctype>
+#include <cstdlib>
 #include <functional>
 #include <unordered_set>
 
@@ -26,6 +27,26 @@ namespace rendering {
 
 namespace {
 
+// Terrain worker pool size: WOWEE_TERRAIN_WORKERS when it is a positive number (capped), else ~half the spare cores.
+int computeTerrainWorkerCount() {
+    constexpr long long kMaxForcedWorkers = 256;  // keeps a typo from requesting thousands of threads
+    const char* raw = std::getenv("WOWEE_TERRAIN_WORKERS");
+    if (raw && *raw) {
+        char* end = nullptr;
+        // Signed parse: strtoull would wrap "-1" to a huge value that cannot be represented as int.
+        const long long forced = std::strtoll(raw, &end, 10);
+        if (end != raw && forced > 0) {
+            return static_cast<int>(std::min(forced, kMaxForcedWorkers));
+        }
+    }
+
+    const unsigned hc = std::thread::hardware_concurrency();
+    if (hc == 0) return 2;  // core count unknown: fall back to the minimum pool
+    // Terrain streaming should leave CPU room for render/update threads.
+    const unsigned availableCores = (hc > 1u) ? (hc - 1u) : 1u;
+    return static_cast<int>(std::max(2u, availableCores / 2u));
+}
+
 bool decodeLayerAlpha(const pipeline::MapChunk& chunk, size_t layerIdx, std::vector<uint8_t>& outAlpha) {
     if (layerIdx >= chunk.layers.size()) return false;
     const auto& layer = chunk.layers[layerIdx];
@@ -128,15 +149,9 @@ bool TerrainManager::initialize(pipeline::AssetManager* assets, TerrainRenderer*
     LOG_INFO("Terrain tile cache budget: ", tileCacheBudgetBytes_ / (1024 * 1024), " MB (dynamic)");
 
     // Start background worker pool (dynamic: scales with available cores)
-    // Use 75% of logical cores for decompression, leaving headroom for render/OS
+    // Keep defaults moderate; env override can increase if streaming is bottlenecked.
     workerRunning.store(true);
-    unsigned hc = std::thread::hardware_concurrency();
-    if (hc > 0) {
-        unsigned targetWorkers = std::max(6u, (hc * 3) / 4);  // 75% of cores, minimum 6
-        workerCount = static_cast<int>(targetWorkers);
-    } else {
-        workerCount = 6;  // Fallback
-    }
+    workerCount = computeTerrainWorkerCount();
     workerThreads.reserve(workerCount);
     for (int i = 0; i < workerCount; i++) {
         workerThreads.emplace_back(&TerrainManager::workerLoop, this);
@@ -926,12 +941,10 @@ void TerrainManager::processReadyTiles() {
 
         if (pending) {
             TileCoord coord = pending->coord;
-            auto tileStart = std::chrono::high_resolution_clock::now();
 
             finalizeTile(pending);
 
-            auto tileEnd = std::chrono::high_resolution_clock::now();
-            float tileTimeMs = std::chrono::duration<float, std::milli>(tileEnd - tileStart).count();
+            auto now = std::chrono::high_resolution_clock::now();
 
             {
                 std::lock_guard<std::mutex> lock(queueMutex);
@@ -940,7 +953,7 @@ void TerrainManager::processReadyTiles() {
             processed++;
 
             // Check if we've exceeded time budget
-            float elapsedMs = std::chrono::duration<float, std::milli>(tileEnd - startTime).count();
+            float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
             if (elapsedMs >= timeBudgetMs) {
                 if (processed > 1) {
                     LOG_DEBUG("Processed ", processed, " tiles in ", elapsedMs, "ms (budget: ", timeBudgetMs, "ms)");
@@ -1183,13 +1196,7 @@ void TerrainManager::unloadAll() {
     // Restart worker threads so streaming can resume (dynamic: scales with available cores)
-    // Use 75% of logical cores for decompression, leaving headroom for render/OS
+    // Keep defaults moderate; env override can increase if streaming is bottlenecked.
     workerRunning.store(true);
-    unsigned hc = std::thread::hardware_concurrency();
-    if (hc > 0) {
-        unsigned targetWorkers = std::max(6u, (hc * 3) / 4);  // 75% of cores, minimum 6
-        workerCount = static_cast<int>(targetWorkers);
-    } else {
-        workerCount = 6;  // Fallback
-    }
+    workerCount = computeTerrainWorkerCount();
     workerThreads.reserve(workerCount);
     for (int i = 0; i < workerCount; i++) {
         workerThreads.emplace_back(&TerrainManager::workerLoop, this);
diff --git a/src/rendering/wmo_renderer.cpp b/src/rendering/wmo_renderer.cpp
index a2f97a24..15705a05 100644
--- a/src/rendering/wmo_renderer.cpp
+++ b/src/rendering/wmo_renderer.cpp
@@ -37,6 +37,15 @@ size_t envSizeMBOrDefault(const char* name, size_t defMb) {
     if (end == raw || mb == 0) return defMb;
     return static_cast<size_t>(mb);
 }
+
+// Env var `name` as a base-10 count; unset, empty, non-numeric, zero or negative yields defValue.
+size_t envSizeOrDefault(const char* name, size_t defValue) {
+    const char* raw = std::getenv(name);
+    if (!raw || !*raw) return defValue;
+    char* end = nullptr;
+    const long long v = std::strtoll(raw, &end, 10);  // signed: strtoull would wrap "-1" to a huge count
+    return (end == raw || v <= 0) ? defValue : static_cast<size_t>(v);
+}
 } // namespace
 
 static void transformAABB(const glm::mat4& modelMatrix,
@@ -65,7 +74,13 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou
         return false;
     }
 
-    numCullThreads_ = std::min(4u, std::max(1u, std::thread::hardware_concurrency() - 1));
+    const unsigned hc = std::thread::hardware_concurrency();
+    const size_t availableCores = (hc > 1u) ? static_cast<size_t>(hc - 1u) : 1ull;
+    // WMO culling is lighter than animation; keep defaults conservative to reduce spikes.
+    const size_t defaultCullThreads = std::max<size_t>(1, availableCores / 4);
+    numCullThreads_ = static_cast<uint32_t>(std::max<size_t>(
+        1, envSizeOrDefault("WOWEE_WMO_CULL_THREADS", defaultCullThreads)));
+    core::Logger::getInstance().info("WMO cull threads: ", numCullThreads_);
 
     VkDevice device = vkCtx_->getDevice();
 
@@ -1208,35 +1223,44 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
     std::vector<InstanceDrawList> drawLists;
     drawLists.reserve(visibleInstances.size());
 
-    if (visibleInstances.size() >= 4 && numCullThreads_ > 1) {
-        const size_t numThreads = std::min(static_cast<size_t>(numCullThreads_),
-                                           visibleInstances.size());
-        const size_t chunkSize = visibleInstances.size() / numThreads;
-        const size_t remainder = visibleInstances.size() % numThreads;
+    static const size_t minParallelCullInstances = std::max<size_t>(
+        4, envSizeOrDefault("WOWEE_WMO_CULL_MT_MIN", 128));
+    if (visibleInstances.size() >= minParallelCullInstances && numCullThreads_ > 1) {
+        static const size_t minCullWorkPerThread = std::max<size_t>(
+            16, envSizeOrDefault("WOWEE_WMO_CULL_WORK_PER_THREAD", 64));
+        const size_t maxUsefulThreads = std::max<size_t>(
+            1, (visibleInstances.size() + minCullWorkPerThread - 1) / minCullWorkPerThread);
+        const size_t numThreads = std::min(static_cast<size_t>(numCullThreads_), maxUsefulThreads);
+        if (numThreads <= 1) {
+            for (size_t idx : visibleInstances) {
+                drawLists.push_back(cullInstance(idx));
+            }
+        } else {
+            const size_t chunkSize = visibleInstances.size() / numThreads;
+            const size_t remainder = visibleInstances.size() % numThreads;
 
-        cullFutures_.clear();
-        if (cullFutures_.capacity() < numThreads) {
-            cullFutures_.reserve(numThreads);
-        }
+            drawLists.resize(visibleInstances.size());
 
-        size_t start = 0;
-        for (size_t t = 0; t < numThreads; ++t) {
-            size_t end = start + chunkSize + (t < remainder ? 1 : 0);
-            cullFutures_.push_back(std::async(std::launch::async,
-                [&, start, end]() {
-                    std::vector<InstanceDrawList> chunk;
-                    chunk.reserve(end - start);
-                    for (size_t j = start; j < end; ++j)
-                        chunk.push_back(cullInstance(visibleInstances[j]));
-                    return chunk;
-                }));
-            start = end;
-        }
+            cullFutures_.clear();
+            if (cullFutures_.capacity() < numThreads) {
+                cullFutures_.reserve(numThreads);
+            }
 
-        for (auto& f : cullFutures_) {
-            auto chunk = f.get();
-            for (auto& dl : chunk)
-                drawLists.push_back(std::move(dl));
+            size_t start = 0;
+            for (size_t t = 0; t < numThreads; ++t) {
+                const size_t end = start + chunkSize + (t < remainder ? 1 : 0);
+                cullFutures_.push_back(std::async(std::launch::async,
+                    [&, start, end]() {
+                        for (size_t j = start; j < end; ++j) {
+                            drawLists[j] = cullInstance(visibleInstances[j]);
+                        }
+                    }));
+                start = end;
+            }
+
+            for (auto& f : cullFutures_) {
+                f.get();
+            }
         }
     } else {
         for (size_t idx : visibleInstances)
@@ -1901,16 +1925,7 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
         }
     }
 
-    std::vector<std::string> attemptedCandidates;
-    attemptedCandidates.reserve(uniqueCandidates.size());
-    for (const auto& c : uniqueCandidates) {
-        if (!failedTextureCache_.count(c)) {
-            attemptedCandidates.push_back(c);
-        }
-    }
-    if (attemptedCandidates.empty()) {
-        return whiteTexture_.get();
-    }
+    const auto& attemptedCandidates = uniqueCandidates;
 
     // Try loading all candidates until one succeeds
     pipeline::BLPImage blp;
@@ -1923,12 +1938,6 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
         }
     }
     if (!blp.isValid()) {
-        static constexpr size_t kMaxFailedTextureCache = 200000;
-        for (const auto& c : attemptedCandidates) {
-            if (failedTextureCache_.size() < kMaxFailedTextureCache) {
-                failedTextureCache_.insert(c);
-            }
-        }
         if (loggedTextureLoadFails_.insert(key).second) {
             core::Logger::getInstance().warning("WMO: Failed to load texture: ", path);
         }
@@ -1943,16 +1952,6 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
     size_t base = static_cast<size_t>(blp.width) * static_cast<size_t>(blp.height) * 4ull;
     size_t approxBytes = base + (base / 3);
     if (textureCacheBytes_ + approxBytes > textureCacheBudgetBytes_) {
-        static constexpr size_t kMaxFailedTextureCache = 200000;
-        if (failedTextureCache_.size() < kMaxFailedTextureCache) {
-            // Cache budget-rejected keys too; once saturated, repeated attempts
-            // cause pointless decode churn and transient allocations.
-            if (!resolvedKey.empty()) {
-                failedTextureCache_.insert(resolvedKey);
-            } else {
-                failedTextureCache_.insert(key);
-            }
-        }
         if (textureBudgetRejectWarnings_ < 8 || (textureBudgetRejectWarnings_ % 120) == 0) {
             core::Logger::getInstance().warning(
                 "WMO texture cache full (", textureCacheBytes_ / (1024 * 1024),