From 9c8cd44803ed002c8cb57d09a9631bd12c8ca6a8 Mon Sep 17 00:00:00 2001 From: Kelsi Date: Sun, 22 Feb 2026 08:12:08 -0800 Subject: [PATCH] Optimize threading and texture fallback stability --- include/rendering/character_renderer.hpp | 1 + include/rendering/wmo_renderer.hpp | 2 +- src/network/world_socket.cpp | 32 +++++-- src/pipeline/asset_manager.cpp | 19 +++- src/rendering/character_renderer.cpp | 102 +++++++++++++++------- src/rendering/m2_renderer.cpp | 86 +++++++++++++------ src/rendering/terrain_manager.cpp | 45 ++++++---- src/rendering/wmo_renderer.cpp | 105 +++++++++++------------ 8 files changed, 251 insertions(+), 141 deletions(-) diff --git a/include/rendering/character_renderer.hpp b/include/rendering/character_renderer.hpp index 24e0696f..6505ac76 100644 --- a/include/rendering/character_renderer.hpp +++ b/include/rendering/character_renderer.hpp @@ -260,6 +260,7 @@ private: size_t textureCacheBytes_ = 0; uint64_t textureCacheCounter_ = 0; size_t textureCacheBudgetBytes_ = 1024ull * 1024 * 1024; + uint32_t textureBudgetRejectWarnings_ = 0; std::unique_ptr whiteTexture_; std::unique_ptr transparentTexture_; diff --git a/include/rendering/wmo_renderer.hpp b/include/rendering/wmo_renderer.hpp index 21cbda9f..9deefaa2 100644 --- a/include/rendering/wmo_renderer.hpp +++ b/include/rendering/wmo_renderer.hpp @@ -639,7 +639,7 @@ private: uint32_t portalCulled = 0; uint32_t distanceCulled = 0; }; - std::vector>> cullFutures_; + std::vector> cullFutures_; // Collision query profiling (per frame). mutable double queryTimeMs = 0.0; diff --git a/src/network/world_socket.cpp b/src/network/world_socket.cpp index 10d7f950..ce5939ec 100644 --- a/src/network/world_socket.cpp +++ b/src/network/world_socket.cpp @@ -69,7 +69,7 @@ WorldSocket::WorldSocket() { net::ensureInit(); // Always reserve baseline receive capacity (safe, behavior-preserving). receiveBuffer.reserve(64 * 1024); - useFastRecvAppend_ = envFlagEnabled("WOWEE_NET_FAST_RECV_APPEND", false); + useFastRecvAppend_ = envFlagEnabled("WOWEE_NET_FAST_RECV_APPEND", true); useParseScratchQueue_ = envFlagEnabled("WOWEE_NET_PARSE_SCRATCH", false); if (useParseScratchQueue_) { LOG_WARNING("WOWEE_NET_PARSE_SCRATCH is temporarily disabled (known unstable); forcing off"); @@ -304,8 +304,21 @@ void WorldSocket::update() { disconnect(); return; } - receiveBuffer.resize(oldSize + receivedSize); - std::memcpy(receiveBuffer.data() + oldSize, buffer, receivedSize); + const size_t needed = oldSize + receivedSize; + if (receiveBuffer.capacity() < needed) { + size_t newCap = receiveBuffer.capacity() ? receiveBuffer.capacity() : 64 * 1024; + while (newCap < needed && newCap < kMaxReceiveBufferBytes) { + newCap = std::min(kMaxReceiveBufferBytes, newCap * 2); + } + if (newCap < needed) { + LOG_ERROR("World socket receive buffer capacity growth failed (needed=", needed, + " max=", kMaxReceiveBufferBytes, "). Disconnecting to recover framing."); + disconnect(); + return; + } + receiveBuffer.reserve(newCap); + } + receiveBuffer.insert(receiveBuffer.end(), buffer, buffer + receivedSize); } else { receiveBuffer.insert(receiveBuffer.end(), buffer, buffer + received); } @@ -334,10 +347,13 @@ void WorldSocket::update() { } if (receivedAny) { - LOG_DEBUG("World socket read ", bytesReadThisTick, " bytes in ", readOps, - " recv call(s), buffered=", receiveBuffer.size()); - // Hex dump received bytes for auth debugging - if (bytesReadThisTick <= 128) { + const bool debugLog = core::Logger::getInstance().shouldLog(core::LogLevel::DEBUG); + if (debugLog) { + LOG_DEBUG("World socket read ", bytesReadThisTick, " bytes in ", readOps, + " recv call(s), buffered=", receiveBuffer.size()); + } + // Hex dump received bytes for auth debugging (debug-only to avoid per-frame string work) + if (debugLog && bytesReadThisTick <= 128) { std::string hex; for (size_t i = 0; i < receiveBuffer.size(); ++i) { char buf[4]; snprintf(buf, sizeof(buf), "%02x ", receiveBuffer[i]); hex += buf; @@ -345,7 +361,7 @@ void WorldSocket::update() { LOG_DEBUG("World socket raw bytes: ", hex); } tryParsePackets(); - if (connected && !receiveBuffer.empty()) { + if (debugLog && connected && !receiveBuffer.empty()) { LOG_DEBUG("World socket parse left ", receiveBuffer.size(), " bytes buffered (awaiting complete packet)"); } diff --git a/src/pipeline/asset_manager.cpp b/src/pipeline/asset_manager.cpp index f3ebacfc..bacb3aa5 100644 --- a/src/pipeline/asset_manager.cpp +++ b/src/pipeline/asset_manager.cpp @@ -29,6 +29,19 @@ size_t parseEnvSizeMB(const char* name) { } return static_cast(mb); } + +size_t parseEnvCount(const char* name, size_t defValue) { + const char* v = std::getenv(name); + if (!v || !*v) { + return defValue; + } + char* end = nullptr; + unsigned long long n = std::strtoull(v, &end, 10); + if (end == v || n == 0) { + return defValue; + } + return static_cast(n); +} } // namespace AssetManager::AssetManager() = default; @@ -148,7 +161,8 @@ BLPImage AssetManager::loadTexture(const std::string& path) { if (blpData.empty()) { static std::unordered_set loggedMissingTextures; static bool missingTextureLogSuppressed = false; - static constexpr size_t kMaxMissingTextureLogKeys = 20000; + static const size_t kMaxMissingTextureLogKeys = + parseEnvCount("WOWEE_TEXTURE_MISS_LOG_KEYS", 400); if (loggedMissingTextures.size() < kMaxMissingTextureLogKeys && loggedMissingTextures.insert(normalizedPath).second) { LOG_WARNING("Texture not found: ", normalizedPath); @@ -164,7 +178,8 @@ BLPImage AssetManager::loadTexture(const std::string& path) { if (!image.isValid()) { static std::unordered_set loggedDecodeFails; static bool decodeFailLogSuppressed = false; - static constexpr size_t kMaxDecodeFailLogKeys = 8000; + static const size_t kMaxDecodeFailLogKeys = + parseEnvCount("WOWEE_TEXTURE_DECODE_LOG_KEYS", 200); if (loggedDecodeFails.size() < kMaxDecodeFailLogKeys && loggedDecodeFails.insert(normalizedPath).second) { LOG_ERROR("Failed to load texture: ", normalizedPath); diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp index 99d85814..56265fca 100644 --- a/src/rendering/character_renderer.cpp +++ b/src/rendering/character_renderer.cpp @@ -56,6 +56,15 @@ size_t envSizeMBOrDefault(const char* name, size_t defMb) { return static_cast(mb); } +size_t envSizeOrDefault(const char* name, size_t defValue) { + const char* v = std::getenv(name); + if (!v || !*v) return defValue; + char* end = nullptr; + unsigned long long n = std::strtoull(v, &end, 10); + if (end == v || n == 0) return defValue; + return static_cast(n); +} + size_t approxTextureBytesWithMips(int w, int h) { if (w <= 0 || h <= 0) return 0; size_t base = static_cast(w) * static_cast(h) * 4ull; @@ -95,7 +104,13 @@ bool CharacterRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFram assetManager = am; perFrameLayout_ = perFrameLayout; renderPassOverride_ = renderPassOverride; - numAnimThreads_ = std::max(1u, std::min(8u, std::thread::hardware_concurrency())); + const unsigned hc = std::thread::hardware_concurrency(); + const size_t availableCores = (hc > 1u) ? static_cast(hc - 1u) : 1ull; + // Character updates run alongside M2/WMO work; default to a smaller share. + const size_t defaultAnimThreads = std::max(1, availableCores / 4); + numAnimThreads_ = static_cast(std::max( + 1, envSizeOrDefault("WOWEE_CHAR_ANIM_THREADS", defaultAnimThreads))); + core::Logger::getInstance().info("Character anim threads: ", numAnimThreads_); VkDevice device = vkCtx_->getDevice(); @@ -250,7 +265,8 @@ bool CharacterRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFram } // Diagnostics-only: cache lifetime is currently tied to renderer lifetime. - textureCacheBudgetBytes_ = envSizeMBOrDefault("WOWEE_CHARACTER_TEX_CACHE_MB", 512) * 1024ull * 1024ull; + textureCacheBudgetBytes_ = envSizeMBOrDefault("WOWEE_CHARACTER_TEX_CACHE_MB", 1024) * 1024ull * 1024ull; + LOG_INFO("Character texture cache budget: ", textureCacheBudgetBytes_ / (1024 * 1024), " MB"); core::Logger::getInstance().info("Character renderer initialized (Vulkan)"); return true; @@ -403,8 +419,29 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) { auto blpImage = assetManager->loadTexture(key); if (!blpImage.isValid()) { + static constexpr size_t kMaxFailedTextureCache = 200000; core::Logger::getInstance().warning("Failed to load texture: ", path); - failedTextureCache_.insert(key); + if (failedTextureCache_.size() < kMaxFailedTextureCache) { + failedTextureCache_.insert(key); + } + return whiteTexture_.get(); + } + + size_t approxBytes = approxTextureBytesWithMips(blpImage.width, blpImage.height); + if (textureCacheBytes_ + approxBytes > textureCacheBudgetBytes_) { + static constexpr size_t kMaxFailedTextureCache = 200000; + if (failedTextureCache_.size() < kMaxFailedTextureCache) { + // Budget is saturated; avoid repeatedly decoding/uploading this texture. + failedTextureCache_.insert(key); + } + if (textureBudgetRejectWarnings_ < 8 || (textureBudgetRejectWarnings_ % 120) == 0) { + core::Logger::getInstance().warning( + "Character texture cache full (", + textureCacheBytes_ / (1024 * 1024), " MB / ", + textureCacheBudgetBytes_ / (1024 * 1024), " MB), rejecting texture: ", + path); + } + ++textureBudgetRejectWarnings_; return whiteTexture_.get(); } @@ -426,7 +463,7 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) { TextureCacheEntry e; e.texture = std::move(tex); - e.approxBytes = approxTextureBytesWithMips(blpImage.width, blpImage.height); + e.approxBytes = approxBytes; e.lastUse = ++textureCacheCounter_; e.hasAlpha = hasAlpha; e.colorKeyBlack = colorKeyBlackHint; @@ -435,12 +472,6 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) { textureColorKeyBlackByPtr_[texPtr] = colorKeyBlackHint; textureCache[key] = std::move(e); - if (textureCacheBytes_ > textureCacheBudgetBytes_) { - core::Logger::getInstance().warning( - "Character texture cache over budget: ", - textureCacheBytes_ / (1024 * 1024), " MB > ", - textureCacheBudgetBytes_ / (1024 * 1024), " MB (textures=", textureCache.size(), ")"); - } core::Logger::getInstance().debug("Loaded character texture: ", path, " (", blpImage.width, "x", blpImage.height, ")"); return texPtr; } @@ -1144,29 +1175,40 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) { // Thread animation updates in chunks to avoid spawning one task per instance. if (updatedCount >= 8 && numAnimThreads_ > 1) { - const size_t numThreads = std::min(static_cast(numAnimThreads_), updatedCount); - const size_t chunkSize = updatedCount / numThreads; - const size_t remainder = updatedCount % numThreads; + static const size_t minAnimWorkPerThread = std::max( + 16, envSizeOrDefault("WOWEE_CHAR_ANIM_WORK_PER_THREAD", 64)); + const size_t maxUsefulThreads = std::max( + 1, (updatedCount + minAnimWorkPerThread - 1) / minAnimWorkPerThread); + const size_t numThreads = std::min(static_cast(numAnimThreads_), maxUsefulThreads); - animFutures_.clear(); - if (animFutures_.capacity() < numThreads) { - animFutures_.reserve(numThreads); - } + if (numThreads <= 1) { + for (auto& instRef : toUpdate) { + updateAnimation(instRef.get(), deltaTime); + } + } else { + const size_t chunkSize = updatedCount / numThreads; + const size_t remainder = updatedCount % numThreads; - size_t start = 0; - for (size_t t = 0; t < numThreads; t++) { - size_t end = start + chunkSize + (t < remainder ? 1 : 0); - animFutures_.push_back(std::async(std::launch::async, - [this, &toUpdate, start, end, deltaTime]() { - for (size_t i = start; i < end; i++) { - updateAnimation(toUpdate[i].get(), deltaTime); - } - })); - start = end; - } + animFutures_.clear(); + if (animFutures_.capacity() < numThreads) { + animFutures_.reserve(numThreads); + } - for (auto& f : animFutures_) { - f.get(); + size_t start = 0; + for (size_t t = 0; t < numThreads; t++) { + size_t end = start + chunkSize + (t < remainder ? 1 : 0); + animFutures_.push_back(std::async(std::launch::async, + [this, &toUpdate, start, end, deltaTime]() { + for (size_t i = start; i < end; i++) { + updateAnimation(toUpdate[i].get(), deltaTime); + } + })); + start = end; + } + + for (auto& f : animFutures_) { + f.get(); + } } } else { // Sequential for small counts (avoid thread overhead) diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp index 5f9aafda..3f670fdc 100644 --- a/src/rendering/m2_renderer.cpp +++ b/src/rendering/m2_renderer.cpp @@ -49,6 +49,15 @@ size_t envSizeMBOrDefault(const char* name, size_t defMb) { return static_cast(mb); } +size_t envSizeOrDefault(const char* name, size_t defValue) { + const char* raw = std::getenv(name); + if (!raw || !*raw) return defValue; + char* end = nullptr; + unsigned long long v = std::strtoull(raw, &end, 10); + if (end == raw || v == 0) return defValue; + return static_cast(v); +} + static constexpr uint32_t kParticleFlagRandomized = 0x40; static constexpr uint32_t kParticleFlagTiled = 0x80; @@ -299,7 +308,12 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout vkCtx_ = ctx; assetManager = assets; - numAnimThreads_ = std::min(4u, std::max(1u, std::thread::hardware_concurrency() - 1)); + const unsigned hc = std::thread::hardware_concurrency(); + const size_t availableCores = (hc > 1u) ? static_cast(hc - 1u) : 1ull; + // Keep headroom for other frame tasks: M2 gets about half of non-main cores by default. + const size_t defaultAnimThreads = std::max(1, availableCores / 2); + numAnimThreads_ = static_cast(std::max( + 1, envSizeOrDefault("WOWEE_M2_ANIM_THREADS", defaultAnimThreads))); LOG_INFO("Initializing M2 renderer (Vulkan, ", numAnimThreads_, " anim threads)..."); VkDevice device = vkCtx_->getDevice(); @@ -1915,7 +1929,9 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm:: // Phase 2: Compute bone matrices (expensive, parallel if enough work) const size_t animCount = boneWorkIndices_.size(); if (animCount > 0) { - if (animCount < 6 || numAnimThreads_ <= 1) { + static const size_t minParallelAnimInstances = std::max( + 8, envSizeOrDefault("WOWEE_M2_ANIM_MT_MIN", 96)); + if (animCount < minParallelAnimInstances || numAnimThreads_ <= 1) { // Sequential — not enough work to justify thread overhead for (size_t i : boneWorkIndices_) { if (i >= instances.size()) continue; @@ -1926,35 +1942,49 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm:: } } else { // Parallel — dispatch across worker threads - const size_t numThreads = std::min(static_cast(numAnimThreads_), animCount); - const size_t chunkSize = animCount / numThreads; - const size_t remainder = animCount % numThreads; + static const size_t minAnimWorkPerThread = std::max( + 16, envSizeOrDefault("WOWEE_M2_ANIM_WORK_PER_THREAD", 64)); + const size_t maxUsefulThreads = std::max( + 1, (animCount + minAnimWorkPerThread - 1) / minAnimWorkPerThread); + const size_t numThreads = std::min(static_cast(numAnimThreads_), maxUsefulThreads); + if (numThreads <= 1) { + for (size_t i : boneWorkIndices_) { + if (i >= instances.size()) continue; + auto& inst = instances[i]; + auto mdlIt = models.find(inst.modelId); + if (mdlIt == models.end()) continue; + computeBoneMatrices(mdlIt->second, inst); + } + } else { + const size_t chunkSize = animCount / numThreads; + const size_t remainder = animCount % numThreads; - // Reuse persistent futures vector to avoid allocation - animFutures_.clear(); - if (animFutures_.capacity() < numThreads) { - animFutures_.reserve(numThreads); - } + // Reuse persistent futures vector to avoid allocation + animFutures_.clear(); + if (animFutures_.capacity() < numThreads) { + animFutures_.reserve(numThreads); + } - size_t start = 0; - for (size_t t = 0; t < numThreads; ++t) { - size_t end = start + chunkSize + (t < remainder ? 1 : 0); - animFutures_.push_back(std::async(std::launch::async, - [this, start, end]() { - for (size_t j = start; j < end; ++j) { - size_t idx = boneWorkIndices_[j]; - if (idx >= instances.size()) continue; - auto& inst = instances[idx]; - auto mdlIt = models.find(inst.modelId); - if (mdlIt == models.end()) continue; - computeBoneMatrices(mdlIt->second, inst); - } - })); - start = end; - } + size_t start = 0; + for (size_t t = 0; t < numThreads; ++t) { + size_t end = start + chunkSize + (t < remainder ? 1 : 0); + animFutures_.push_back(std::async(std::launch::async, + [this, start, end]() { + for (size_t j = start; j < end; ++j) { + size_t idx = boneWorkIndices_[j]; + if (idx >= instances.size()) continue; + auto& inst = instances[idx]; + auto mdlIt = models.find(inst.modelId); + if (mdlIt == models.end()) continue; + computeBoneMatrices(mdlIt->second, inst); + } + })); + start = end; + } - for (auto& f : animFutures_) { - f.get(); + for (auto& f : animFutures_) { + f.get(); + } } } } diff --git a/src/rendering/terrain_manager.cpp b/src/rendering/terrain_manager.cpp index d50e2c1e..4b5c0b7c 100644 --- a/src/rendering/terrain_manager.cpp +++ b/src/rendering/terrain_manager.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -26,6 +27,26 @@ namespace rendering { namespace { +int computeTerrainWorkerCount() { + const char* raw = std::getenv("WOWEE_TERRAIN_WORKERS"); + if (raw && *raw) { + char* end = nullptr; + unsigned long long forced = std::strtoull(raw, &end, 10); + if (end != raw && forced > 0) { + return static_cast(forced); + } + } + + unsigned hc = std::thread::hardware_concurrency(); + if (hc > 0) { + // Terrain streaming should leave CPU room for render/update threads. + const unsigned availableCores = (hc > 1u) ? (hc - 1u) : 1u; + const unsigned targetWorkers = std::max(2u, availableCores / 2u); + return static_cast(targetWorkers); + } + return 2; // Fallback +} + bool decodeLayerAlpha(const pipeline::MapChunk& chunk, size_t layerIdx, std::vector& outAlpha) { if (layerIdx >= chunk.layers.size()) return false; const auto& layer = chunk.layers[layerIdx]; @@ -128,15 +149,9 @@ bool TerrainManager::initialize(pipeline::AssetManager* assets, TerrainRenderer* LOG_INFO("Terrain tile cache budget: ", tileCacheBudgetBytes_ / (1024 * 1024), " MB (dynamic)"); // Start background worker pool (dynamic: scales with available cores) - // Use 75% of logical cores for decompression, leaving headroom for render/OS + // Keep defaults moderate; env override can increase if streaming is bottlenecked. workerRunning.store(true); - unsigned hc = std::thread::hardware_concurrency(); - if (hc > 0) { - unsigned targetWorkers = std::max(6u, (hc * 3) / 4); // 75% of cores, minimum 6 - workerCount = static_cast(targetWorkers); - } else { - workerCount = 6; // Fallback - } + workerCount = computeTerrainWorkerCount(); workerThreads.reserve(workerCount); for (int i = 0; i < workerCount; i++) { workerThreads.emplace_back(&TerrainManager::workerLoop, this); @@ -926,12 +941,10 @@ void TerrainManager::processReadyTiles() { if (pending) { TileCoord coord = pending->coord; - auto tileStart = std::chrono::high_resolution_clock::now(); finalizeTile(pending); - auto tileEnd = std::chrono::high_resolution_clock::now(); - float tileTimeMs = std::chrono::duration(tileEnd - tileStart).count(); + auto now = std::chrono::high_resolution_clock::now(); { std::lock_guard lock(queueMutex); @@ -940,7 +953,7 @@ void TerrainManager::processReadyTiles() { processed++; // Check if we've exceeded time budget - float elapsedMs = std::chrono::duration(tileEnd - startTime).count(); + float elapsedMs = std::chrono::duration(now - startTime).count(); if (elapsedMs >= timeBudgetMs) { if (processed > 1) { LOG_DEBUG("Processed ", processed, " tiles in ", elapsedMs, "ms (budget: ", timeBudgetMs, "ms)"); @@ -1183,13 +1196,7 @@ void TerrainManager::unloadAll() { // Restart worker threads so streaming can resume (dynamic: scales with available cores) // Use 75% of logical cores for decompression, leaving headroom for render/OS workerRunning.store(true); - unsigned hc = std::thread::hardware_concurrency(); - if (hc > 0) { - unsigned targetWorkers = std::max(6u, (hc * 3) / 4); // 75% of cores, minimum 6 - workerCount = static_cast(targetWorkers); - } else { - workerCount = 6; // Fallback - } + workerCount = computeTerrainWorkerCount(); workerThreads.reserve(workerCount); for (int i = 0; i < workerCount; i++) { workerThreads.emplace_back(&TerrainManager::workerLoop, this); diff --git a/src/rendering/wmo_renderer.cpp b/src/rendering/wmo_renderer.cpp index a2f97a24..15705a05 100644 --- a/src/rendering/wmo_renderer.cpp +++ b/src/rendering/wmo_renderer.cpp @@ -37,6 +37,15 @@ size_t envSizeMBOrDefault(const char* name, size_t defMb) { if (end == raw || mb == 0) return defMb; return static_cast(mb); } + +size_t envSizeOrDefault(const char* name, size_t defValue) { + const char* raw = std::getenv(name); + if (!raw || !*raw) return defValue; + char* end = nullptr; + unsigned long long v = std::strtoull(raw, &end, 10); + if (end == raw || v == 0) return defValue; + return static_cast(v); +} } // namespace static void transformAABB(const glm::mat4& modelMatrix, @@ -65,7 +74,13 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou return false; } - numCullThreads_ = std::min(4u, std::max(1u, std::thread::hardware_concurrency() - 1)); + const unsigned hc = std::thread::hardware_concurrency(); + const size_t availableCores = (hc > 1u) ? static_cast(hc - 1u) : 1ull; + // WMO culling is lighter than animation; keep defaults conservative to reduce spikes. + const size_t defaultCullThreads = std::max(1, availableCores / 4); + numCullThreads_ = static_cast(std::max( + 1, envSizeOrDefault("WOWEE_WMO_CULL_THREADS", defaultCullThreads))); + core::Logger::getInstance().info("WMO cull threads: ", numCullThreads_); VkDevice device = vkCtx_->getDevice(); @@ -1208,35 +1223,44 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const std::vector drawLists; drawLists.reserve(visibleInstances.size()); - if (visibleInstances.size() >= 4 && numCullThreads_ > 1) { - const size_t numThreads = std::min(static_cast(numCullThreads_), - visibleInstances.size()); - const size_t chunkSize = visibleInstances.size() / numThreads; - const size_t remainder = visibleInstances.size() % numThreads; + static const size_t minParallelCullInstances = std::max( + 4, envSizeOrDefault("WOWEE_WMO_CULL_MT_MIN", 128)); + if (visibleInstances.size() >= minParallelCullInstances && numCullThreads_ > 1) { + static const size_t minCullWorkPerThread = std::max( + 16, envSizeOrDefault("WOWEE_WMO_CULL_WORK_PER_THREAD", 64)); + const size_t maxUsefulThreads = std::max( + 1, (visibleInstances.size() + minCullWorkPerThread - 1) / minCullWorkPerThread); + const size_t numThreads = std::min(static_cast(numCullThreads_), maxUsefulThreads); + if (numThreads <= 1) { + for (size_t idx : visibleInstances) { + drawLists.push_back(cullInstance(idx)); + } + } else { + const size_t chunkSize = visibleInstances.size() / numThreads; + const size_t remainder = visibleInstances.size() % numThreads; - cullFutures_.clear(); - if (cullFutures_.capacity() < numThreads) { - cullFutures_.reserve(numThreads); - } + drawLists.resize(visibleInstances.size()); - size_t start = 0; - for (size_t t = 0; t < numThreads; ++t) { - size_t end = start + chunkSize + (t < remainder ? 1 : 0); - cullFutures_.push_back(std::async(std::launch::async, - [&, start, end]() { - std::vector chunk; - chunk.reserve(end - start); - for (size_t j = start; j < end; ++j) - chunk.push_back(cullInstance(visibleInstances[j])); - return chunk; - })); - start = end; - } + cullFutures_.clear(); + if (cullFutures_.capacity() < numThreads) { + cullFutures_.reserve(numThreads); + } - for (auto& f : cullFutures_) { - auto chunk = f.get(); - for (auto& dl : chunk) - drawLists.push_back(std::move(dl)); + size_t start = 0; + for (size_t t = 0; t < numThreads; ++t) { + const size_t end = start + chunkSize + (t < remainder ? 1 : 0); + cullFutures_.push_back(std::async(std::launch::async, + [&, start, end]() { + for (size_t j = start; j < end; ++j) { + drawLists[j] = cullInstance(visibleInstances[j]); + } + })); + start = end; + } + + for (auto& f : cullFutures_) { + f.get(); + } } } else { for (size_t idx : visibleInstances) @@ -1901,16 +1925,7 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) { } } - std::vector attemptedCandidates; - attemptedCandidates.reserve(uniqueCandidates.size()); - for (const auto& c : uniqueCandidates) { - if (!failedTextureCache_.count(c)) { - attemptedCandidates.push_back(c); - } - } - if (attemptedCandidates.empty()) { - return whiteTexture_.get(); - } + const auto& attemptedCandidates = uniqueCandidates; // Try loading all candidates until one succeeds pipeline::BLPImage blp; @@ -1923,12 +1938,6 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) { } } if (!blp.isValid()) { - static constexpr size_t kMaxFailedTextureCache = 200000; - for (const auto& c : attemptedCandidates) { - if (failedTextureCache_.size() < kMaxFailedTextureCache) { - failedTextureCache_.insert(c); - } - } if (loggedTextureLoadFails_.insert(key).second) { core::Logger::getInstance().warning("WMO: Failed to load texture: ", path); } @@ -1943,16 +1952,6 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) { size_t base = static_cast(blp.width) * static_cast(blp.height) * 4ull; size_t approxBytes = base + (base / 3); if (textureCacheBytes_ + approxBytes > textureCacheBudgetBytes_) { - static constexpr size_t kMaxFailedTextureCache = 200000; - if (failedTextureCache_.size() < kMaxFailedTextureCache) { - // Cache budget-rejected keys too; once saturated, repeated attempts - // cause pointless decode churn and transient allocations. - if (!resolvedKey.empty()) { - failedTextureCache_.insert(resolvedKey); - } else { - failedTextureCache_.insert(key); - } - } if (textureBudgetRejectWarnings_ < 8 || (textureBudgetRejectWarnings_ % 120) == 0) { core::Logger::getInstance().warning( "WMO texture cache full (", textureCacheBytes_ / (1024 * 1024),