mirror of
https://github.com/Kelsidavis/WoWee.git
synced 2026-04-17 09:33:51 +00:00
Optimize threading and texture fallback stability
This commit is contained in:
parent
f4d947fab1
commit
9c8cd44803
8 changed files with 251 additions and 141 deletions
|
|
@ -260,6 +260,7 @@ private:
|
||||||
size_t textureCacheBytes_ = 0;
|
size_t textureCacheBytes_ = 0;
|
||||||
uint64_t textureCacheCounter_ = 0;
|
uint64_t textureCacheCounter_ = 0;
|
||||||
size_t textureCacheBudgetBytes_ = 1024ull * 1024 * 1024;
|
size_t textureCacheBudgetBytes_ = 1024ull * 1024 * 1024;
|
||||||
|
uint32_t textureBudgetRejectWarnings_ = 0;
|
||||||
std::unique_ptr<VkTexture> whiteTexture_;
|
std::unique_ptr<VkTexture> whiteTexture_;
|
||||||
std::unique_ptr<VkTexture> transparentTexture_;
|
std::unique_ptr<VkTexture> transparentTexture_;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -639,7 +639,7 @@ private:
|
||||||
uint32_t portalCulled = 0;
|
uint32_t portalCulled = 0;
|
||||||
uint32_t distanceCulled = 0;
|
uint32_t distanceCulled = 0;
|
||||||
};
|
};
|
||||||
std::vector<std::future<std::vector<InstanceDrawList>>> cullFutures_;
|
std::vector<std::future<void>> cullFutures_;
|
||||||
|
|
||||||
// Collision query profiling (per frame).
|
// Collision query profiling (per frame).
|
||||||
mutable double queryTimeMs = 0.0;
|
mutable double queryTimeMs = 0.0;
|
||||||
|
|
|
||||||
|
|
@ -69,7 +69,7 @@ WorldSocket::WorldSocket() {
|
||||||
net::ensureInit();
|
net::ensureInit();
|
||||||
// Always reserve baseline receive capacity (safe, behavior-preserving).
|
// Always reserve baseline receive capacity (safe, behavior-preserving).
|
||||||
receiveBuffer.reserve(64 * 1024);
|
receiveBuffer.reserve(64 * 1024);
|
||||||
useFastRecvAppend_ = envFlagEnabled("WOWEE_NET_FAST_RECV_APPEND", false);
|
useFastRecvAppend_ = envFlagEnabled("WOWEE_NET_FAST_RECV_APPEND", true);
|
||||||
useParseScratchQueue_ = envFlagEnabled("WOWEE_NET_PARSE_SCRATCH", false);
|
useParseScratchQueue_ = envFlagEnabled("WOWEE_NET_PARSE_SCRATCH", false);
|
||||||
if (useParseScratchQueue_) {
|
if (useParseScratchQueue_) {
|
||||||
LOG_WARNING("WOWEE_NET_PARSE_SCRATCH is temporarily disabled (known unstable); forcing off");
|
LOG_WARNING("WOWEE_NET_PARSE_SCRATCH is temporarily disabled (known unstable); forcing off");
|
||||||
|
|
@ -304,8 +304,21 @@ void WorldSocket::update() {
|
||||||
disconnect();
|
disconnect();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
receiveBuffer.resize(oldSize + receivedSize);
|
const size_t needed = oldSize + receivedSize;
|
||||||
std::memcpy(receiveBuffer.data() + oldSize, buffer, receivedSize);
|
if (receiveBuffer.capacity() < needed) {
|
||||||
|
size_t newCap = receiveBuffer.capacity() ? receiveBuffer.capacity() : 64 * 1024;
|
||||||
|
while (newCap < needed && newCap < kMaxReceiveBufferBytes) {
|
||||||
|
newCap = std::min(kMaxReceiveBufferBytes, newCap * 2);
|
||||||
|
}
|
||||||
|
if (newCap < needed) {
|
||||||
|
LOG_ERROR("World socket receive buffer capacity growth failed (needed=", needed,
|
||||||
|
" max=", kMaxReceiveBufferBytes, "). Disconnecting to recover framing.");
|
||||||
|
disconnect();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
receiveBuffer.reserve(newCap);
|
||||||
|
}
|
||||||
|
receiveBuffer.insert(receiveBuffer.end(), buffer, buffer + receivedSize);
|
||||||
} else {
|
} else {
|
||||||
receiveBuffer.insert(receiveBuffer.end(), buffer, buffer + received);
|
receiveBuffer.insert(receiveBuffer.end(), buffer, buffer + received);
|
||||||
}
|
}
|
||||||
|
|
@ -334,10 +347,13 @@ void WorldSocket::update() {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (receivedAny) {
|
if (receivedAny) {
|
||||||
LOG_DEBUG("World socket read ", bytesReadThisTick, " bytes in ", readOps,
|
const bool debugLog = core::Logger::getInstance().shouldLog(core::LogLevel::DEBUG);
|
||||||
" recv call(s), buffered=", receiveBuffer.size());
|
if (debugLog) {
|
||||||
// Hex dump received bytes for auth debugging
|
LOG_DEBUG("World socket read ", bytesReadThisTick, " bytes in ", readOps,
|
||||||
if (bytesReadThisTick <= 128) {
|
" recv call(s), buffered=", receiveBuffer.size());
|
||||||
|
}
|
||||||
|
// Hex dump received bytes for auth debugging (debug-only to avoid per-frame string work)
|
||||||
|
if (debugLog && bytesReadThisTick <= 128) {
|
||||||
std::string hex;
|
std::string hex;
|
||||||
for (size_t i = 0; i < receiveBuffer.size(); ++i) {
|
for (size_t i = 0; i < receiveBuffer.size(); ++i) {
|
||||||
char buf[4]; snprintf(buf, sizeof(buf), "%02x ", receiveBuffer[i]); hex += buf;
|
char buf[4]; snprintf(buf, sizeof(buf), "%02x ", receiveBuffer[i]); hex += buf;
|
||||||
|
|
@ -345,7 +361,7 @@ void WorldSocket::update() {
|
||||||
LOG_DEBUG("World socket raw bytes: ", hex);
|
LOG_DEBUG("World socket raw bytes: ", hex);
|
||||||
}
|
}
|
||||||
tryParsePackets();
|
tryParsePackets();
|
||||||
if (connected && !receiveBuffer.empty()) {
|
if (debugLog && connected && !receiveBuffer.empty()) {
|
||||||
LOG_DEBUG("World socket parse left ", receiveBuffer.size(),
|
LOG_DEBUG("World socket parse left ", receiveBuffer.size(),
|
||||||
" bytes buffered (awaiting complete packet)");
|
" bytes buffered (awaiting complete packet)");
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,19 @@ size_t parseEnvSizeMB(const char* name) {
|
||||||
}
|
}
|
||||||
return static_cast<size_t>(mb);
|
return static_cast<size_t>(mb);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reads a positive element count from the environment variable `name`.
//
// Returns `defValue` when the variable is unset, empty, does not start with a
// decimal number, is zero, or is negative. Values too large for size_t are
// saturated to the largest size_t instead of wrapping.
size_t parseEnvCount(const char* name, size_t defValue) {
    const char* v = std::getenv(name);
    if (!v || !*v) {
        return defValue;
    }

    // strtoull accepts "-N" and returns the negated value as a huge unsigned
    // number, so a leading minus sign has to be rejected by hand.
    const char* digits = v;
    while (*digits == ' ' || (*digits >= '\t' && *digits <= '\r')) {
        ++digits;
    }
    if (*digits == '-') {
        return defValue;
    }

    char* end = nullptr;
    const unsigned long long n = std::strtoull(digits, &end, 10);
    if (end == digits || n == 0) {
        return defValue;
    }

    // Saturate rather than truncate when size_t is narrower than unsigned long long.
    const unsigned long long maxCount = static_cast<size_t>(-1);
    if (n > maxCount) {
        return static_cast<size_t>(-1);
    }
    return static_cast<size_t>(n);
}
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
AssetManager::AssetManager() = default;
|
AssetManager::AssetManager() = default;
|
||||||
|
|
@ -148,7 +161,8 @@ BLPImage AssetManager::loadTexture(const std::string& path) {
|
||||||
if (blpData.empty()) {
|
if (blpData.empty()) {
|
||||||
static std::unordered_set<std::string> loggedMissingTextures;
|
static std::unordered_set<std::string> loggedMissingTextures;
|
||||||
static bool missingTextureLogSuppressed = false;
|
static bool missingTextureLogSuppressed = false;
|
||||||
static constexpr size_t kMaxMissingTextureLogKeys = 20000;
|
static const size_t kMaxMissingTextureLogKeys =
|
||||||
|
parseEnvCount("WOWEE_TEXTURE_MISS_LOG_KEYS", 400);
|
||||||
if (loggedMissingTextures.size() < kMaxMissingTextureLogKeys &&
|
if (loggedMissingTextures.size() < kMaxMissingTextureLogKeys &&
|
||||||
loggedMissingTextures.insert(normalizedPath).second) {
|
loggedMissingTextures.insert(normalizedPath).second) {
|
||||||
LOG_WARNING("Texture not found: ", normalizedPath);
|
LOG_WARNING("Texture not found: ", normalizedPath);
|
||||||
|
|
@ -164,7 +178,8 @@ BLPImage AssetManager::loadTexture(const std::string& path) {
|
||||||
if (!image.isValid()) {
|
if (!image.isValid()) {
|
||||||
static std::unordered_set<std::string> loggedDecodeFails;
|
static std::unordered_set<std::string> loggedDecodeFails;
|
||||||
static bool decodeFailLogSuppressed = false;
|
static bool decodeFailLogSuppressed = false;
|
||||||
static constexpr size_t kMaxDecodeFailLogKeys = 8000;
|
static const size_t kMaxDecodeFailLogKeys =
|
||||||
|
parseEnvCount("WOWEE_TEXTURE_DECODE_LOG_KEYS", 200);
|
||||||
if (loggedDecodeFails.size() < kMaxDecodeFailLogKeys &&
|
if (loggedDecodeFails.size() < kMaxDecodeFailLogKeys &&
|
||||||
loggedDecodeFails.insert(normalizedPath).second) {
|
loggedDecodeFails.insert(normalizedPath).second) {
|
||||||
LOG_ERROR("Failed to load texture: ", normalizedPath);
|
LOG_ERROR("Failed to load texture: ", normalizedPath);
|
||||||
|
|
|
||||||
|
|
@ -56,6 +56,15 @@ size_t envSizeMBOrDefault(const char* name, size_t defMb) {
|
||||||
return static_cast<size_t>(mb);
|
return static_cast<size_t>(mb);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reads a positive integer setting from the environment variable `name`.
//
// Returns `defValue` when the variable is unset, empty, does not start with a
// decimal number, is zero, or is negative. Values too large for size_t are
// saturated to the largest size_t instead of wrapping.
size_t envSizeOrDefault(const char* name, size_t defValue) {
    const char* v = std::getenv(name);
    if (!v || !*v) return defValue;

    // strtoull accepts "-N" and returns the negated value as a huge unsigned
    // number, so a leading minus sign has to be rejected by hand.
    const char* digits = v;
    while (*digits == ' ' || (*digits >= '\t' && *digits <= '\r')) ++digits;
    if (*digits == '-') return defValue;

    char* end = nullptr;
    const unsigned long long n = std::strtoull(digits, &end, 10);
    if (end == digits || n == 0) return defValue;

    // Saturate rather than truncate when size_t is narrower than unsigned long long.
    const unsigned long long maxValue = static_cast<size_t>(-1);
    if (n > maxValue) return static_cast<size_t>(-1);
    return static_cast<size_t>(n);
}
|
||||||
|
|
||||||
size_t approxTextureBytesWithMips(int w, int h) {
|
size_t approxTextureBytesWithMips(int w, int h) {
|
||||||
if (w <= 0 || h <= 0) return 0;
|
if (w <= 0 || h <= 0) return 0;
|
||||||
size_t base = static_cast<size_t>(w) * static_cast<size_t>(h) * 4ull;
|
size_t base = static_cast<size_t>(w) * static_cast<size_t>(h) * 4ull;
|
||||||
|
|
@ -95,7 +104,13 @@ bool CharacterRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFram
|
||||||
assetManager = am;
|
assetManager = am;
|
||||||
perFrameLayout_ = perFrameLayout;
|
perFrameLayout_ = perFrameLayout;
|
||||||
renderPassOverride_ = renderPassOverride;
|
renderPassOverride_ = renderPassOverride;
|
||||||
numAnimThreads_ = std::max(1u, std::min(8u, std::thread::hardware_concurrency()));
|
const unsigned hc = std::thread::hardware_concurrency();
|
||||||
|
const size_t availableCores = (hc > 1u) ? static_cast<size_t>(hc - 1u) : 1ull;
|
||||||
|
// Character updates run alongside M2/WMO work; default to a smaller share.
|
||||||
|
const size_t defaultAnimThreads = std::max<size_t>(1, availableCores / 4);
|
||||||
|
numAnimThreads_ = static_cast<uint32_t>(std::max<size_t>(
|
||||||
|
1, envSizeOrDefault("WOWEE_CHAR_ANIM_THREADS", defaultAnimThreads)));
|
||||||
|
core::Logger::getInstance().info("Character anim threads: ", numAnimThreads_);
|
||||||
|
|
||||||
VkDevice device = vkCtx_->getDevice();
|
VkDevice device = vkCtx_->getDevice();
|
||||||
|
|
||||||
|
|
@ -250,7 +265,8 @@ bool CharacterRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFram
|
||||||
}
|
}
|
||||||
|
|
||||||
// Diagnostics-only: cache lifetime is currently tied to renderer lifetime.
|
// Diagnostics-only: cache lifetime is currently tied to renderer lifetime.
|
||||||
textureCacheBudgetBytes_ = envSizeMBOrDefault("WOWEE_CHARACTER_TEX_CACHE_MB", 512) * 1024ull * 1024ull;
|
textureCacheBudgetBytes_ = envSizeMBOrDefault("WOWEE_CHARACTER_TEX_CACHE_MB", 1024) * 1024ull * 1024ull;
|
||||||
|
LOG_INFO("Character texture cache budget: ", textureCacheBudgetBytes_ / (1024 * 1024), " MB");
|
||||||
|
|
||||||
core::Logger::getInstance().info("Character renderer initialized (Vulkan)");
|
core::Logger::getInstance().info("Character renderer initialized (Vulkan)");
|
||||||
return true;
|
return true;
|
||||||
|
|
@ -403,8 +419,29 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
|
||||||
|
|
||||||
auto blpImage = assetManager->loadTexture(key);
|
auto blpImage = assetManager->loadTexture(key);
|
||||||
if (!blpImage.isValid()) {
|
if (!blpImage.isValid()) {
|
||||||
|
static constexpr size_t kMaxFailedTextureCache = 200000;
|
||||||
core::Logger::getInstance().warning("Failed to load texture: ", path);
|
core::Logger::getInstance().warning("Failed to load texture: ", path);
|
||||||
failedTextureCache_.insert(key);
|
if (failedTextureCache_.size() < kMaxFailedTextureCache) {
|
||||||
|
failedTextureCache_.insert(key);
|
||||||
|
}
|
||||||
|
return whiteTexture_.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t approxBytes = approxTextureBytesWithMips(blpImage.width, blpImage.height);
|
||||||
|
if (textureCacheBytes_ + approxBytes > textureCacheBudgetBytes_) {
|
||||||
|
static constexpr size_t kMaxFailedTextureCache = 200000;
|
||||||
|
if (failedTextureCache_.size() < kMaxFailedTextureCache) {
|
||||||
|
// Budget is saturated; avoid repeatedly decoding/uploading this texture.
|
||||||
|
failedTextureCache_.insert(key);
|
||||||
|
}
|
||||||
|
if (textureBudgetRejectWarnings_ < 8 || (textureBudgetRejectWarnings_ % 120) == 0) {
|
||||||
|
core::Logger::getInstance().warning(
|
||||||
|
"Character texture cache full (",
|
||||||
|
textureCacheBytes_ / (1024 * 1024), " MB / ",
|
||||||
|
textureCacheBudgetBytes_ / (1024 * 1024), " MB), rejecting texture: ",
|
||||||
|
path);
|
||||||
|
}
|
||||||
|
++textureBudgetRejectWarnings_;
|
||||||
return whiteTexture_.get();
|
return whiteTexture_.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -426,7 +463,7 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
|
||||||
|
|
||||||
TextureCacheEntry e;
|
TextureCacheEntry e;
|
||||||
e.texture = std::move(tex);
|
e.texture = std::move(tex);
|
||||||
e.approxBytes = approxTextureBytesWithMips(blpImage.width, blpImage.height);
|
e.approxBytes = approxBytes;
|
||||||
e.lastUse = ++textureCacheCounter_;
|
e.lastUse = ++textureCacheCounter_;
|
||||||
e.hasAlpha = hasAlpha;
|
e.hasAlpha = hasAlpha;
|
||||||
e.colorKeyBlack = colorKeyBlackHint;
|
e.colorKeyBlack = colorKeyBlackHint;
|
||||||
|
|
@ -435,12 +472,6 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
|
||||||
textureColorKeyBlackByPtr_[texPtr] = colorKeyBlackHint;
|
textureColorKeyBlackByPtr_[texPtr] = colorKeyBlackHint;
|
||||||
textureCache[key] = std::move(e);
|
textureCache[key] = std::move(e);
|
||||||
|
|
||||||
if (textureCacheBytes_ > textureCacheBudgetBytes_) {
|
|
||||||
core::Logger::getInstance().warning(
|
|
||||||
"Character texture cache over budget: ",
|
|
||||||
textureCacheBytes_ / (1024 * 1024), " MB > ",
|
|
||||||
textureCacheBudgetBytes_ / (1024 * 1024), " MB (textures=", textureCache.size(), ")");
|
|
||||||
}
|
|
||||||
core::Logger::getInstance().debug("Loaded character texture: ", path, " (", blpImage.width, "x", blpImage.height, ")");
|
core::Logger::getInstance().debug("Loaded character texture: ", path, " (", blpImage.width, "x", blpImage.height, ")");
|
||||||
return texPtr;
|
return texPtr;
|
||||||
}
|
}
|
||||||
|
|
@ -1144,29 +1175,40 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) {
|
||||||
|
|
||||||
// Thread animation updates in chunks to avoid spawning one task per instance.
|
// Thread animation updates in chunks to avoid spawning one task per instance.
|
||||||
if (updatedCount >= 8 && numAnimThreads_ > 1) {
|
if (updatedCount >= 8 && numAnimThreads_ > 1) {
|
||||||
const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), updatedCount);
|
static const size_t minAnimWorkPerThread = std::max<size_t>(
|
||||||
const size_t chunkSize = updatedCount / numThreads;
|
16, envSizeOrDefault("WOWEE_CHAR_ANIM_WORK_PER_THREAD", 64));
|
||||||
const size_t remainder = updatedCount % numThreads;
|
const size_t maxUsefulThreads = std::max<size_t>(
|
||||||
|
1, (updatedCount + minAnimWorkPerThread - 1) / minAnimWorkPerThread);
|
||||||
|
const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), maxUsefulThreads);
|
||||||
|
|
||||||
animFutures_.clear();
|
if (numThreads <= 1) {
|
||||||
if (animFutures_.capacity() < numThreads) {
|
for (auto& instRef : toUpdate) {
|
||||||
animFutures_.reserve(numThreads);
|
updateAnimation(instRef.get(), deltaTime);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
const size_t chunkSize = updatedCount / numThreads;
|
||||||
|
const size_t remainder = updatedCount % numThreads;
|
||||||
|
|
||||||
size_t start = 0;
|
animFutures_.clear();
|
||||||
for (size_t t = 0; t < numThreads; t++) {
|
if (animFutures_.capacity() < numThreads) {
|
||||||
size_t end = start + chunkSize + (t < remainder ? 1 : 0);
|
animFutures_.reserve(numThreads);
|
||||||
animFutures_.push_back(std::async(std::launch::async,
|
}
|
||||||
[this, &toUpdate, start, end, deltaTime]() {
|
|
||||||
for (size_t i = start; i < end; i++) {
|
|
||||||
updateAnimation(toUpdate[i].get(), deltaTime);
|
|
||||||
}
|
|
||||||
}));
|
|
||||||
start = end;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto& f : animFutures_) {
|
size_t start = 0;
|
||||||
f.get();
|
for (size_t t = 0; t < numThreads; t++) {
|
||||||
|
size_t end = start + chunkSize + (t < remainder ? 1 : 0);
|
||||||
|
animFutures_.push_back(std::async(std::launch::async,
|
||||||
|
[this, &toUpdate, start, end, deltaTime]() {
|
||||||
|
for (size_t i = start; i < end; i++) {
|
||||||
|
updateAnimation(toUpdate[i].get(), deltaTime);
|
||||||
|
}
|
||||||
|
}));
|
||||||
|
start = end;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto& f : animFutures_) {
|
||||||
|
f.get();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Sequential for small counts (avoid thread overhead)
|
// Sequential for small counts (avoid thread overhead)
|
||||||
|
|
|
||||||
|
|
@ -49,6 +49,15 @@ size_t envSizeMBOrDefault(const char* name, size_t defMb) {
|
||||||
return static_cast<size_t>(mb);
|
return static_cast<size_t>(mb);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reads a positive integer setting from the environment variable `name`.
//
// Returns `defValue` when the variable is unset, empty, does not start with a
// decimal number, is zero, or is negative. Values too large for size_t are
// saturated to the largest size_t instead of wrapping.
size_t envSizeOrDefault(const char* name, size_t defValue) {
    const char* raw = std::getenv(name);
    if (!raw || !*raw) return defValue;

    // strtoull accepts "-N" and returns the negated value as a huge unsigned
    // number, so a leading minus sign has to be rejected by hand.
    const char* digits = raw;
    while (*digits == ' ' || (*digits >= '\t' && *digits <= '\r')) ++digits;
    if (*digits == '-') return defValue;

    char* end = nullptr;
    const unsigned long long v = std::strtoull(digits, &end, 10);
    if (end == digits || v == 0) return defValue;

    // Saturate rather than truncate when size_t is narrower than unsigned long long.
    const unsigned long long maxValue = static_cast<size_t>(-1);
    if (v > maxValue) return static_cast<size_t>(-1);
    return static_cast<size_t>(v);
}
|
||||||
|
|
||||||
static constexpr uint32_t kParticleFlagRandomized = 0x40;
|
static constexpr uint32_t kParticleFlagRandomized = 0x40;
|
||||||
static constexpr uint32_t kParticleFlagTiled = 0x80;
|
static constexpr uint32_t kParticleFlagTiled = 0x80;
|
||||||
|
|
||||||
|
|
@ -299,7 +308,12 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout
|
||||||
vkCtx_ = ctx;
|
vkCtx_ = ctx;
|
||||||
assetManager = assets;
|
assetManager = assets;
|
||||||
|
|
||||||
numAnimThreads_ = std::min(4u, std::max(1u, std::thread::hardware_concurrency() - 1));
|
const unsigned hc = std::thread::hardware_concurrency();
|
||||||
|
const size_t availableCores = (hc > 1u) ? static_cast<size_t>(hc - 1u) : 1ull;
|
||||||
|
// Keep headroom for other frame tasks: M2 gets about half of non-main cores by default.
|
||||||
|
const size_t defaultAnimThreads = std::max<size_t>(1, availableCores / 2);
|
||||||
|
numAnimThreads_ = static_cast<uint32_t>(std::max<size_t>(
|
||||||
|
1, envSizeOrDefault("WOWEE_M2_ANIM_THREADS", defaultAnimThreads)));
|
||||||
LOG_INFO("Initializing M2 renderer (Vulkan, ", numAnimThreads_, " anim threads)...");
|
LOG_INFO("Initializing M2 renderer (Vulkan, ", numAnimThreads_, " anim threads)...");
|
||||||
|
|
||||||
VkDevice device = vkCtx_->getDevice();
|
VkDevice device = vkCtx_->getDevice();
|
||||||
|
|
@ -1915,7 +1929,9 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
|
||||||
// Phase 2: Compute bone matrices (expensive, parallel if enough work)
|
// Phase 2: Compute bone matrices (expensive, parallel if enough work)
|
||||||
const size_t animCount = boneWorkIndices_.size();
|
const size_t animCount = boneWorkIndices_.size();
|
||||||
if (animCount > 0) {
|
if (animCount > 0) {
|
||||||
if (animCount < 6 || numAnimThreads_ <= 1) {
|
static const size_t minParallelAnimInstances = std::max<size_t>(
|
||||||
|
8, envSizeOrDefault("WOWEE_M2_ANIM_MT_MIN", 96));
|
||||||
|
if (animCount < minParallelAnimInstances || numAnimThreads_ <= 1) {
|
||||||
// Sequential — not enough work to justify thread overhead
|
// Sequential — not enough work to justify thread overhead
|
||||||
for (size_t i : boneWorkIndices_) {
|
for (size_t i : boneWorkIndices_) {
|
||||||
if (i >= instances.size()) continue;
|
if (i >= instances.size()) continue;
|
||||||
|
|
@ -1926,35 +1942,49 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Parallel — dispatch across worker threads
|
// Parallel — dispatch across worker threads
|
||||||
const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), animCount);
|
static const size_t minAnimWorkPerThread = std::max<size_t>(
|
||||||
const size_t chunkSize = animCount / numThreads;
|
16, envSizeOrDefault("WOWEE_M2_ANIM_WORK_PER_THREAD", 64));
|
||||||
const size_t remainder = animCount % numThreads;
|
const size_t maxUsefulThreads = std::max<size_t>(
|
||||||
|
1, (animCount + minAnimWorkPerThread - 1) / minAnimWorkPerThread);
|
||||||
|
const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), maxUsefulThreads);
|
||||||
|
if (numThreads <= 1) {
|
||||||
|
for (size_t i : boneWorkIndices_) {
|
||||||
|
if (i >= instances.size()) continue;
|
||||||
|
auto& inst = instances[i];
|
||||||
|
auto mdlIt = models.find(inst.modelId);
|
||||||
|
if (mdlIt == models.end()) continue;
|
||||||
|
computeBoneMatrices(mdlIt->second, inst);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const size_t chunkSize = animCount / numThreads;
|
||||||
|
const size_t remainder = animCount % numThreads;
|
||||||
|
|
||||||
// Reuse persistent futures vector to avoid allocation
|
// Reuse persistent futures vector to avoid allocation
|
||||||
animFutures_.clear();
|
animFutures_.clear();
|
||||||
if (animFutures_.capacity() < numThreads) {
|
if (animFutures_.capacity() < numThreads) {
|
||||||
animFutures_.reserve(numThreads);
|
animFutures_.reserve(numThreads);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t start = 0;
|
size_t start = 0;
|
||||||
for (size_t t = 0; t < numThreads; ++t) {
|
for (size_t t = 0; t < numThreads; ++t) {
|
||||||
size_t end = start + chunkSize + (t < remainder ? 1 : 0);
|
size_t end = start + chunkSize + (t < remainder ? 1 : 0);
|
||||||
animFutures_.push_back(std::async(std::launch::async,
|
animFutures_.push_back(std::async(std::launch::async,
|
||||||
[this, start, end]() {
|
[this, start, end]() {
|
||||||
for (size_t j = start; j < end; ++j) {
|
for (size_t j = start; j < end; ++j) {
|
||||||
size_t idx = boneWorkIndices_[j];
|
size_t idx = boneWorkIndices_[j];
|
||||||
if (idx >= instances.size()) continue;
|
if (idx >= instances.size()) continue;
|
||||||
auto& inst = instances[idx];
|
auto& inst = instances[idx];
|
||||||
auto mdlIt = models.find(inst.modelId);
|
auto mdlIt = models.find(inst.modelId);
|
||||||
if (mdlIt == models.end()) continue;
|
if (mdlIt == models.end()) continue;
|
||||||
computeBoneMatrices(mdlIt->second, inst);
|
computeBoneMatrices(mdlIt->second, inst);
|
||||||
}
|
}
|
||||||
}));
|
}));
|
||||||
start = end;
|
start = end;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto& f : animFutures_) {
|
for (auto& f : animFutures_) {
|
||||||
f.get();
|
f.get();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,7 @@
|
||||||
#include <glm/gtx/euler_angles.hpp>
|
#include <glm/gtx/euler_angles.hpp>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cctype>
|
#include <cctype>
|
||||||
|
#include <cstdlib>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
|
|
||||||
|
|
@ -26,6 +27,26 @@ namespace rendering {
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
// Chooses how many background terrain-streaming worker threads to start.
//
// If WOWEE_TERRAIN_WORKERS holds a positive decimal number, that value is used,
// capped at kMaxTerrainWorkers. Otherwise the count is half of the cores left
// after reserving one, but never fewer than 2. When hardware_concurrency()
// reports 0 the result is 2.
int computeTerrainWorkerCount() {
    // Cap for the env override. It keeps the value representable as int and
    // stops a typo from spawning an unbounded number of threads.
    constexpr unsigned long long kMaxTerrainWorkers = 256;

    const char* raw = std::getenv("WOWEE_TERRAIN_WORKERS");
    if (raw && *raw) {
        // strtoull accepts "-N" and returns the negated value as a huge unsigned
        // number; casting that to int would give a negative worker count.
        const char* digits = raw;
        while (*digits == ' ' || (*digits >= '\t' && *digits <= '\r')) {
            ++digits;
        }
        if (*digits != '-') {
            char* end = nullptr;
            const unsigned long long forced = std::strtoull(digits, &end, 10);
            if (end != digits && forced > 0) {
                return static_cast<int>(forced < kMaxTerrainWorkers ? forced : kMaxTerrainWorkers);
            }
        }
    }

    const unsigned hc = std::thread::hardware_concurrency();
    if (hc > 0) {
        // Terrain streaming should leave CPU room for render/update threads.
        const unsigned availableCores = (hc > 1u) ? (hc - 1u) : 1u;
        const unsigned targetWorkers = std::max(2u, availableCores / 2u);
        return static_cast<int>(targetWorkers);
    }
    return 2;  // Fallback when the core count is unknown
}
|
||||||
|
|
||||||
bool decodeLayerAlpha(const pipeline::MapChunk& chunk, size_t layerIdx, std::vector<uint8_t>& outAlpha) {
|
bool decodeLayerAlpha(const pipeline::MapChunk& chunk, size_t layerIdx, std::vector<uint8_t>& outAlpha) {
|
||||||
if (layerIdx >= chunk.layers.size()) return false;
|
if (layerIdx >= chunk.layers.size()) return false;
|
||||||
const auto& layer = chunk.layers[layerIdx];
|
const auto& layer = chunk.layers[layerIdx];
|
||||||
|
|
@ -128,15 +149,9 @@ bool TerrainManager::initialize(pipeline::AssetManager* assets, TerrainRenderer*
|
||||||
LOG_INFO("Terrain tile cache budget: ", tileCacheBudgetBytes_ / (1024 * 1024), " MB (dynamic)");
|
LOG_INFO("Terrain tile cache budget: ", tileCacheBudgetBytes_ / (1024 * 1024), " MB (dynamic)");
|
||||||
|
|
||||||
// Start background worker pool (dynamic: scales with available cores)
|
// Start background worker pool (dynamic: scales with available cores)
|
||||||
// Use 75% of logical cores for decompression, leaving headroom for render/OS
|
// Keep defaults moderate; env override can increase if streaming is bottlenecked.
|
||||||
workerRunning.store(true);
|
workerRunning.store(true);
|
||||||
unsigned hc = std::thread::hardware_concurrency();
|
workerCount = computeTerrainWorkerCount();
|
||||||
if (hc > 0) {
|
|
||||||
unsigned targetWorkers = std::max(6u, (hc * 3) / 4); // 75% of cores, minimum 6
|
|
||||||
workerCount = static_cast<int>(targetWorkers);
|
|
||||||
} else {
|
|
||||||
workerCount = 6; // Fallback
|
|
||||||
}
|
|
||||||
workerThreads.reserve(workerCount);
|
workerThreads.reserve(workerCount);
|
||||||
for (int i = 0; i < workerCount; i++) {
|
for (int i = 0; i < workerCount; i++) {
|
||||||
workerThreads.emplace_back(&TerrainManager::workerLoop, this);
|
workerThreads.emplace_back(&TerrainManager::workerLoop, this);
|
||||||
|
|
@ -926,12 +941,10 @@ void TerrainManager::processReadyTiles() {
|
||||||
|
|
||||||
if (pending) {
|
if (pending) {
|
||||||
TileCoord coord = pending->coord;
|
TileCoord coord = pending->coord;
|
||||||
auto tileStart = std::chrono::high_resolution_clock::now();
|
|
||||||
|
|
||||||
finalizeTile(pending);
|
finalizeTile(pending);
|
||||||
|
|
||||||
auto tileEnd = std::chrono::high_resolution_clock::now();
|
auto now = std::chrono::high_resolution_clock::now();
|
||||||
float tileTimeMs = std::chrono::duration<float, std::milli>(tileEnd - tileStart).count();
|
|
||||||
|
|
||||||
{
|
{
|
||||||
std::lock_guard<std::mutex> lock(queueMutex);
|
std::lock_guard<std::mutex> lock(queueMutex);
|
||||||
|
|
@ -940,7 +953,7 @@ void TerrainManager::processReadyTiles() {
|
||||||
processed++;
|
processed++;
|
||||||
|
|
||||||
// Check if we've exceeded time budget
|
// Check if we've exceeded time budget
|
||||||
float elapsedMs = std::chrono::duration<float, std::milli>(tileEnd - startTime).count();
|
float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
|
||||||
if (elapsedMs >= timeBudgetMs) {
|
if (elapsedMs >= timeBudgetMs) {
|
||||||
if (processed > 1) {
|
if (processed > 1) {
|
||||||
LOG_DEBUG("Processed ", processed, " tiles in ", elapsedMs, "ms (budget: ", timeBudgetMs, "ms)");
|
LOG_DEBUG("Processed ", processed, " tiles in ", elapsedMs, "ms (budget: ", timeBudgetMs, "ms)");
|
||||||
|
|
@ -1183,13 +1196,7 @@ void TerrainManager::unloadAll() {
|
||||||
// Restart worker threads so streaming can resume (dynamic: scales with available cores)
|
// Restart worker threads so streaming can resume (dynamic: scales with available cores)
|
||||||
// Use 75% of logical cores for decompression, leaving headroom for render/OS
|
// Worker count comes from computeTerrainWorkerCount() (env override, or about half of the non-main cores).
|
||||||
workerRunning.store(true);
|
workerRunning.store(true);
|
||||||
unsigned hc = std::thread::hardware_concurrency();
|
workerCount = computeTerrainWorkerCount();
|
||||||
if (hc > 0) {
|
|
||||||
unsigned targetWorkers = std::max(6u, (hc * 3) / 4); // 75% of cores, minimum 6
|
|
||||||
workerCount = static_cast<int>(targetWorkers);
|
|
||||||
} else {
|
|
||||||
workerCount = 6; // Fallback
|
|
||||||
}
|
|
||||||
workerThreads.reserve(workerCount);
|
workerThreads.reserve(workerCount);
|
||||||
for (int i = 0; i < workerCount; i++) {
|
for (int i = 0; i < workerCount; i++) {
|
||||||
workerThreads.emplace_back(&TerrainManager::workerLoop, this);
|
workerThreads.emplace_back(&TerrainManager::workerLoop, this);
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,15 @@ size_t envSizeMBOrDefault(const char* name, size_t defMb) {
|
||||||
if (end == raw || mb == 0) return defMb;
|
if (end == raw || mb == 0) return defMb;
|
||||||
return static_cast<size_t>(mb);
|
return static_cast<size_t>(mb);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reads a positive integer setting from the environment variable `name`.
//
// Returns `defValue` when the variable is unset, empty, does not start with a
// decimal number, is zero, or is negative. Values too large for size_t are
// saturated to the largest size_t instead of wrapping.
size_t envSizeOrDefault(const char* name, size_t defValue) {
    const char* raw = std::getenv(name);
    if (!raw || !*raw) return defValue;

    // strtoull accepts "-N" and returns the negated value as a huge unsigned
    // number, so a leading minus sign has to be rejected by hand.
    const char* digits = raw;
    while (*digits == ' ' || (*digits >= '\t' && *digits <= '\r')) ++digits;
    if (*digits == '-') return defValue;

    char* end = nullptr;
    const unsigned long long v = std::strtoull(digits, &end, 10);
    if (end == digits || v == 0) return defValue;

    // Saturate rather than truncate when size_t is narrower than unsigned long long.
    const unsigned long long maxValue = static_cast<size_t>(-1);
    if (v > maxValue) return static_cast<size_t>(-1);
    return static_cast<size_t>(v);
}
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
static void transformAABB(const glm::mat4& modelMatrix,
|
static void transformAABB(const glm::mat4& modelMatrix,
|
||||||
|
|
@ -65,7 +74,13 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
numCullThreads_ = std::min(4u, std::max(1u, std::thread::hardware_concurrency() - 1));
|
const unsigned hc = std::thread::hardware_concurrency();
|
||||||
|
const size_t availableCores = (hc > 1u) ? static_cast<size_t>(hc - 1u) : 1ull;
|
||||||
|
// WMO culling is lighter than animation; keep defaults conservative to reduce spikes.
|
||||||
|
const size_t defaultCullThreads = std::max<size_t>(1, availableCores / 4);
|
||||||
|
numCullThreads_ = static_cast<uint32_t>(std::max<size_t>(
|
||||||
|
1, envSizeOrDefault("WOWEE_WMO_CULL_THREADS", defaultCullThreads)));
|
||||||
|
core::Logger::getInstance().info("WMO cull threads: ", numCullThreads_);
|
||||||
|
|
||||||
VkDevice device = vkCtx_->getDevice();
|
VkDevice device = vkCtx_->getDevice();
|
||||||
|
|
||||||
|
|
@ -1208,35 +1223,44 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
|
||||||
std::vector<InstanceDrawList> drawLists;
|
std::vector<InstanceDrawList> drawLists;
|
||||||
drawLists.reserve(visibleInstances.size());
|
drawLists.reserve(visibleInstances.size());
|
||||||
|
|
||||||
if (visibleInstances.size() >= 4 && numCullThreads_ > 1) {
|
static const size_t minParallelCullInstances = std::max<size_t>(
|
||||||
const size_t numThreads = std::min(static_cast<size_t>(numCullThreads_),
|
4, envSizeOrDefault("WOWEE_WMO_CULL_MT_MIN", 128));
|
||||||
visibleInstances.size());
|
if (visibleInstances.size() >= minParallelCullInstances && numCullThreads_ > 1) {
|
||||||
const size_t chunkSize = visibleInstances.size() / numThreads;
|
static const size_t minCullWorkPerThread = std::max<size_t>(
|
||||||
const size_t remainder = visibleInstances.size() % numThreads;
|
16, envSizeOrDefault("WOWEE_WMO_CULL_WORK_PER_THREAD", 64));
|
||||||
|
const size_t maxUsefulThreads = std::max<size_t>(
|
||||||
|
1, (visibleInstances.size() + minCullWorkPerThread - 1) / minCullWorkPerThread);
|
||||||
|
const size_t numThreads = std::min(static_cast<size_t>(numCullThreads_), maxUsefulThreads);
|
||||||
|
if (numThreads <= 1) {
|
||||||
|
for (size_t idx : visibleInstances) {
|
||||||
|
drawLists.push_back(cullInstance(idx));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const size_t chunkSize = visibleInstances.size() / numThreads;
|
||||||
|
const size_t remainder = visibleInstances.size() % numThreads;
|
||||||
|
|
||||||
cullFutures_.clear();
|
drawLists.resize(visibleInstances.size());
|
||||||
if (cullFutures_.capacity() < numThreads) {
|
|
||||||
cullFutures_.reserve(numThreads);
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t start = 0;
|
cullFutures_.clear();
|
||||||
for (size_t t = 0; t < numThreads; ++t) {
|
if (cullFutures_.capacity() < numThreads) {
|
||||||
size_t end = start + chunkSize + (t < remainder ? 1 : 0);
|
cullFutures_.reserve(numThreads);
|
||||||
cullFutures_.push_back(std::async(std::launch::async,
|
}
|
||||||
[&, start, end]() {
|
|
||||||
std::vector<InstanceDrawList> chunk;
|
|
||||||
chunk.reserve(end - start);
|
|
||||||
for (size_t j = start; j < end; ++j)
|
|
||||||
chunk.push_back(cullInstance(visibleInstances[j]));
|
|
||||||
return chunk;
|
|
||||||
}));
|
|
||||||
start = end;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto& f : cullFutures_) {
|
size_t start = 0;
|
||||||
auto chunk = f.get();
|
for (size_t t = 0; t < numThreads; ++t) {
|
||||||
for (auto& dl : chunk)
|
const size_t end = start + chunkSize + (t < remainder ? 1 : 0);
|
||||||
drawLists.push_back(std::move(dl));
|
cullFutures_.push_back(std::async(std::launch::async,
|
||||||
|
[&, start, end]() {
|
||||||
|
for (size_t j = start; j < end; ++j) {
|
||||||
|
drawLists[j] = cullInstance(visibleInstances[j]);
|
||||||
|
}
|
||||||
|
}));
|
||||||
|
start = end;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto& f : cullFutures_) {
|
||||||
|
f.get();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (size_t idx : visibleInstances)
|
for (size_t idx : visibleInstances)
|
||||||
|
|
@ -1901,16 +1925,7 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> attemptedCandidates;
|
const auto& attemptedCandidates = uniqueCandidates;
|
||||||
attemptedCandidates.reserve(uniqueCandidates.size());
|
|
||||||
for (const auto& c : uniqueCandidates) {
|
|
||||||
if (!failedTextureCache_.count(c)) {
|
|
||||||
attemptedCandidates.push_back(c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (attemptedCandidates.empty()) {
|
|
||||||
return whiteTexture_.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try loading all candidates until one succeeds
|
// Try loading all candidates until one succeeds
|
||||||
pipeline::BLPImage blp;
|
pipeline::BLPImage blp;
|
||||||
|
|
@ -1923,12 +1938,6 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!blp.isValid()) {
|
if (!blp.isValid()) {
|
||||||
static constexpr size_t kMaxFailedTextureCache = 200000;
|
|
||||||
for (const auto& c : attemptedCandidates) {
|
|
||||||
if (failedTextureCache_.size() < kMaxFailedTextureCache) {
|
|
||||||
failedTextureCache_.insert(c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (loggedTextureLoadFails_.insert(key).second) {
|
if (loggedTextureLoadFails_.insert(key).second) {
|
||||||
core::Logger::getInstance().warning("WMO: Failed to load texture: ", path);
|
core::Logger::getInstance().warning("WMO: Failed to load texture: ", path);
|
||||||
}
|
}
|
||||||
|
|
@ -1943,16 +1952,6 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
|
||||||
size_t base = static_cast<size_t>(blp.width) * static_cast<size_t>(blp.height) * 4ull;
|
size_t base = static_cast<size_t>(blp.width) * static_cast<size_t>(blp.height) * 4ull;
|
||||||
size_t approxBytes = base + (base / 3);
|
size_t approxBytes = base + (base / 3);
|
||||||
if (textureCacheBytes_ + approxBytes > textureCacheBudgetBytes_) {
|
if (textureCacheBytes_ + approxBytes > textureCacheBudgetBytes_) {
|
||||||
static constexpr size_t kMaxFailedTextureCache = 200000;
|
|
||||||
if (failedTextureCache_.size() < kMaxFailedTextureCache) {
|
|
||||||
// Cache budget-rejected keys too; once saturated, repeated attempts
|
|
||||||
// cause pointless decode churn and transient allocations.
|
|
||||||
if (!resolvedKey.empty()) {
|
|
||||||
failedTextureCache_.insert(resolvedKey);
|
|
||||||
} else {
|
|
||||||
failedTextureCache_.insert(key);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (textureBudgetRejectWarnings_ < 8 || (textureBudgetRejectWarnings_ % 120) == 0) {
|
if (textureBudgetRejectWarnings_ < 8 || (textureBudgetRejectWarnings_ % 120) == 0) {
|
||||||
core::Logger::getInstance().warning(
|
core::Logger::getInstance().warning(
|
||||||
"WMO texture cache full (", textureCacheBytes_ / (1024 * 1024),
|
"WMO texture cache full (", textureCacheBytes_ / (1024 * 1024),
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue