From 30fa9836d9582105f71f66b3a5659f9a43bdd404 Mon Sep 17 00:00:00 2001 From: Kelsi Date: Wed, 4 Mar 2026 08:17:32 -0800 Subject: [PATCH 1/5] Fix glow sprite flashing, move fadeAlpha to push constants, throttle character bones - Glow sprites now use dedicated vertex buffer (glowVB_) separate from M2 particle buffer to prevent data race when renderM2Particles() overwrites glow data mid-flight - Move fadeAlpha from shared material UBO to per-draw push constants, eliminating cross-instance alpha race on non-double-buffered UBOs - Smooth adaptive render distance transitions to prevent pop-in/out at instance count thresholds (1000/2000) - Distance-tiered character bone throttling: near (<30u) every frame, mid (30-60u) every 3rd, far (60-120u) every 6th frame - Skip weapon instance animation updates (transforms set by parent bones) --- assets/shaders/m2.frag.glsl | 7 +-- assets/shaders/m2.frag.spv | Bin 16180 -> 16176 bytes assets/shaders/m2.vert.glsl | 3 ++ assets/shaders/m2.vert.spv | Bin 8936 -> 9120 bytes include/rendering/character_renderer.hpp | 3 ++ include/rendering/m2_renderer.hpp | 6 +++ src/rendering/character_renderer.cpp | 52 ++++++++++++++++++----- src/rendering/m2_renderer.cpp | 47 ++++++++++++-------- 8 files changed, 87 insertions(+), 31 deletions(-) diff --git a/assets/shaders/m2.frag.glsl b/assets/shaders/m2.frag.glsl index 36b0fc2c..448bd425 100644 --- a/assets/shaders/m2.frag.glsl +++ b/assets/shaders/m2.frag.glsl @@ -34,6 +34,7 @@ layout(location = 1) in vec3 Normal; layout(location = 2) in vec2 TexCoord; layout(location = 3) flat in vec3 InstanceOrigin; layout(location = 4) in float ModelHeight; +layout(location = 5) in float vFadeAlpha; layout(location = 0) out vec4 outColor; @@ -175,16 +176,16 @@ void main() { float fogFactor = clamp((fogParams.y - dist) / (fogParams.y - fogParams.x), 0.0, 1.0); result = mix(fogColor.rgb, result, fogFactor); - float outAlpha = texColor.a * fadeAlpha; + float outAlpha = texColor.a * vFadeAlpha; // Cutout materials should not remain partially transparent after discard, // otherwise foliage cards look view-dependent. if (alphaTest != 0 || colorKeyBlack != 0) { - outAlpha = fadeAlpha; + outAlpha = vFadeAlpha; } // Foliage cutout should stay opaque after alpha discard to avoid // view-angle translucency artifacts. if (alphaTest == 2 || alphaTest == 3) { - outAlpha = 1.0 * fadeAlpha; + outAlpha = 1.0 * vFadeAlpha; } outColor = vec4(result, outAlpha); } diff --git a/assets/shaders/m2.frag.spv b/assets/shaders/m2.frag.spv index 7deda700828b59b8c879ff3a210036ad2e021a26..f473a468e38ad0c0ee16da1694dcefbab1603d3d 100644 GIT binary patch delta 599 zcmX|;yGjFL5QS$qW`oLtBDN|hgj7}mB`5?zv5R11VG+ELLcv1gEha`?FNsaCw6aqx zWqkpk#b*#NjeaNQ_YcFIIWu!+GW$|}s)h-FG-c*luZ4Eq3bS6=n#m;BeY5C!GZzdB zm1m2qYx#|>Qqe5QFNz<6AHQFzAroabXI-MZZ-Yz@nD)$+#@Pt*88`u+$COtb#mjwf z>3DHs%*#%}1>|<%)i+J`3SRM-`L>P!?D#wb3)GfxUMB~ZcFB~nN7!N5J+QbE_#Rbp z`^4fuP(8%!eUlS^fQgg3uv4!{y>PqHMx%0 z(`3~gW8ySf*xld+tQ$m@thJAFN>HRx33nKATEGd&OQ46kbMQKscY+thI_3f=(vdI0 os?l@YRfnVPpKJKze}$?o@bC2XngkM1Lj#kzmWuGrxSop>zl=sxn*aa+ delta 602 zcmYL`yGjE=6o$`ibc2_GXl11!C_aD?2nv$QMq+6xLJ+J3O}r)X7FUfCf>>GEiIrh} z0V`{t!DkRJjeft`i35lK%r|HLGkf;2`c%!Nf{C=5vjNN4RVy>&23z@Q7qvHXp^M_@ z@FzfC`)$6soqs7wgA2*~p+l=v?M{RD?F;N-j?Q0!3U^0{{R3 diff --git a/assets/shaders/m2.vert.glsl b/assets/shaders/m2.vert.glsl index 7b0f9451..6f4545c8 100644 --- a/assets/shaders/m2.vert.glsl +++ b/assets/shaders/m2.vert.glsl @@ -19,6 +19,7 @@ layout(push_constant) uniform Push { int texCoordSet; int useBones; int isFoliage; + float fadeAlpha; } push; layout(set = 2, binding = 0) readonly buffer BoneSSBO { @@ -37,6 +38,7 @@ layout(location = 1) out vec3 Normal; layout(location = 2) out vec2 TexCoord; layout(location = 3) flat out vec3 InstanceOrigin; layout(location = 4) out float ModelHeight; +layout(location = 5) out float vFadeAlpha; void main() { vec4 pos = vec4(aPos, 1.0); @@ -86,6 +88,7 @@ void main() { InstanceOrigin = push.model[3].xyz; ModelHeight = pos.z; + vFadeAlpha = push.fadeAlpha; gl_Position = projection * view * worldPos; } diff --git a/assets/shaders/m2.vert.spv b/assets/shaders/m2.vert.spv index 9d5411d3265d74b22a7c8c7a018068f48d62b12b..8397440f03249e030dbcb99c3f5adbb3e3403381 100644 GIT binary patch delta 636 zcmXw0%}N4M82zr}j1D4EA|j>;5-r-a35qf>f2+x?>}MBJR>Gi#)>e9guz7(ttwZJ! zdWz@;+O+5ag3g&{1`hX}@0|O6_ul!czSQGds1OlR5|g-`P2wwRZc;AlH6_xVJ}#=H zA%^5cL+L$pJfl zeG_eXKJbKqwd;02jO)L4H;AMQC)a_nuS!4PlK22TJJ dgs6|f5%lV8j+4tNz}v7n4@k}2FMi7^@CWGMMw0*l delta 442 zcmXw#K}y3=5QgU^O+s6ONET9JMJVFZl^|$^m};8ZXsfsuQISAs7p-eoE(ACAT|iGD zk_))-65hZ|2>yRe0zWhJ&Hv`j%WL#==$b~y5mC|4n-DQ`f=`5Y6(}~#Ppqp5}igarEZF)?&;0f=ute4K_SCi{ou|Q+}X1(V( z_OlW60jhD$|EW?j-a^J$F{5N0T`(QgJzRNF|5lFPYP9lUa;((8sfRVqz s!}&;lOTBjogvbM_eV4B|4Jy^l3fyM^%=zhwvd1A31&Z2$lO diff --git a/include/rendering/character_renderer.hpp b/include/rendering/character_renderer.hpp index 3a2d24ec..c6f63451 100644 --- a/include/rendering/character_renderer.hpp +++ b/include/rendering/character_renderer.hpp @@ -178,6 +178,9 @@ private: bool hasOverrideModelMatrix = false; glm::mat4 overrideModelMatrix{1.0f}; + // Bone update throttling (skip frames for distant characters) + uint32_t boneUpdateCounter = 0; + // Per-instance bone SSBO (double-buffered per frame) VkBuffer boneBuffer[2] = {}; VmaAllocation boneAlloc[2] = {}; diff --git a/include/rendering/m2_renderer.hpp b/include/rendering/m2_renderer.hpp index 54c2c771..e91e08d3 100644 --- a/include/rendering/m2_renderer.hpp +++ b/include/rendering/m2_renderer.hpp @@ -371,6 +371,11 @@ private: ::VkBuffer m2ParticleVB_ = VK_NULL_HANDLE; VmaAllocation m2ParticleVBAlloc_ = VK_NULL_HANDLE; void* m2ParticleVBMapped_ = nullptr; + // Dedicated glow sprite vertex buffer (separate from particle VB to avoid data race) + static constexpr size_t MAX_GLOW_SPRITES = 2000; + ::VkBuffer glowVB_ = VK_NULL_HANDLE; + VmaAllocation glowVBAlloc_ = VK_NULL_HANDLE; + void* glowVBMapped_ = nullptr; std::unordered_map models; std::vector instances; @@ -477,6 +482,7 @@ private: // Cached camera state from update() for frustum-culling bones glm::vec3 cachedCamPos_ = glm::vec3(0.0f); float cachedMaxRenderDistSq_ = 0.0f; + float smoothedRenderDist_ = 1000.0f; // Smoothed render distance to prevent flickering // Thread count for parallel bone animation uint32_t numAnimThreads_ = 1; diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp index ab0bbe78..a0cf967b 100644 --- a/src/rendering/character_renderer.cpp +++ b/src/rendering/character_renderer.cpp @@ -1423,20 +1423,53 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) { } // Only update animations for nearby characters (performance optimization) - // Collect instances that need updates + // Collect instances that need bone recomputation, with distance-based throttling std::vector> toUpdate; toUpdate.reserve(instances.size()); for (auto& pair : instances) { - float distSq = glm::distance2(pair.second.position, cameraPos); - if (distSq < animUpdateRadiusSq) { - toUpdate.push_back(std::ref(pair.second)); + auto& inst = pair.second; + + // Skip weapon instances — their transforms are set by parent bones + if (inst.hasOverrideModelMatrix) continue; + + float distSq = glm::distance2(inst.position, cameraPos); + if (distSq >= animUpdateRadiusSq) continue; + + // Always advance animation time (cheap) + auto modelIt = models.find(inst.modelId); + if (modelIt != models.end() && !modelIt->second.data.sequences.empty()) { + if (inst.currentSequenceIndex < 0) { + inst.currentSequenceIndex = 0; + inst.currentAnimationId = modelIt->second.data.sequences[0].id; + } + const auto& seq = modelIt->second.data.sequences[inst.currentSequenceIndex]; + inst.animationTime += deltaTime * 1000.0f; + if (seq.duration > 0 && inst.animationTime >= static_cast(seq.duration)) { + if (inst.animationLoop) { + inst.animationTime = std::fmod(inst.animationTime, static_cast(seq.duration)); + } else { + inst.animationTime = static_cast(seq.duration); + } + } + } + + // Distance-tiered bone throttling: near=every frame, mid=every 3rd, far=every 6th + uint32_t boneInterval = 1; + if (distSq > 60.0f * 60.0f) boneInterval = 6; + else if (distSq > 30.0f * 30.0f) boneInterval = 3; + + inst.boneUpdateCounter++; + bool needsBones = (inst.boneUpdateCounter >= boneInterval) || inst.boneMatrices.empty(); + if (needsBones) { + inst.boneUpdateCounter = 0; + toUpdate.push_back(std::ref(inst)); } } const size_t updatedCount = toUpdate.size(); - // Thread animation updates in chunks to avoid spawning one task per instance. + // Thread bone matrix computation in chunks if (updatedCount >= 8 && numAnimThreads_ > 1) { static const size_t minAnimWorkPerThread = std::max( 16, envSizeOrDefault("WOWEE_CHAR_ANIM_WORK_PER_THREAD", 64)); @@ -1446,7 +1479,7 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) { if (numThreads <= 1) { for (auto& instRef : toUpdate) { - updateAnimation(instRef.get(), deltaTime); + calculateBoneMatrices(instRef.get()); } } else { const size_t chunkSize = updatedCount / numThreads; @@ -1461,9 +1494,9 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) { for (size_t t = 0; t < numThreads; t++) { size_t end = start + chunkSize + (t < remainder ? 1 : 0); animFutures_.push_back(std::async(std::launch::async, - [this, &toUpdate, start, end, deltaTime]() { + [this, &toUpdate, start, end]() { for (size_t i = start; i < end; i++) { - updateAnimation(toUpdate[i].get(), deltaTime); + calculateBoneMatrices(toUpdate[i].get()); } })); start = end; @@ -1474,9 +1507,8 @@ void CharacterRenderer::update(float deltaTime, const glm::vec3& cameraPos) { } } } else { - // Sequential for small counts (avoid thread overhead) for (auto& instRef : toUpdate) { - updateAnimation(instRef.get(), deltaTime); + calculateBoneMatrices(instRef.get()); } } diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp index 5ec1f70f..e7fbd6ef 100644 --- a/src/rendering/m2_renderer.cpp +++ b/src/rendering/m2_renderer.cpp @@ -401,7 +401,7 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout VkPushConstantRange pushRange{}; pushRange.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; pushRange.offset = 0; - pushRange.size = 84; // mat4(64) + vec2(8) + int(4) + int(4) + int(4) + pushRange.size = 88; // mat4(64) + vec2(8) + int(4) + int(4) + int(4) + float(4) VkPipelineLayoutCreateInfo ci{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO}; ci.setLayoutCount = 3; @@ -591,6 +591,11 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout bci.size = MAX_M2_PARTICLES * 9 * sizeof(float); vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &m2ParticleVB_, &m2ParticleVBAlloc_, &allocInfo); m2ParticleVBMapped_ = allocInfo.pMappedData; + + // Dedicated glow sprite buffer (separate from particle VB to avoid data race) + bci.size = MAX_GLOW_SPRITES * 9 * sizeof(float); + vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &glowVB_, &glowVBAlloc_, &allocInfo); + glowVBMapped_ = allocInfo.pMappedData; } // --- Create white fallback texture --- @@ -689,6 +694,7 @@ void M2Renderer::shutdown() { // Clean up particle buffers if (smokeVB_) { vmaDestroyBuffer(alloc, smokeVB_, smokeVBAlloc_); smokeVB_ = VK_NULL_HANDLE; } if (m2ParticleVB_) { vmaDestroyBuffer(alloc, m2ParticleVB_, m2ParticleVBAlloc_); m2ParticleVB_ = VK_NULL_HANDLE; } + if (glowVB_) { vmaDestroyBuffer(alloc, glowVB_, glowVBAlloc_); glowVB_ = VK_NULL_HANDLE; } smokeParticles.clear(); // Destroy pipelines @@ -2104,10 +2110,16 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const lastDrawCallCount = 0; - // Adaptive render distance: tiered by instance density to cap draw calls - const float maxRenderDistance = (instances.size() > 2000) ? 300.0f - : (instances.size() > 1000) ? 500.0f - : 1000.0f; + // Adaptive render distance: smoothed to prevent pop-in/pop-out flickering + const float targetRenderDist = (instances.size() > 2000) ? 300.0f + : (instances.size() > 1000) ? 500.0f + : 1000.0f; + // Smooth transitions: shrink slowly (avoid popping out nearby objects) + const float shrinkRate = 0.005f; // very slow decrease + const float growRate = 0.05f; // faster increase + float blendRate = (targetRenderDist < smoothedRenderDist_) ? shrinkRate : growRate; + smoothedRenderDist_ = glm::mix(smoothedRenderDist_, targetRenderDist, blendRate); + const float maxRenderDistance = smoothedRenderDist_; const float maxRenderDistanceSq = maxRenderDistance * maxRenderDistance; const float fadeStartFraction = 0.75f; const glm::vec3 camPos = camera.getPosition(); @@ -2127,15 +2139,14 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const for (uint32_t i = 0; i < static_cast(instances.size()); ++i) { const auto& instance = instances[i]; - // Fast early rejection: skip instances that are definitely too far glm::vec3 toCam = instance.position - camPos; float distSq = glm::dot(toCam, toCam); - if (distSq > maxPossibleDistSq) continue; // Early out before model lookup auto it = models.find(instance.modelId); if (it == models.end()) continue; const M2ModelGPU& model = it->second; if (!model.isValid() || model.isSmoke || model.isInvisibleTrap) continue; + float worldRadius = model.boundRadius * instance.scale; float cullRadius = worldRadius; if (model.disableAnimation) { @@ -2146,15 +2157,13 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const effectiveMaxDistSq *= 2.6f; } if (model.isGroundDetail) { - // Keep clutter local so distant grass doesn't overdraw the scene. effectiveMaxDistSq *= 0.75f; } - // Removed aggressive small-object distance caps to prevent city pop-out - // Small props (barrels, lanterns, etc.) now use same distance as larger objects + + if (distSq > maxPossibleDistSq) continue; if (distSq > effectiveMaxDistSq) continue; - // Frustum cull with moderate padding to prevent edge pop-out during camera rotation - // Reduced from 2.5x to 1.5x for better performance + // Frustum cull with padding float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f); if (cullRadius > 0.0f && !frustum.intersectsSphere(instance.position, paddedRadius)) continue; @@ -2179,6 +2188,7 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const int texCoordSet; int useBones; int isFoliage; + float fadeAlpha; }; // Bind per-frame descriptor set (set 0) — shared across all draws @@ -2390,10 +2400,10 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const currentPipeline = desiredPipeline; } - // Update material UBO with per-draw dynamic values (fadeAlpha, interiorDarken) + // Update material UBO with per-draw dynamic values (interiorDarken, forceCutout overrides) + // Note: fadeAlpha is in push constants (per-draw) to avoid shared-UBO race if (batch.materialUBOMapped) { auto* mat = static_cast(batch.materialUBOMapped); - mat->fadeAlpha = instanceFadeAlpha; mat->interiorDarken = insideInterior ? 1.0f : 0.0f; if (batch.colorKeyBlack) { mat->colorKeyThreshold = (effectiveBlendMode == 4 || effectiveBlendMode == 5) ? 0.7f : 0.08f; @@ -2419,6 +2429,7 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const pc.texCoordSet = static_cast(batch.textureUnit); pc.useBones = useBones ? 1 : 0; pc.isFoliage = model.shadowWindFoliage ? 1 : 0; + pc.fadeAlpha = instanceFadeAlpha; vkCmdPushConstants(cmd, pipelineLayout_, VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(pc), &pc); vkCmdDrawIndexed(cmd, batch.indexCount, 1, batch.indexStart, 0, 0); @@ -2427,7 +2438,7 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const } // Render glow sprites as billboarded additive point lights - if (!glowSprites_.empty() && particleAdditivePipeline_ && m2ParticleVB_ && glowTexDescSet_) { + if (!glowSprites_.empty() && particleAdditivePipeline_ && glowVB_ && glowTexDescSet_) { vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, particleAdditivePipeline_); vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, particlePipelineLayout_, 0, 1, &perFrameSet, 0, nullptr); @@ -2454,11 +2465,11 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const glowData.push_back(0.0f); } - size_t uploadCount = std::min(glowSprites_.size(), MAX_M2_PARTICLES); - memcpy(m2ParticleVBMapped_, glowData.data(), uploadCount * 9 * sizeof(float)); + size_t uploadCount = std::min(glowSprites_.size(), MAX_GLOW_SPRITES); + memcpy(glowVBMapped_, glowData.data(), uploadCount * 9 * sizeof(float)); VkDeviceSize offset = 0; - vkCmdBindVertexBuffers(cmd, 0, 1, &m2ParticleVB_, &offset); + vkCmdBindVertexBuffers(cmd, 0, 1, &glowVB_, &offset); vkCmdDraw(cmd, static_cast(uploadCount), 1, 0, 0); } From 2e432fc1233277a70e4c7426dfdfe9bafa37a3bd Mon Sep 17 00:00:00 2001 From: Kelsi Date: Wed, 4 Mar 2026 08:28:21 -0800 Subject: [PATCH 2/5] Eliminate per-instance hash lookups in M2 render/shadow culling loops Use cached model flags (isValid, isSmoke, isInvisibleTrap, isGroundDetail, disableAnimation, boundRadius) on M2Instance instead of models.find() in the hot culling paths. Also complete cached flag initialization in createInstanceWithMatrix(). --- include/rendering/m2_renderer.hpp | 3 ++ src/rendering/m2_renderer.cpp | 67 ++++++++++++++++--------------- 2 files changed, 38 insertions(+), 32 deletions(-) diff --git a/include/rendering/m2_renderer.hpp b/include/rendering/m2_renderer.hpp index e91e08d3..83762a5c 100644 --- a/include/rendering/m2_renderer.hpp +++ b/include/rendering/m2_renderer.hpp @@ -179,6 +179,9 @@ struct M2Instance { bool cachedDisableAnimation = false; bool cachedIsSmoke = false; bool cachedHasParticleEmitters = false; + bool cachedIsGroundDetail = false; + bool cachedIsInvisibleTrap = false; + bool cachedIsValid = false; float cachedBoundRadius = 0.0f; // Frame-skip optimization (update distant animations less frequently) diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp index e7fbd6ef..130b8b1a 100644 --- a/src/rendering/m2_renderer.cpp +++ b/src/rendering/m2_renderer.cpp @@ -1617,6 +1617,9 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position, instance.cachedIsSmoke = mdlRef.isSmoke; instance.cachedHasParticleEmitters = !mdlRef.particleEmitters.empty(); instance.cachedBoundRadius = mdlRef.boundRadius; + instance.cachedIsGroundDetail = mdlRef.isGroundDetail; + instance.cachedIsInvisibleTrap = mdlRef.isInvisibleTrap; + instance.cachedIsValid = mdlRef.isValid(); // Initialize animation: play first sequence (usually Stand/Idle) const auto& mdl = mdlRef; @@ -1691,6 +1694,9 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4& instance.cachedIsSmoke = mdl2.isSmoke; instance.cachedHasParticleEmitters = !mdl2.particleEmitters.empty(); instance.cachedBoundRadius = mdl2.boundRadius; + instance.cachedIsGroundDetail = mdl2.isGroundDetail; + instance.cachedIsInvisibleTrap = mdl2.isInvisibleTrap; + instance.cachedIsValid = mdl2.isValid(); // Initialize animation if (mdl2.hasAnimation && !mdl2.disableAnimation && !mdl2.sequences.empty()) { @@ -2139,28 +2145,26 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const for (uint32_t i = 0; i < static_cast(instances.size()); ++i) { const auto& instance = instances[i]; + // Use cached model flags — no hash lookup needed + if (!instance.cachedIsValid || instance.cachedIsSmoke || instance.cachedIsInvisibleTrap) continue; + glm::vec3 toCam = instance.position - camPos; float distSq = glm::dot(toCam, toCam); + if (distSq > maxPossibleDistSq) continue; - auto it = models.find(instance.modelId); - if (it == models.end()) continue; - const M2ModelGPU& model = it->second; - if (!model.isValid() || model.isSmoke || model.isInvisibleTrap) continue; - - float worldRadius = model.boundRadius * instance.scale; + float worldRadius = instance.cachedBoundRadius * instance.scale; float cullRadius = worldRadius; - if (model.disableAnimation) { + if (instance.cachedDisableAnimation) { cullRadius = std::max(cullRadius, 3.0f); } float effectiveMaxDistSq = maxRenderDistanceSq * std::max(1.0f, cullRadius / 12.0f); - if (model.disableAnimation) { + if (instance.cachedDisableAnimation) { effectiveMaxDistSq *= 2.6f; } - if (model.isGroundDetail) { + if (instance.cachedIsGroundDetail) { effectiveMaxDistSq *= 0.75f; } - if (distSq > maxPossibleDistSq) continue; if (distSq > effectiveMaxDistSq) continue; // Frustum cull with padding @@ -2278,12 +2282,11 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const } } - // LOD selection based on distance - float dist = std::sqrt(entry.distSq); + // LOD selection based on squared distance (avoid sqrt) uint16_t desiredLOD = 0; - if (dist > 150.0f) desiredLOD = 3; - else if (dist > 80.0f) desiredLOD = 2; - else if (dist > 40.0f) desiredLOD = 1; + if (entry.distSq > 150.0f * 150.0f) desiredLOD = 3; + else if (entry.distSq > 80.0f * 80.0f) desiredLOD = 2; + else if (entry.distSq > 40.0f * 40.0f) desiredLOD = 1; uint16_t targetLOD = desiredLOD; if (desiredLOD > 0) { @@ -2450,23 +2453,21 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const vkCmdPushConstants(cmd, particlePipelineLayout_, VK_SHADER_STAGE_FRAGMENT_BIT, 0, sizeof(particlePush), &particlePush); - // Build and upload vertex data - std::vector glowData; - glowData.reserve(glowSprites_.size() * 9); - for (const auto& gs : glowSprites_) { - glowData.push_back(gs.worldPos.x); - glowData.push_back(gs.worldPos.y); - glowData.push_back(gs.worldPos.z); - glowData.push_back(gs.color.r); - glowData.push_back(gs.color.g); - glowData.push_back(gs.color.b); - glowData.push_back(gs.color.a); - glowData.push_back(gs.size); - glowData.push_back(0.0f); - } - + // Write glow vertex data directly to mapped buffer (no temp vector) size_t uploadCount = std::min(glowSprites_.size(), MAX_GLOW_SPRITES); - memcpy(glowVBMapped_, glowData.data(), uploadCount * 9 * sizeof(float)); + float* dst = static_cast(glowVBMapped_); + for (size_t gi = 0; gi < uploadCount; gi++) { + const auto& gs = glowSprites_[gi]; + *dst++ = gs.worldPos.x; + *dst++ = gs.worldPos.y; + *dst++ = gs.worldPos.z; + *dst++ = gs.color.r; + *dst++ = gs.color.g; + *dst++ = gs.color.b; + *dst++ = gs.color.a; + *dst++ = gs.size; + *dst++ = 0.0f; + } VkDeviceSize offset = 0; vkCmdBindVertexBuffers(cmd, 0, 1, &glowVB_, &offset); @@ -2748,6 +2749,9 @@ void M2Renderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& lightSpaceMa const M2ModelGPU* currentModel = nullptr; for (const auto& instance : instances) { + // Use cached flags to skip early without hash lookup + if (!instance.cachedIsValid || instance.cachedIsSmoke || instance.cachedIsInvisibleTrap) continue; + // Distance cull against shadow frustum glm::vec3 diff = instance.position - shadowCenter; if (glm::dot(diff, diff) > shadowRadiusSq) continue; @@ -2755,7 +2759,6 @@ void M2Renderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& lightSpaceMa auto modelIt = models.find(instance.modelId); if (modelIt == models.end()) continue; const M2ModelGPU& model = modelIt->second; - if (!model.isValid() || model.isSmoke || model.isInvisibleTrap) continue; // Filter: only draw foliage models in foliage pass, non-foliage in non-foliage pass if (model.shadowWindFoliage != foliagePass) continue; From e6acb4ac9ad18073ddd0b7c7f515ad2a2cda986a Mon Sep 17 00:00:00 2001 From: Kelsi Date: Wed, 4 Mar 2026 08:33:56 -0800 Subject: [PATCH 3/5] Optimize animation hotpaths: binary keyframe search, eliminate sqrt calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace O(n) linear keyframe search with O(log n) binary search in both M2 and Character renderers (runs thousands of times per frame) - Smoke particle removal: swap-and-pop instead of O(n²) vector erase - Character render backface cull: eliminate sqrt via squared comparison - Quaternion validation: use length² instead of sqrt-based length check --- src/rendering/character_renderer.cpp | 28 +++++++++++-------- src/rendering/m2_renderer.cpp | 42 +++++++++++++++------------- 2 files changed, 38 insertions(+), 32 deletions(-) diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp index a0cf967b..0c52e43b 100644 --- a/src/rendering/character_renderer.cpp +++ b/src/rendering/character_renderer.cpp @@ -1580,13 +1580,12 @@ int CharacterRenderer::findKeyframeIndex(const std::vector& timestamps if (timestamps.empty()) return -1; if (timestamps.size() == 1) return 0; - // Binary search for the keyframe bracket - for (size_t i = 0; i < timestamps.size() - 1; i++) { - if (time < static_cast(timestamps[i + 1])) { - return static_cast(i); - } - } - return static_cast(timestamps.size() - 2); + // Binary search: find first element > t, then back up one + uint32_t t = static_cast(time); + auto it = std::upper_bound(timestamps.begin(), timestamps.end(), t); + if (it == timestamps.begin()) return 0; + size_t idx = static_cast(it - timestamps.begin()) - 1; + return static_cast(std::min(idx, timestamps.size() - 2)); } glm::vec3 CharacterRenderer::interpolateVec3(const pipeline::M2AnimationTrack& track, @@ -1630,8 +1629,8 @@ glm::quat CharacterRenderer::interpolateQuat(const pipeline::M2AnimationTrack& t if (keys.timestamps.empty() || keys.quatValues.empty()) return identity; auto safeQuat = [&](const glm::quat& q) -> glm::quat { - float len = glm::length(q); - if (len < 0.001f || std::isnan(len)) return identity; + float lenSq = q.x*q.x + q.y*q.y + q.z*q.z + q.w*q.w; + if (lenSq < 0.000001f || std::isnan(lenSq)) return identity; return q; }; @@ -1773,9 +1772,14 @@ void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, float distSq = glm::dot(toInst, toInst); if (distSq > renderRadiusSq) continue; if (distSq > nearNoConeCullSq) { - float invDist = 1.0f / std::sqrt(distSq); - float facingDot = glm::dot(toInst, camForward) * invDist; - if (facingDot < backfaceDotCull) continue; + // Backface cull without sqrt: dot(toInst, camFwd) / |toInst| < threshold + // ⟺ dot < 0 || dot² < threshold² * distSq (when threshold < 0, dot must be negative) + float rawDot = glm::dot(toInst, camForward); + if (backfaceDotCull >= 0.0f) { + if (rawDot < 0.0f || rawDot * rawDot < backfaceDotCull * backfaceDotCull * distSq) continue; + } else { + if (rawDot < 0.0f && rawDot * rawDot > backfaceDotCull * backfaceDotCull * distSq) continue; + } } } diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp index 130b8b1a..edf02243 100644 --- a/src/rendering/m2_renderer.cpp +++ b/src/rendering/m2_renderer.cpp @@ -1741,12 +1741,12 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4& static int findKeyframeIndex(const std::vector& timestamps, float time) { if (timestamps.empty()) return -1; if (timestamps.size() == 1) return 0; - for (size_t i = 0; i < timestamps.size() - 1; i++) { - if (time < static_cast(timestamps[i + 1])) { - return static_cast(i); - } - } - return static_cast(timestamps.size() - 2); + uint32_t t = static_cast(time); + // Binary search: find first element > t, then back up one + auto it = std::upper_bound(timestamps.begin(), timestamps.end(), t); + if (it == timestamps.begin()) return 0; + size_t idx = static_cast(it - timestamps.begin()) - 1; + return static_cast(std::min(idx, timestamps.size() - 2)); } // Resolve sequence index and time for a track, handling global sequences. @@ -1803,8 +1803,8 @@ static glm::quat interpQuat(const pipeline::M2AnimationTrack& track, const auto& keys = track.sequences[si]; if (keys.timestamps.empty() || keys.quatValues.empty()) return identity; auto safe = [&](const glm::quat& q) -> glm::quat { - float len = glm::length(q); - if (len < 0.001f || std::isnan(len)) return identity; + float lenSq = q.x*q.x + q.y*q.y + q.z*q.z + q.w*q.w; + if (lenSq < 0.000001f || std::isnan(lenSq)) return identity; return q; }; if (keys.quatValues.size() == 1) return safe(keys.quatValues[0]); @@ -1907,21 +1907,23 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm:: smokeEmitAccum = 0.0f; } - // --- Update existing smoke particles --- - for (auto it = smokeParticles.begin(); it != smokeParticles.end(); ) { - it->life += deltaTime; - if (it->life >= it->maxLife) { - it = smokeParticles.erase(it); + // --- Update existing smoke particles (swap-and-pop for O(1) removal) --- + for (size_t i = 0; i < smokeParticles.size(); ) { + auto& p = smokeParticles[i]; + p.life += deltaTime; + if (p.life >= p.maxLife) { + smokeParticles[i] = smokeParticles.back(); + smokeParticles.pop_back(); continue; } - it->position += it->velocity * deltaTime; - it->velocity.z *= 0.98f; // Slight deceleration - it->velocity.x += distDrift(smokeRng) * deltaTime; - it->velocity.y += distDrift(smokeRng) * deltaTime; + p.position += p.velocity * deltaTime; + p.velocity.z *= 0.98f; // Slight deceleration + p.velocity.x += distDrift(smokeRng) * deltaTime; + p.velocity.y += distDrift(smokeRng) * deltaTime; // Grow from 1.0 to 3.5 over lifetime - float t = it->life / it->maxLife; - it->size = 1.0f + t * 2.5f; - ++it; + float t = p.life / p.maxLife; + p.size = 1.0f + t * 2.5f; + ++i; } // --- Normal M2 animation update --- From 84b04446c1e55ad0438fbb212a3858dcdb479295 Mon Sep 17 00:00:00 2001 From: Kelsi Date: Wed, 4 Mar 2026 09:19:02 -0800 Subject: [PATCH 4/5] Per-instance NPC hair/skin textures, fix binary search float comparison - NPC hair/skin textures now use per-instance overrides instead of shared model-level textures, so each NPC shows its own hair color/style - Hair/skin DBC lookup runs for every NPC instance (including cached models) rather than only on first load - Fix keyframe binary search to use float comparison matching original linear scan semantics --- src/core/application.cpp | 71 ++++++++++++++++++++++++++++ src/rendering/character_renderer.cpp | 6 +-- src/rendering/m2_renderer.cpp | 6 +-- 3 files changed, 77 insertions(+), 6 deletions(-) diff --git a/src/core/application.cpp b/src/core/application.cpp index 2fa4b360..da49887d 100644 --- a/src/core/application.cpp +++ b/src/core/application.cpp @@ -4790,6 +4790,77 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x return; } + // Per-instance hair/skin texture overrides — runs for ALL NPCs (including cached models) + // so that each NPC gets its own hair/skin color regardless of model sharing. + { + auto itDD = displayDataMap_.find(displayId); + if (itDD != displayDataMap_.end() && itDD->second.extraDisplayId != 0) { + auto itExtra2 = humanoidExtraMap_.find(itDD->second.extraDisplayId); + if (itExtra2 != humanoidExtraMap_.end()) { + const auto& extra = itExtra2->second; + const auto* md = charRenderer->getModelData(modelId); + if (md) { + auto charSectionsDbc2 = assetManager->loadDBC("CharSections.dbc"); + if (charSectionsDbc2) { + const auto* csL = pipeline::getActiveDBCLayout() + ? pipeline::getActiveDBCLayout()->getLayout("CharSections") : nullptr; + uint32_t tgtRace = static_cast(extra.raceId); + uint32_t tgtSex = static_cast(extra.sexId); + + // Look up hair texture (section 3) + for (uint32_t r = 0; r < charSectionsDbc2->getRecordCount(); r++) { + uint32_t rId = charSectionsDbc2->getUInt32(r, csL ? (*csL)["RaceID"] : 1); + uint32_t sId = charSectionsDbc2->getUInt32(r, csL ? (*csL)["SexID"] : 2); + if (rId != tgtRace || sId != tgtSex) continue; + uint32_t sec = charSectionsDbc2->getUInt32(r, csL ? (*csL)["BaseSection"] : 3); + if (sec != 3) continue; + uint32_t var = charSectionsDbc2->getUInt32(r, csL ? (*csL)["VariationIndex"] : 4); + uint32_t col = charSectionsDbc2->getUInt32(r, csL ? (*csL)["ColorIndex"] : 5); + if (var != static_cast(extra.hairStyleId)) continue; + if (col != static_cast(extra.hairColorId)) continue; + std::string hairPath = charSectionsDbc2->getString(r, csL ? (*csL)["Texture1"] : 6); + if (!hairPath.empty()) { + rendering::VkTexture* hairTex = charRenderer->loadTexture(hairPath); + if (hairTex) { + for (size_t ti = 0; ti < md->textures.size(); ti++) { + if (md->textures[ti].type == 6) { + charRenderer->setTextureSlotOverride(instanceId, static_cast(ti), hairTex); + } + } + } + } + break; + } + + // Look up skin texture (section 0) for per-instance skin color + for (uint32_t r = 0; r < charSectionsDbc2->getRecordCount(); r++) { + uint32_t rId = charSectionsDbc2->getUInt32(r, csL ? (*csL)["RaceID"] : 1); + uint32_t sId = charSectionsDbc2->getUInt32(r, csL ? (*csL)["SexID"] : 2); + if (rId != tgtRace || sId != tgtSex) continue; + uint32_t sec = charSectionsDbc2->getUInt32(r, csL ? (*csL)["BaseSection"] : 3); + if (sec != 0) continue; + uint32_t col = charSectionsDbc2->getUInt32(r, csL ? (*csL)["ColorIndex"] : 5); + if (col != static_cast(extra.skinId)) continue; + std::string skinPath = charSectionsDbc2->getString(r, csL ? (*csL)["Texture1"] : 6); + if (!skinPath.empty()) { + rendering::VkTexture* skinTex = charRenderer->loadTexture(skinPath); + if (skinTex) { + for (size_t ti = 0; ti < md->textures.size(); ti++) { + uint32_t tt = md->textures[ti].type; + if (tt == 1 || tt == 11) { + charRenderer->setTextureSlotOverride(instanceId, static_cast(ti), skinTex); + } + } + } + } + break; + } + } + } + } + } + } + // Optional humanoid NPC geoset mask. Disabled by default because forcing geosets // causes long-standing visual artifacts on some models (missing waist, phantom // bracers, flickering apron overlays). Prefer model defaults. diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp index 0c52e43b..11fa2ae5 100644 --- a/src/rendering/character_renderer.cpp +++ b/src/rendering/character_renderer.cpp @@ -1580,9 +1580,9 @@ int CharacterRenderer::findKeyframeIndex(const std::vector& timestamps if (timestamps.empty()) return -1; if (timestamps.size() == 1) return 0; - // Binary search: find first element > t, then back up one - uint32_t t = static_cast(time); - auto it = std::upper_bound(timestamps.begin(), timestamps.end(), t); + // Binary search using float comparison to match original semantics exactly + auto it = std::upper_bound(timestamps.begin(), timestamps.end(), time, + [](float t, uint32_t ts) { return t < static_cast(ts); }); if (it == timestamps.begin()) return 0; size_t idx = static_cast(it - timestamps.begin()) - 1; return static_cast(std::min(idx, timestamps.size() - 2)); diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp index edf02243..2ea99420 100644 --- a/src/rendering/m2_renderer.cpp +++ b/src/rendering/m2_renderer.cpp @@ -1741,9 +1741,9 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4& static int findKeyframeIndex(const std::vector& timestamps, float time) { if (timestamps.empty()) return -1; if (timestamps.size() == 1) return 0; - uint32_t t = static_cast(time); - // Binary search: find first element > t, then back up one - auto it = std::upper_bound(timestamps.begin(), timestamps.end(), t); + // Binary search using float comparison to match original semantics exactly + auto it = std::upper_bound(timestamps.begin(), timestamps.end(), time, + [](float t, uint32_t ts) { return t < static_cast(ts); }); if (it == timestamps.begin()) return 0; size_t idx = static_cast(it - timestamps.begin()) - 1; return static_cast(std::min(idx, timestamps.size() - 2)); From bec7a678aa75cf272f0a511dc02a87c30b200a67 Mon Sep 17 00:00:00 2001 From: Kelsi Date: Wed, 4 Mar 2026 09:25:00 -0800 Subject: [PATCH 5/5] Fix missing floors in dungeon instances by restricting LOD detection Low-vertex groups (<100 verts) were incorrectly marked as distance-only LOD shells in small WMOs like Stockades. Now only applies this heuristic to large WMOs (50+ groups) where it's needed for city exterior shells. --- src/rendering/wmo_renderer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rendering/wmo_renderer.cpp b/src/rendering/wmo_renderer.cpp index 18230194..b71f0287 100644 --- a/src/rendering/wmo_renderer.cpp +++ b/src/rendering/wmo_renderer.cpp @@ -536,7 +536,7 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) { // Flag 0x80 on INDOOR groups in large WMOs = interior cathedral shell bool hasFlag80 = (wmoGroup.flags & 0x80) != 0; bool isIndoor = (wmoGroup.flags & 0x2000) != 0; - if (nVerts < 100 || (alwaysDraw && nVerts < 5000) || (isFacade && isLargeWmo) || (isCityShell && isLargeWmo) || (hasFlag80 && isIndoor && isLargeWmo)) { + if ((nVerts < 100 && isLargeWmo) || (alwaysDraw && nVerts < 5000) || (isFacade && isLargeWmo) || (isCityShell && isLargeWmo) || (hasFlag80 && isIndoor && isLargeWmo)) { resources.isLOD = true; } modelData.groups.push_back(resources);