Kelsidavis-WoWee/src/rendering/m2_renderer.cpp

1727 lines
83 KiB
C++
Raw Normal View History

#include "rendering/m2_renderer.hpp"
#include "rendering/m2_renderer_internal.h"
#include "rendering/m2_model_classifier.hpp"
#include "rendering/vk_context.hpp"
#include "rendering/vk_buffer.hpp"
#include "rendering/vk_texture.hpp"
#include "rendering/vk_pipeline.hpp"
#include "rendering/vk_shader.hpp"
#include "rendering/vk_utils.hpp"
#include "rendering/vk_frame_data.hpp"
#include "rendering/camera.hpp"
#include "rendering/frustum.hpp"
#include "pipeline/asset_manager.hpp"
#include "pipeline/blp_loader.hpp"
#include "core/logger.hpp"
#include "core/profiler.hpp"
#include <chrono>
#include <cctype>
#include <glm/gtc/matrix_transform.hpp>
#include <glm/gtc/type_ptr.hpp>
#include <glm/gtx/quaternion.hpp>
#include <unordered_set>
// 2026-02-07 19:20:37 -08:00
#include <functional>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <random>
#include <limits>
#include <future>
#include <thread>
namespace wowee {
namespace rendering {
namespace {
// Interprets the environment variable `key` as a boolean switch.
//
// Returns `defaultValue` when the variable is missing or empty. Otherwise the
// value is lower-cased and treated as disabled only if it is exactly one of
// "0", "false", "off" or "no"; every other non-empty value counts as enabled.
bool envFlagEnabled(const char* key, bool defaultValue) {
    const char* envText = std::getenv(key);
    if (envText == nullptr || envText[0] == '\0') {
        return defaultValue;
    }

    // Build a lower-cased copy; go through unsigned char so std::tolower is
    // never handed a negative value.
    std::string lowered;
    for (const char* cursor = envText; *cursor != '\0'; ++cursor) {
        const unsigned char ch = static_cast<unsigned char>(*cursor);
        lowered.push_back(static_cast<char>(std::tolower(ch)));
    }

    // Only these exact spellings switch the flag off.
    for (const char* disabledWord : {"0", "false", "off", "no"}) {
        if (lowered == disabledWord) {
            return false;
        }
    }
    return true;
}
} // namespace
void M2Instance::updateModelMatrix() {
modelMatrix = glm::mat4(1.0f);
modelMatrix = glm::translate(modelMatrix, position);
// Rotation in radians
modelMatrix = glm::rotate(modelMatrix, rotation.x, glm::vec3(1.0f, 0.0f, 0.0f));
modelMatrix = glm::rotate(modelMatrix, rotation.y, glm::vec3(0.0f, 1.0f, 0.0f));
modelMatrix = glm::rotate(modelMatrix, rotation.z, glm::vec3(0.0f, 0.0f, 1.0f));
modelMatrix = glm::scale(modelMatrix, glm::vec3(scale));
invModelMatrix = glm::inverse(modelMatrix);
}
// Construction does no work of its own; Vulkan resources are created later by
// initialize().
M2Renderer::M2Renderer() = default;
// Destruction delegates all cleanup to shutdown().
M2Renderer::~M2Renderer() {
    this->shutdown();
}
bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout,
pipeline::AssetManager* assets) {
if (initialized_) { assetManager = assets; return true; }
vkCtx_ = ctx;
assetManager = assets;
const unsigned hc = std::thread::hardware_concurrency();
const size_t availableCores = (hc > 1u) ? static_cast<size_t>(hc - 1u) : 1ull;
// Keep headroom for other frame tasks: M2 gets about half of non-main cores by default.
const size_t defaultAnimThreads = std::max<size_t>(1, availableCores / 2);
numAnimThreads_ = static_cast<uint32_t>(std::max<size_t>(
1, envSizeOrDefault("WOWEE_M2_ANIM_THREADS", defaultAnimThreads)));
LOG_INFO("Initializing M2 renderer (Vulkan, ", numAnimThreads_, " anim threads)...");
VkDevice device = vkCtx_->getDevice();
// --- Descriptor set layouts ---
// Material set layout (set 1): binding 0 = sampler2D, binding 2 = M2Material UBO
// (M2Params moved to push constants alongside model matrix)
{
VkDescriptorSetLayoutBinding bindings[2] = {};
bindings[0].binding = 0;
bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
bindings[0].descriptorCount = 1;
bindings[0].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[1].binding = 2;
bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[1].descriptorCount = 1;
bindings[1].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
VkDescriptorSetLayoutCreateInfo ci{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO};
ci.bindingCount = 2;
ci.pBindings = bindings;
vkCreateDescriptorSetLayout(device, &ci, nullptr, &materialSetLayout_);
}
// Bone set layout (set 2): binding 0 = STORAGE_BUFFER (bone matrices)
{
VkDescriptorSetLayoutBinding binding{};
binding.binding = 0;
binding.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
binding.descriptorCount = 1;
binding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
VkDescriptorSetLayoutCreateInfo ci{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO};
ci.bindingCount = 1;
ci.pBindings = &binding;
vkCreateDescriptorSetLayout(device, &ci, nullptr, &boneSetLayout_);
}
feat(animation): decompose AnimationController into FSM-based architecture Replace the 2,200-line monolithic AnimationController (goto-driven, single class, untestable) with a composed FSM architecture per refactor.md. New subsystem (src/rendering/animation/ — 16 headers, 10 sources): - CharacterAnimator: FSM composer implementing ICharacterAnimator - LocomotionFSM: idle/walk/run/sprint/jump/swim/strafe - CombatFSM: melee/ranged/spell cast/stun/hit reaction/charge - ActivityFSM: emote/loot/sit-down/sitting/sit-up - MountFSM: idle/run/flight/taxi/fidget/rear-up (per-instance RNG) - AnimCapabilitySet + AnimCapabilityProbe: probe once at model load, eliminate per-frame hasAnimation() linear search - AnimationManager: registry of CharacterAnimator by GUID - EmoteRegistry: DBC-backed emote command → animId singleton - FootstepDriver, SfxStateDriver: extracted from AnimationController animation_ids.hpp/.cpp moved to animation/ subdirectory (452 named constants); all include paths updated. AnimationController retained as thin adapter (~400 LOC): collects FrameInput, delegates to CharacterAnimator, applies AnimOutput. Priority order: Mount > Stun > HitReaction > Spell > Charge > Melee/Ranged > CombatIdle > Emote > Loot > Sit > Locomotion. STAY_IN_STATE policy when all FSMs return valid=false. Bugs fixed: - Remove static mt19937 in mount fidget (shared state across all mounted units) — replaced with per-instance seeded RNG - Remove goto from mounted animation branch (skipped init) - Remove per-frame hasAnimation() calls (now one probe at load) - Fix VK_INDEX_TYPE_UINT16 → UINT32 in shadow pass Tests (4 new suites, all ASAN+UBSan clean): - test_locomotion_fsm: 167 assertions - test_combat_fsm: 125 cases - test_activity_fsm: 112 cases - test_anim_capability: 56 cases docs/ANIMATION_SYSTEM.md added (architecture reference).
2026-04-05 12:27:35 +03:00
// Instance data set layout (set 3): binding 0 = STORAGE_BUFFER (per-instance data)
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
{
VkDescriptorSetLayoutBinding binding{};
binding.binding = 0;
binding.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
binding.descriptorCount = 1;
binding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
VkDescriptorSetLayoutCreateInfo ci{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO};
ci.bindingCount = 1;
ci.pBindings = &binding;
vkCreateDescriptorSetLayout(device, &ci, nullptr, &instanceSetLayout_);
}
// Particle texture set layout (set 1 for particles): binding 0 = sampler2D
{
VkDescriptorSetLayoutBinding binding{};
binding.binding = 0;
binding.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
binding.descriptorCount = 1;
binding.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
VkDescriptorSetLayoutCreateInfo ci{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO};
ci.bindingCount = 1;
ci.pBindings = &binding;
vkCreateDescriptorSetLayout(device, &ci, nullptr, &particleTexLayout_);
}
// --- Descriptor pools ---
{
VkDescriptorPoolSize sizes[] = {
{VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, MAX_MATERIAL_SETS + 256},
{VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, MAX_MATERIAL_SETS + 256},
};
VkDescriptorPoolCreateInfo ci{VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO};
ci.maxSets = MAX_MATERIAL_SETS + 256;
ci.poolSizeCount = 2;
ci.pPoolSizes = sizes;
ci.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
vkCreateDescriptorPool(device, &ci, nullptr, &materialDescPool_);
}
{
VkDescriptorPoolSize sizes[] = {
{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, MAX_BONE_SETS},
};
VkDescriptorPoolCreateInfo ci{VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO};
ci.maxSets = MAX_BONE_SETS;
ci.poolSizeCount = 1;
ci.pPoolSizes = sizes;
ci.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
vkCreateDescriptorPool(device, &ci, nullptr, &boneDescPool_);
}
2026-03-22 21:47:12 +03:00
// Create a small identity-bone SSBO + descriptor set so that non-animated
// draws always have a valid set 2 bound. The Intel ANV driver segfaults
// on vkCmdDrawIndexed when a declared descriptor set slot is unbound.
{
// Single identity matrix (bone 0 = identity)
glm::mat4 identity(1.0f);
VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
bci.size = sizeof(glm::mat4);
bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
VmaAllocationCreateInfo aci{};
aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
VmaAllocationInfo allocInfo{};
vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
&dummyBoneBuffer_, &dummyBoneAlloc_, &allocInfo);
if (allocInfo.pMappedData) {
memcpy(allocInfo.pMappedData, &identity, sizeof(identity));
}
dummyBoneSet_ = allocateBoneSet();
if (dummyBoneSet_) {
VkDescriptorBufferInfo bufInfo{};
bufInfo.buffer = dummyBoneBuffer_;
bufInfo.offset = 0;
bufInfo.range = sizeof(glm::mat4);
VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
write.dstSet = dummyBoneSet_;
write.dstBinding = 0;
write.descriptorCount = 1;
write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
write.pBufferInfo = &bufInfo;
vkUpdateDescriptorSets(device, 1, &write, 0, nullptr);
}
}
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
// Mega bone SSBO — consolidates all animated instance bones into one buffer per frame.
// Slot 0 = identity matrix (for non-animated instances), slots 1..N = animated instances.
{
const VkDeviceSize megaSize = MEGA_BONE_MAX_INSTANCES * MAX_BONES_PER_INSTANCE * sizeof(glm::mat4);
glm::mat4 identity(1.0f);
for (int i = 0; i < 2; i++) {
VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
bci.size = megaSize;
bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
VmaAllocationCreateInfo aci{};
aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
VmaAllocationInfo allocInfo{};
vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
&megaBoneBuffer_[i], &megaBoneAlloc_[i], &allocInfo);
megaBoneMapped_[i] = allocInfo.pMappedData;
// Slot 0: identity matrix (for non-animated instances)
if (megaBoneMapped_[i]) {
memcpy(megaBoneMapped_[i], &identity, sizeof(identity));
}
megaBoneSet_[i] = allocateBoneSet();
if (megaBoneSet_[i]) {
VkDescriptorBufferInfo bufInfo{};
bufInfo.buffer = megaBoneBuffer_[i];
bufInfo.offset = 0;
bufInfo.range = megaSize;
VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
write.dstSet = megaBoneSet_[i];
write.dstBinding = 0;
write.descriptorCount = 1;
write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
write.pBufferInfo = &bufInfo;
vkUpdateDescriptorSets(device, 1, &write, 0, nullptr);
}
}
}
feat(animation): decompose AnimationController into FSM-based architecture Replace the 2,200-line monolithic AnimationController (goto-driven, single class, untestable) with a composed FSM architecture per refactor.md. New subsystem (src/rendering/animation/ — 16 headers, 10 sources): - CharacterAnimator: FSM composer implementing ICharacterAnimator - LocomotionFSM: idle/walk/run/sprint/jump/swim/strafe - CombatFSM: melee/ranged/spell cast/stun/hit reaction/charge - ActivityFSM: emote/loot/sit-down/sitting/sit-up - MountFSM: idle/run/flight/taxi/fidget/rear-up (per-instance RNG) - AnimCapabilitySet + AnimCapabilityProbe: probe once at model load, eliminate per-frame hasAnimation() linear search - AnimationManager: registry of CharacterAnimator by GUID - EmoteRegistry: DBC-backed emote command → animId singleton - FootstepDriver, SfxStateDriver: extracted from AnimationController animation_ids.hpp/.cpp moved to animation/ subdirectory (452 named constants); all include paths updated. AnimationController retained as thin adapter (~400 LOC): collects FrameInput, delegates to CharacterAnimator, applies AnimOutput. Priority order: Mount > Stun > HitReaction > Spell > Charge > Melee/Ranged > CombatIdle > Emote > Loot > Sit > Locomotion. STAY_IN_STATE policy when all FSMs return valid=false. Bugs fixed: - Remove static mt19937 in mount fidget (shared state across all mounted units) — replaced with per-instance seeded RNG - Remove goto from mounted animation branch (skipped init) - Remove per-frame hasAnimation() calls (now one probe at load) - Fix VK_INDEX_TYPE_UINT16 → UINT32 in shadow pass Tests (4 new suites, all ASAN+UBSan clean): - test_locomotion_fsm: 167 assertions - test_combat_fsm: 125 cases - test_activity_fsm: 112 cases - test_anim_capability: 56 cases docs/ANIMATION_SYSTEM.md added (architecture reference).
2026-04-05 12:27:35 +03:00
// Instance data SSBO — per-frame buffer holding per-instance transforms, fade, bones.
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
// Shader reads instanceData[push.instanceDataOffset + gl_InstanceIndex].
{
static_assert(sizeof(M2InstanceGPU) == 96, "M2InstanceGPU must be 96 bytes (std430)");
const VkDeviceSize instBufSize = MAX_INSTANCE_DATA * sizeof(M2InstanceGPU);
// Descriptor pool for 2 sets (double-buffered)
VkDescriptorPoolSize poolSize{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2};
VkDescriptorPoolCreateInfo poolCi{VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO};
poolCi.maxSets = 2;
poolCi.poolSizeCount = 1;
poolCi.pPoolSizes = &poolSize;
vkCreateDescriptorPool(device, &poolCi, nullptr, &instanceDescPool_);
for (int i = 0; i < 2; i++) {
VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
bci.size = instBufSize;
bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
VmaAllocationCreateInfo aci{};
aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
VmaAllocationInfo allocInfo{};
vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
&instanceBuffer_[i], &instanceAlloc_[i], &allocInfo);
instanceMapped_[i] = allocInfo.pMappedData;
VkDescriptorSetAllocateInfo setAi{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
setAi.descriptorPool = instanceDescPool_;
setAi.descriptorSetCount = 1;
setAi.pSetLayouts = &instanceSetLayout_;
vkAllocateDescriptorSets(device, &setAi, &instanceSet_[i]);
VkDescriptorBufferInfo bufInfo{};
bufInfo.buffer = instanceBuffer_[i];
bufInfo.offset = 0;
bufInfo.range = instBufSize;
VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
write.dstSet = instanceSet_[i];
write.dstBinding = 0;
write.descriptorCount = 1;
write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
write.pBufferInfo = &bufInfo;
vkUpdateDescriptorSets(device, 1, &write, 0, nullptr);
}
}
feat(animation): decompose AnimationController into FSM-based architecture Replace the 2,200-line monolithic AnimationController (goto-driven, single class, untestable) with a composed FSM architecture per refactor.md. New subsystem (src/rendering/animation/ — 16 headers, 10 sources): - CharacterAnimator: FSM composer implementing ICharacterAnimator - LocomotionFSM: idle/walk/run/sprint/jump/swim/strafe - CombatFSM: melee/ranged/spell cast/stun/hit reaction/charge - ActivityFSM: emote/loot/sit-down/sitting/sit-up - MountFSM: idle/run/flight/taxi/fidget/rear-up (per-instance RNG) - AnimCapabilitySet + AnimCapabilityProbe: probe once at model load, eliminate per-frame hasAnimation() linear search - AnimationManager: registry of CharacterAnimator by GUID - EmoteRegistry: DBC-backed emote command → animId singleton - FootstepDriver, SfxStateDriver: extracted from AnimationController animation_ids.hpp/.cpp moved to animation/ subdirectory (452 named constants); all include paths updated. AnimationController retained as thin adapter (~400 LOC): collects FrameInput, delegates to CharacterAnimator, applies AnimOutput. Priority order: Mount > Stun > HitReaction > Spell > Charge > Melee/Ranged > CombatIdle > Emote > Loot > Sit > Locomotion. STAY_IN_STATE policy when all FSMs return valid=false. Bugs fixed: - Remove static mt19937 in mount fidget (shared state across all mounted units) — replaced with per-instance seeded RNG - Remove goto from mounted animation branch (skipped init) - Remove per-frame hasAnimation() calls (now one probe at load) - Fix VK_INDEX_TYPE_UINT16 → UINT32 in shadow pass Tests (4 new suites, all ASAN+UBSan clean): - test_locomotion_fsm: 167 assertions - test_combat_fsm: 125 cases - test_activity_fsm: 112 cases - test_anim_capability: 56 cases docs/ANIMATION_SYSTEM.md added (architecture reference).
2026-04-05 12:27:35 +03:00
// GPU frustum culling — compute pipeline, buffers, descriptors.
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
// Compute shader tests each instance bounding sphere against 6 frustum planes + distance.
// Output: uint visibility[] read back by CPU to skip culled instances in sortedVisible_ build.
{
static_assert(sizeof(CullInstanceGPU) == 32, "CullInstanceGPU must be 32 bytes (std430)");
feat(rendering): add HiZ occlusion culling & fix WMO interior shadows Implement GPU-driven Hierarchical-Z occlusion culling for M2 doodads using a depth pyramid built from the previous frame's depth buffer. The cull shader projects bounding spheres via prevViewProj (temporal reprojection) and samples the HiZ pyramid to reject hidden objects before the main render pass. Key implementation details: - Separate early compute submission (beginSingleTimeCommands + fence wait) eliminates 2-frame visibility staleness - Conservative safeguards prevent false culls: screen-edge guard, full VP row-vector AABB projection (Cauchy-Schwarz), 50% sphere inflation, depth bias, mip+1, min screen size threshold, camera motion dampening (auto-disable on fast rotations), and per-instance previouslyVisible flag tracking - Graceful fallback to frustum-only culling if HiZ init fails Fix dark WMO interiors by gating shadow map sampling on isInterior==0 in the WMO fragment shader. Interior groups (flag 0x2000) now rely solely on pre-baked MOCV vertex-color lighting + MOHD ambient color. Disable interiorDarken globally (was incorrectly darkening outdoor M2s when camera was inside a WMO). Use isInsideInteriorWMO() instead of isInsideWMO() for correct indoor detection. 
New files: - hiz_system.hpp/cpp: pyramid image management, compute pipeline, descriptors, mip-chain build dispatch, resize handling - hiz_build.comp.glsl: MAX-depth 2x2 reduction compute shader - m2_cull_hiz.comp.glsl: frustum + HiZ occlusion cull compute shader - test_indoor_shadows.cpp: 14 unit tests for shadow/interior contracts Modified: - CullUniformsGPU expanded 128->272 bytes (HiZ params, viewProj, prevViewProj) - Depth buffer images gain VK_IMAGE_USAGE_SAMPLED_BIT for HiZ reads - wmo.frag.glsl: interior branch before unlit, shadow skip for 0x2000 - Render graph: hiz_build + compute_cull disabled (run in early compute) - .gitignore: ignore compiled .spv binaries - MEGA_BONE_MAX_INSTANCES: 2048 -> 4096 Signed-off-by: Pavel Okhlopkov <pavel.okhlopkov@flant.com>
2026-04-06 16:40:59 +03:00
static_assert(sizeof(CullUniformsGPU) == 272, "CullUniformsGPU must be 272 bytes (std140)");
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
// Descriptor set layout: binding 0 = UBO (frustum+camera), 1 = SSBO (input), 2 = SSBO (output)
VkDescriptorSetLayoutBinding bindings[3] = {};
bindings[0].binding = 0;
bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[0].descriptorCount = 1;
bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
bindings[1].binding = 1;
bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
bindings[1].descriptorCount = 1;
bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
bindings[2].binding = 2;
bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
bindings[2].descriptorCount = 1;
bindings[2].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
VkDescriptorSetLayoutCreateInfo layoutCi{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO};
layoutCi.bindingCount = 3;
layoutCi.pBindings = bindings;
vkCreateDescriptorSetLayout(device, &layoutCi, nullptr, &cullSetLayout_);
// Pipeline layout (no push constants — everything via UBO)
VkPipelineLayoutCreateInfo plCi{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO};
plCi.setLayoutCount = 1;
plCi.pSetLayouts = &cullSetLayout_;
vkCreatePipelineLayout(device, &plCi, nullptr, &cullPipelineLayout_);
// Load compute shader
rendering::VkShaderModule cullComp;
if (!cullComp.loadFromFile(device, "assets/shaders/m2_cull.comp.spv")) {
LOG_ERROR("M2Renderer: failed to load m2_cull.comp.spv — GPU culling disabled");
} else {
VkComputePipelineCreateInfo cpCi{VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO};
cpCi.stage = cullComp.stageInfo(VK_SHADER_STAGE_COMPUTE_BIT);
cpCi.layout = cullPipelineLayout_;
if (vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, &cpCi, nullptr, &cullPipeline_) != VK_SUCCESS) {
LOG_ERROR("M2Renderer: failed to create cull compute pipeline");
cullPipeline_ = VK_NULL_HANDLE;
}
cullComp.destroy();
}
feat(rendering): add HiZ occlusion culling & fix WMO interior shadows Implement GPU-driven Hierarchical-Z occlusion culling for M2 doodads using a depth pyramid built from the previous frame's depth buffer. The cull shader projects bounding spheres via prevViewProj (temporal reprojection) and samples the HiZ pyramid to reject hidden objects before the main render pass. Key implementation details: - Separate early compute submission (beginSingleTimeCommands + fence wait) eliminates 2-frame visibility staleness - Conservative safeguards prevent false culls: screen-edge guard, full VP row-vector AABB projection (Cauchy-Schwarz), 50% sphere inflation, depth bias, mip+1, min screen size threshold, camera motion dampening (auto-disable on fast rotations), and per-instance previouslyVisible flag tracking - Graceful fallback to frustum-only culling if HiZ init fails Fix dark WMO interiors by gating shadow map sampling on isInterior==0 in the WMO fragment shader. Interior groups (flag 0x2000) now rely solely on pre-baked MOCV vertex-color lighting + MOHD ambient color. Disable interiorDarken globally (was incorrectly darkening outdoor M2s when camera was inside a WMO). Use isInsideInteriorWMO() instead of isInsideWMO() for correct indoor detection. 
New files: - hiz_system.hpp/cpp: pyramid image management, compute pipeline, descriptors, mip-chain build dispatch, resize handling - hiz_build.comp.glsl: MAX-depth 2x2 reduction compute shader - m2_cull_hiz.comp.glsl: frustum + HiZ occlusion cull compute shader - test_indoor_shadows.cpp: 14 unit tests for shadow/interior contracts Modified: - CullUniformsGPU expanded 128->272 bytes (HiZ params, viewProj, prevViewProj) - Depth buffer images gain VK_IMAGE_USAGE_SAMPLED_BIT for HiZ reads - wmo.frag.glsl: interior branch before unlit, shadow skip for 0x2000 - Render graph: hiz_build + compute_cull disabled (run in early compute) - .gitignore: ignore compiled .spv binaries - MEGA_BONE_MAX_INSTANCES: 2048 -> 4096 Signed-off-by: Pavel Okhlopkov <pavel.okhlopkov@flant.com>
2026-04-06 16:40:59 +03:00
// HiZ-aware cull pipeline (Phase 6.3 Option B)
// Uses set 0 (same as frustum-only) + set 1 (HiZ pyramid sampler from HiZSystem).
// The HiZ descriptor set layout is created lazily when hizSystem_ is set, but the
// pipeline layout and shader are created now if the shader is available.
rendering::VkShaderModule cullHiZComp;
if (cullHiZComp.loadFromFile(device, "assets/shaders/m2_cull_hiz.comp.spv")) {
// HiZ cull set 1 layout: single combined image sampler (the HiZ pyramid)
VkDescriptorSetLayoutBinding hizBinding{};
hizBinding.binding = 0;
hizBinding.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
hizBinding.descriptorCount = 1;
hizBinding.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
VkDescriptorSetLayout hizSamplerLayout = VK_NULL_HANDLE;
VkDescriptorSetLayoutCreateInfo hizLayoutCi{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO};
hizLayoutCi.bindingCount = 1;
hizLayoutCi.pBindings = &hizBinding;
vkCreateDescriptorSetLayout(device, &hizLayoutCi, nullptr, &hizSamplerLayout);
VkDescriptorSetLayout hizSetLayouts[2] = {cullSetLayout_, hizSamplerLayout};
VkPipelineLayoutCreateInfo hizPlCi{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO};
hizPlCi.setLayoutCount = 2;
hizPlCi.pSetLayouts = hizSetLayouts;
vkCreatePipelineLayout(device, &hizPlCi, nullptr, &cullHiZPipelineLayout_);
VkComputePipelineCreateInfo hizCpCi{VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO};
hizCpCi.stage = cullHiZComp.stageInfo(VK_SHADER_STAGE_COMPUTE_BIT);
hizCpCi.layout = cullHiZPipelineLayout_;
if (vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, &hizCpCi, nullptr, &cullHiZPipeline_) != VK_SUCCESS) {
LOG_WARNING("M2Renderer: failed to create HiZ cull compute pipeline — HiZ disabled");
cullHiZPipeline_ = VK_NULL_HANDLE;
vkDestroyPipelineLayout(device, cullHiZPipelineLayout_, nullptr);
cullHiZPipelineLayout_ = VK_NULL_HANDLE;
} else {
LOG_INFO("M2Renderer: HiZ occlusion cull pipeline created");
}
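// hizSamplerLayout is only needed while vkCreatePipelineLayout runs: under Vulkan's
// object-lifetime rules, a descriptor set layout passed in to create another object
// is not accessed by that object after the creating call returns. The pipeline layout
// therefore holds no reference to it, so it is not tracked as a member and is destroyed here.
// NOTE(review): descriptor sets bound at set 1 must be allocated from a compatible
// layout owned elsewhere (presumably HiZSystem) — confirm.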
vkDestroyDescriptorSetLayout(device, hizSamplerLayout, nullptr);
cullHiZComp.destroy();
} else {
LOG_INFO("M2Renderer: m2_cull_hiz.comp.spv not found — HiZ occlusion culling not available");
}
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
// Descriptor pool: 2 sets × 3 descriptors each (1 UBO + 2 SSBO)
VkDescriptorPoolSize poolSizes[2] = {};
poolSizes[0] = {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 2};
poolSizes[1] = {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 4}; // 2 input + 2 output
VkDescriptorPoolCreateInfo poolCi{VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO};
poolCi.maxSets = 2;
poolCi.poolSizeCount = 2;
poolCi.pPoolSizes = poolSizes;
vkCreateDescriptorPool(device, &poolCi, nullptr, &cullDescPool_);
const VkDeviceSize uniformSize = sizeof(CullUniformsGPU);
const VkDeviceSize inputSize = MAX_CULL_INSTANCES * sizeof(CullInstanceGPU);
const VkDeviceSize outputSize = MAX_CULL_INSTANCES * sizeof(uint32_t);
for (int i = 0; i < 2; i++) {
// Uniform buffer (frustum planes + camera)
{
VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
bci.size = uniformSize;
bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
VmaAllocationCreateInfo aci{};
aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
VmaAllocationInfo ai{};
vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
&cullUniformBuffer_[i], &cullUniformAlloc_[i], &ai);
cullUniformMapped_[i] = ai.pMappedData;
}
// Input SSBO (per-instance cull data)
{
VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
bci.size = inputSize;
bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
VmaAllocationCreateInfo aci{};
aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
VmaAllocationInfo ai{};
vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
&cullInputBuffer_[i], &cullInputAlloc_[i], &ai);
cullInputMapped_[i] = ai.pMappedData;
}
// Output SSBO (visibility flags — GPU writes, CPU reads)
{
VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
bci.size = outputSize;
bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
VmaAllocationCreateInfo aci{};
aci.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
VmaAllocationInfo ai{};
vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
&cullOutputBuffer_[i], &cullOutputAlloc_[i], &ai);
cullOutputMapped_[i] = ai.pMappedData;
}
// Allocate and write descriptor set
VkDescriptorSetAllocateInfo setAi{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
setAi.descriptorPool = cullDescPool_;
setAi.descriptorSetCount = 1;
setAi.pSetLayouts = &cullSetLayout_;
vkAllocateDescriptorSets(device, &setAi, &cullSet_[i]);
VkDescriptorBufferInfo uboInfo{cullUniformBuffer_[i], 0, uniformSize};
VkDescriptorBufferInfo inputInfo{cullInputBuffer_[i], 0, inputSize};
VkDescriptorBufferInfo outputInfo{cullOutputBuffer_[i], 0, outputSize};
VkWriteDescriptorSet writes[3] = {};
writes[0] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
writes[0].dstSet = cullSet_[i];
writes[0].dstBinding = 0;
writes[0].descriptorCount = 1;
writes[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
writes[0].pBufferInfo = &uboInfo;
writes[1] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
writes[1].dstSet = cullSet_[i];
writes[1].dstBinding = 1;
writes[1].descriptorCount = 1;
writes[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
writes[1].pBufferInfo = &inputInfo;
writes[2] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
writes[2].dstSet = cullSet_[i];
writes[2].dstBinding = 2;
writes[2].descriptorCount = 1;
writes[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
writes[2].pBufferInfo = &outputInfo;
vkUpdateDescriptorSets(device, 3, writes, 0, nullptr);
}
}
// --- Pipeline layouts ---
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
// Main M2 pipeline layout: set 0 = perFrame, set 1 = material, set 2 = bones, set 3 = instances
// Push constant: int texCoordSet + int isFoliage + int instanceDataOffset (12 bytes)
{
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
VkDescriptorSetLayout setLayouts[] = {perFrameLayout, materialSetLayout_, boneSetLayout_, instanceSetLayout_};
VkPushConstantRange pushRange{};
pushRange.stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
pushRange.offset = 0;
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
pushRange.size = 12; // int texCoordSet + int isFoliage + int instanceDataOffset
VkPipelineLayoutCreateInfo ci{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO};
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
ci.setLayoutCount = 4;
ci.pSetLayouts = setLayouts;
ci.pushConstantRangeCount = 1;
ci.pPushConstantRanges = &pushRange;
vkCreatePipelineLayout(device, &ci, nullptr, &pipelineLayout_);
}
// Particle pipeline layout: set 0 = perFrame, set 1 = particleTex
// Push constant: vec2 tileCount + int alphaKey (12 bytes)
{
VkDescriptorSetLayout setLayouts[] = {perFrameLayout, particleTexLayout_};
VkPushConstantRange pushRange{};
pushRange.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
pushRange.offset = 0;
pushRange.size = 12; // vec2 + int
VkPipelineLayoutCreateInfo ci{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO};
ci.setLayoutCount = 2;
ci.pSetLayouts = setLayouts;
ci.pushConstantRangeCount = 1;
ci.pPushConstantRanges = &pushRange;
vkCreatePipelineLayout(device, &ci, nullptr, &particlePipelineLayout_);
}
// Smoke pipeline layout: set 0 = perFrame
// Push constant: float screenHeight (4 bytes)
{
VkDescriptorSetLayout setLayouts[] = {perFrameLayout};
VkPushConstantRange pushRange{};
pushRange.stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
pushRange.offset = 0;
pushRange.size = 4;
VkPipelineLayoutCreateInfo ci{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO};
ci.setLayoutCount = 1;
ci.pSetLayouts = setLayouts;
ci.pushConstantRangeCount = 1;
ci.pPushConstantRanges = &pushRange;
vkCreatePipelineLayout(device, &ci, nullptr, &smokePipelineLayout_);
}
// --- Load shaders ---
rendering::VkShaderModule m2Vert, m2Frag;
rendering::VkShaderModule particleVert, particleFrag;
rendering::VkShaderModule smokeVert, smokeFrag;
(void)m2Vert.loadFromFile(device, "assets/shaders/m2.vert.spv");
(void)m2Frag.loadFromFile(device, "assets/shaders/m2.frag.spv");
(void)particleVert.loadFromFile(device, "assets/shaders/m2_particle.vert.spv");
(void)particleFrag.loadFromFile(device, "assets/shaders/m2_particle.frag.spv");
(void)smokeVert.loadFromFile(device, "assets/shaders/m2_smoke.vert.spv");
(void)smokeFrag.loadFromFile(device, "assets/shaders/m2_smoke.frag.spv");
if (!m2Vert.isValid() || !m2Frag.isValid()) {
LOG_ERROR("M2: Missing required shaders, cannot initialize");
return false;
}
VkRenderPass mainPass = vkCtx_->getImGuiRenderPass();
// --- Build M2 model pipelines ---
// Vertex input: 18 floats = 72 bytes stride
// loc 0: vec3 pos (0), loc 1: vec3 normal (12), loc 2: vec2 uv0 (24),
// loc 5: vec2 uv1 (32), loc 3: vec4 boneWeights (40), loc 4: vec4 boneIndices (56)
VkVertexInputBindingDescription m2Binding{};
m2Binding.binding = 0;
m2Binding.stride = 18 * sizeof(float);
m2Binding.inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
std::vector<VkVertexInputAttributeDescription> m2Attrs = {
{0, 0, VK_FORMAT_R32G32B32_SFLOAT, 0}, // position
{1, 0, VK_FORMAT_R32G32B32_SFLOAT, 3 * sizeof(float)}, // normal
{2, 0, VK_FORMAT_R32G32_SFLOAT, 6 * sizeof(float)}, // texCoord0
{5, 0, VK_FORMAT_R32G32_SFLOAT, 8 * sizeof(float)}, // texCoord1
{3, 0, VK_FORMAT_R32G32B32A32_SFLOAT, 10 * sizeof(float)}, // boneWeights
{4, 0, VK_FORMAT_R32G32B32A32_SFLOAT, 14 * sizeof(float)}, // boneIndices (float)
};
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
// Pipeline derivatives — opaque is the base, others derive from it for shared state optimization
auto buildM2Pipeline = [&](VkPipelineColorBlendAttachmentState blendState, bool depthWrite,
VkPipelineCreateFlags flags = 0, VkPipeline basePipeline = VK_NULL_HANDLE) -> VkPipeline {
return PipelineBuilder()
.setShaders(m2Vert.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
m2Frag.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
.setVertexInput({m2Binding}, m2Attrs)
.setTopology(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST)
.setRasterization(VK_POLYGON_MODE_FILL, VK_CULL_MODE_NONE)
.setDepthTest(true, depthWrite, VK_COMPARE_OP_LESS_OR_EQUAL)
.setColorBlendAttachment(blendState)
.setMultisample(vkCtx_->getMsaaSamples())
.setLayout(pipelineLayout_)
.setRenderPass(mainPass)
.setDynamicStates({VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR})
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
.setFlags(flags)
.setBasePipeline(basePipeline)
.build(device, vkCtx_->getPipelineCache());
};
opaquePipeline_ = buildM2Pipeline(PipelineBuilder::blendDisabled(), true,
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT);
alphaTestPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), true,
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_);
alphaPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), false,
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_);
additivePipeline_ = buildM2Pipeline(PipelineBuilder::blendAdditive(), false,
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_);
// --- Build particle pipelines ---
if (particleVert.isValid() && particleFrag.isValid()) {
VkVertexInputBindingDescription pBind{};
pBind.binding = 0;
pBind.stride = 9 * sizeof(float); // pos3 + color4 + size1 + tile1
pBind.inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
std::vector<VkVertexInputAttributeDescription> pAttrs = {
{0, 0, VK_FORMAT_R32G32B32_SFLOAT, 0}, // position
{1, 0, VK_FORMAT_R32G32B32A32_SFLOAT, 3 * sizeof(float)}, // color
{2, 0, VK_FORMAT_R32_SFLOAT, 7 * sizeof(float)}, // size
{3, 0, VK_FORMAT_R32_SFLOAT, 8 * sizeof(float)}, // tile
};
auto buildParticlePipeline = [&](VkPipelineColorBlendAttachmentState blend) -> VkPipeline {
return PipelineBuilder()
.setShaders(particleVert.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
particleFrag.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
.setVertexInput({pBind}, pAttrs)
.setTopology(VK_PRIMITIVE_TOPOLOGY_POINT_LIST)
.setRasterization(VK_POLYGON_MODE_FILL, VK_CULL_MODE_NONE)
.setDepthTest(true, false, VK_COMPARE_OP_LESS_OR_EQUAL)
.setColorBlendAttachment(blend)
.setMultisample(vkCtx_->getMsaaSamples())
.setLayout(particlePipelineLayout_)
.setRenderPass(mainPass)
.setDynamicStates({VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR})
.build(device, vkCtx_->getPipelineCache());
};
particlePipeline_ = buildParticlePipeline(PipelineBuilder::blendAlpha());
particleAdditivePipeline_ = buildParticlePipeline(PipelineBuilder::blendAdditive());
}
// --- Build smoke pipeline ---
if (smokeVert.isValid() && smokeFrag.isValid()) {
VkVertexInputBindingDescription sBind{};
sBind.binding = 0;
sBind.stride = 6 * sizeof(float); // pos3 + lifeRatio1 + size1 + isSpark1
sBind.inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
std::vector<VkVertexInputAttributeDescription> sAttrs = {
{0, 0, VK_FORMAT_R32G32B32_SFLOAT, 0}, // position
{1, 0, VK_FORMAT_R32_SFLOAT, 3 * sizeof(float)}, // lifeRatio
{2, 0, VK_FORMAT_R32_SFLOAT, 4 * sizeof(float)}, // size
{3, 0, VK_FORMAT_R32_SFLOAT, 5 * sizeof(float)}, // isSpark
};
smokePipeline_ = PipelineBuilder()
.setShaders(smokeVert.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
smokeFrag.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
.setVertexInput({sBind}, sAttrs)
.setTopology(VK_PRIMITIVE_TOPOLOGY_POINT_LIST)
.setRasterization(VK_POLYGON_MODE_FILL, VK_CULL_MODE_NONE)
.setDepthTest(true, false, VK_COMPARE_OP_LESS_OR_EQUAL)
.setColorBlendAttachment(PipelineBuilder::blendAlpha())
.setMultisample(vkCtx_->getMsaaSamples())
.setLayout(smokePipelineLayout_)
.setRenderPass(mainPass)
.setDynamicStates({VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR})
.build(device, vkCtx_->getPipelineCache());
}
// --- Build ribbon pipelines ---
// Vertex format: pos(3) + color(3) + alpha(1) + uv(2) = 9 floats = 36 bytes
{
rendering::VkShaderModule ribVert, ribFrag;
(void)ribVert.loadFromFile(device, "assets/shaders/m2_ribbon.vert.spv");
(void)ribFrag.loadFromFile(device, "assets/shaders/m2_ribbon.frag.spv");
if (ribVert.isValid() && ribFrag.isValid()) {
// Reuse particleTexLayout_ for set 1 (single texture sampler)
VkDescriptorSetLayout ribLayouts[] = {perFrameLayout, particleTexLayout_};
VkPipelineLayoutCreateInfo lci{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO};
lci.setLayoutCount = 2;
lci.pSetLayouts = ribLayouts;
vkCreatePipelineLayout(device, &lci, nullptr, &ribbonPipelineLayout_);
VkVertexInputBindingDescription rBind{};
rBind.binding = 0;
rBind.stride = 9 * sizeof(float);
rBind.inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
std::vector<VkVertexInputAttributeDescription> rAttrs = {
{0, 0, VK_FORMAT_R32G32B32_SFLOAT, 0}, // pos
{1, 0, VK_FORMAT_R32G32B32_SFLOAT, 3 * sizeof(float)}, // color
{2, 0, VK_FORMAT_R32_SFLOAT, 6 * sizeof(float)}, // alpha
{3, 0, VK_FORMAT_R32G32_SFLOAT, 7 * sizeof(float)}, // uv
};
auto buildRibbonPipeline = [&](VkPipelineColorBlendAttachmentState blend) -> VkPipeline {
return PipelineBuilder()
.setShaders(ribVert.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
ribFrag.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
.setVertexInput({rBind}, rAttrs)
.setTopology(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP)
.setRasterization(VK_POLYGON_MODE_FILL, VK_CULL_MODE_NONE)
.setDepthTest(true, false, VK_COMPARE_OP_LESS_OR_EQUAL)
.setColorBlendAttachment(blend)
.setMultisample(vkCtx_->getMsaaSamples())
.setLayout(ribbonPipelineLayout_)
.setRenderPass(mainPass)
.setDynamicStates({VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR})
.build(device, vkCtx_->getPipelineCache());
};
ribbonPipeline_ = buildRibbonPipeline(PipelineBuilder::blendAlpha());
ribbonAdditivePipeline_ = buildRibbonPipeline(PipelineBuilder::blendAdditive());
}
ribVert.destroy(); ribFrag.destroy();
}
// Clean up shader modules
m2Vert.destroy(); m2Frag.destroy();
particleVert.destroy(); particleFrag.destroy();
smokeVert.destroy(); smokeFrag.destroy();
// --- Create dynamic particle buffers (mapped for CPU writes) ---
{
VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
bci.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
VmaAllocationCreateInfo aci{};
aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
VmaAllocationInfo allocInfo{};
// Smoke particle buffer
bci.size = MAX_SMOKE_PARTICLES * 6 * sizeof(float);
vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &smokeVB_, &smokeVBAlloc_, &allocInfo);
smokeVBMapped_ = allocInfo.pMappedData;
// M2 particle buffer
bci.size = MAX_M2_PARTICLES * 9 * sizeof(float);
vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &m2ParticleVB_, &m2ParticleVBAlloc_, &allocInfo);
m2ParticleVBMapped_ = allocInfo.pMappedData;
// Dedicated glow sprite buffer (separate from particle VB to avoid data race)
bci.size = MAX_GLOW_SPRITES * 9 * sizeof(float);
vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &glowVB_, &glowVBAlloc_, &allocInfo);
glowVBMapped_ = allocInfo.pMappedData;
// Ribbon vertex buffer — triangle strip: pos(3)+color(3)+alpha(1)+uv(2)=9 floats/vert
bci.size = MAX_RIBBON_VERTS * 9 * sizeof(float);
vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &ribbonVB_, &ribbonVBAlloc_, &allocInfo);
ribbonVBMapped_ = allocInfo.pMappedData;
}
// --- Create white fallback texture ---
{
uint8_t white[] = {255, 255, 255, 255};
whiteTexture_ = std::make_unique<VkTexture>();
whiteTexture_->upload(*vkCtx_, white, 1, 1, VK_FORMAT_R8G8B8A8_UNORM);
whiteTexture_->createSampler(device, VK_FILTER_LINEAR, VK_FILTER_LINEAR, VK_SAMPLER_ADDRESS_MODE_REPEAT);
}
// --- Generate soft radial gradient glow texture ---
{
static constexpr int SZ = 64;
std::vector<uint8_t> px(SZ * SZ * 4);
float half = SZ / 2.0f;
for (int y = 0; y < SZ; y++) {
for (int x = 0; x < SZ; x++) {
float dx = (x + 0.5f - half) / half;
float dy = (y + 0.5f - half) / half;
float r = std::sqrt(dx * dx + dy * dy);
float a = std::max(0.0f, 1.0f - r);
a = a * a; // Quadratic falloff
int idx = (y * SZ + x) * 4;
px[idx + 0] = 255;
px[idx + 1] = 255;
px[idx + 2] = 255;
px[idx + 3] = static_cast<uint8_t>(a * 255);
}
}
glowTexture_ = std::make_unique<VkTexture>();
glowTexture_->upload(*vkCtx_, px.data(), SZ, SZ, VK_FORMAT_R8G8B8A8_UNORM);
glowTexture_->createSampler(device, VK_FILTER_LINEAR, VK_FILTER_LINEAR, VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE);
// Pre-allocate glow texture descriptor set (reused every frame)
if (particleTexLayout_ && materialDescPool_) {
VkDescriptorSetAllocateInfo ai{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
ai.descriptorPool = materialDescPool_;
ai.descriptorSetCount = 1;
ai.pSetLayouts = &particleTexLayout_;
if (vkAllocateDescriptorSets(device, &ai, &glowTexDescSet_) == VK_SUCCESS) {
VkDescriptorImageInfo imgInfo = glowTexture_->descriptorInfo();
VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
write.dstSet = glowTexDescSet_;
write.dstBinding = 0;
write.descriptorCount = 1;
write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
write.pImageInfo = &imgInfo;
vkUpdateDescriptorSets(device, 1, &write, 0, nullptr);
}
}
}
textureCacheBudgetBytes_ =
envSizeMBOrDefault("WOWEE_M2_TEX_CACHE_MB", 4096) * 1024ull * 1024ull;
modelCacheLimit_ = envSizeMBOrDefault("WOWEE_M2_MODEL_LIMIT", 6000);
LOG_INFO("M2 texture cache budget: ", textureCacheBudgetBytes_ / (1024 * 1024), " MB");
LOG_INFO("M2 model cache limit: ", modelCacheLimit_);
LOG_INFO("M2 renderer initialized (Vulkan)");
initialized_ = true;
return true;
}
feat(rendering): add HiZ occlusion culling & fix WMO interior shadows Implement GPU-driven Hierarchical-Z occlusion culling for M2 doodads using a depth pyramid built from the previous frame's depth buffer. The cull shader projects bounding spheres via prevViewProj (temporal reprojection) and samples the HiZ pyramid to reject hidden objects before the main render pass. Key implementation details: - Separate early compute submission (beginSingleTimeCommands + fence wait) eliminates 2-frame visibility staleness - Conservative safeguards prevent false culls: screen-edge guard, full VP row-vector AABB projection (Cauchy-Schwarz), 50% sphere inflation, depth bias, mip+1, min screen size threshold, camera motion dampening (auto-disable on fast rotations), and per-instance previouslyVisible flag tracking - Graceful fallback to frustum-only culling if HiZ init fails Fix dark WMO interiors by gating shadow map sampling on isInterior==0 in the WMO fragment shader. Interior groups (flag 0x2000) now rely solely on pre-baked MOCV vertex-color lighting + MOHD ambient color. Disable interiorDarken globally (was incorrectly darkening outdoor M2s when camera was inside a WMO). Use isInsideInteriorWMO() instead of isInsideWMO() for correct indoor detection. 
New files: - hiz_system.hpp/cpp: pyramid image management, compute pipeline, descriptors, mip-chain build dispatch, resize handling - hiz_build.comp.glsl: MAX-depth 2x2 reduction compute shader - m2_cull_hiz.comp.glsl: frustum + HiZ occlusion cull compute shader - test_indoor_shadows.cpp: 14 unit tests for shadow/interior contracts Modified: - CullUniformsGPU expanded 128->272 bytes (HiZ params, viewProj, prevViewProj) - Depth buffer images gain VK_IMAGE_USAGE_SAMPLED_BIT for HiZ reads - wmo.frag.glsl: interior branch before unlit, shadow skip for 0x2000 - Render graph: hiz_build + compute_cull disabled (run in early compute) - .gitignore: ignore compiled .spv binaries - MEGA_BONE_MAX_INSTANCES: 2048 -> 4096 Signed-off-by: Pavel Okhlopkov <pavel.okhlopkov@flant.com>
2026-04-06 16:40:59 +03:00
void M2Renderer::invalidateCullOutput(uint32_t frameIndex) {
    // Only frame slots 0 and 1 are accepted; any other index is a no-op.
    if (frameIndex >= 2) return;

    // Skip slots that have no cull-output allocation.
    const auto& slotAlloc = cullOutputAlloc_[frameIndex];
    if (!slotAlloc) return;

    // VMA-mapped GPU→CPU buffers living in non-HOST_COHERENT memory must be
    // explicitly invalidated, otherwise the CPU cache may not observe the
    // most recent GPU writes. Invalidate the whole allocation.
    vmaInvalidateAllocation(vkCtx_->getAllocator(), slotAlloc, 0, VK_WHOLE_SIZE);
}
void M2Renderer::shutdown() {
LOG_INFO("Shutting down M2 renderer...");
if (!vkCtx_) return;
vkDeviceWaitIdle(vkCtx_->getDevice());
VkDevice device = vkCtx_->getDevice();
VmaAllocator alloc = vkCtx_->getAllocator();
// Delete model GPU resources
for (auto& [id, model] : models) {
destroyModelGPU(model);
}
models.clear();
// Destroy instance bone buffers
for (auto& inst : instances) {
destroyInstanceBones(inst);
}
instances.clear();
spatialGrid.clear();
instanceIndexById.clear();
instanceDedupMap_.clear();
// Delete cached textures
textureCache.clear();
textureCacheBytes_ = 0;
textureCacheCounter_ = 0;
texturePropsByPtr_.clear();
failedTextureCache_.clear();
failedTextureRetryAt_.clear();
loggedTextureLoadFails_.clear();
textureLookupSerial_ = 0;
textureBudgetRejectWarnings_ = 0;
whiteTexture_.reset();
glowTexture_.reset();
// Clean up particle/ribbon buffers
if (smokeVB_) { vmaDestroyBuffer(alloc, smokeVB_, smokeVBAlloc_); smokeVB_ = VK_NULL_HANDLE; }
if (m2ParticleVB_) { vmaDestroyBuffer(alloc, m2ParticleVB_, m2ParticleVBAlloc_); m2ParticleVB_ = VK_NULL_HANDLE; }
if (glowVB_) { vmaDestroyBuffer(alloc, glowVB_, glowVBAlloc_); glowVB_ = VK_NULL_HANDLE; }
if (ribbonVB_) { vmaDestroyBuffer(alloc, ribbonVB_, ribbonVBAlloc_); ribbonVB_ = VK_NULL_HANDLE; }
smokeParticles.clear();
// Destroy pipelines
auto destroyPipeline = [&](VkPipeline& p) { if (p) { vkDestroyPipeline(device, p, nullptr); p = VK_NULL_HANDLE; } };
destroyPipeline(opaquePipeline_);
destroyPipeline(alphaTestPipeline_);
destroyPipeline(alphaPipeline_);
destroyPipeline(additivePipeline_);
destroyPipeline(particlePipeline_);
destroyPipeline(particleAdditivePipeline_);
destroyPipeline(smokePipeline_);
destroyPipeline(ribbonPipeline_);
destroyPipeline(ribbonAdditivePipeline_);
if (pipelineLayout_) { vkDestroyPipelineLayout(device, pipelineLayout_, nullptr); pipelineLayout_ = VK_NULL_HANDLE; }
if (particlePipelineLayout_) { vkDestroyPipelineLayout(device, particlePipelineLayout_, nullptr); particlePipelineLayout_ = VK_NULL_HANDLE; }
if (smokePipelineLayout_) { vkDestroyPipelineLayout(device, smokePipelineLayout_, nullptr); smokePipelineLayout_ = VK_NULL_HANDLE; }
if (ribbonPipelineLayout_) { vkDestroyPipelineLayout(device, ribbonPipelineLayout_, nullptr); ribbonPipelineLayout_ = VK_NULL_HANDLE; }
// Destroy descriptor pools and layouts
2026-03-22 21:47:12 +03:00
if (dummyBoneBuffer_) { vmaDestroyBuffer(alloc, dummyBoneBuffer_, dummyBoneAlloc_); dummyBoneBuffer_ = VK_NULL_HANDLE; }
// dummyBoneSet_ is freed implicitly when boneDescPool_ is destroyed
dummyBoneSet_ = VK_NULL_HANDLE;
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
// Mega bone SSBO cleanup (sets freed implicitly with boneDescPool_)
for (int i = 0; i < 2; i++) {
if (megaBoneBuffer_[i]) { vmaDestroyBuffer(alloc, megaBoneBuffer_[i], megaBoneAlloc_[i]); megaBoneBuffer_[i] = VK_NULL_HANDLE; }
megaBoneMapped_[i] = nullptr;
megaBoneSet_[i] = VK_NULL_HANDLE;
}
if (materialDescPool_) { vkDestroyDescriptorPool(device, materialDescPool_, nullptr); materialDescPool_ = VK_NULL_HANDLE; }
if (boneDescPool_) { vkDestroyDescriptorPool(device, boneDescPool_, nullptr); boneDescPool_ = VK_NULL_HANDLE; }
feat(animation): decompose AnimationController into FSM-based architecture Replace the 2,200-line monolithic AnimationController (goto-driven, single class, untestable) with a composed FSM architecture per refactor.md. New subsystem (src/rendering/animation/ — 16 headers, 10 sources): - CharacterAnimator: FSM composer implementing ICharacterAnimator - LocomotionFSM: idle/walk/run/sprint/jump/swim/strafe - CombatFSM: melee/ranged/spell cast/stun/hit reaction/charge - ActivityFSM: emote/loot/sit-down/sitting/sit-up - MountFSM: idle/run/flight/taxi/fidget/rear-up (per-instance RNG) - AnimCapabilitySet + AnimCapabilityProbe: probe once at model load, eliminate per-frame hasAnimation() linear search - AnimationManager: registry of CharacterAnimator by GUID - EmoteRegistry: DBC-backed emote command → animId singleton - FootstepDriver, SfxStateDriver: extracted from AnimationController animation_ids.hpp/.cpp moved to animation/ subdirectory (452 named constants); all include paths updated. AnimationController retained as thin adapter (~400 LOC): collects FrameInput, delegates to CharacterAnimator, applies AnimOutput. Priority order: Mount > Stun > HitReaction > Spell > Charge > Melee/Ranged > CombatIdle > Emote > Loot > Sit > Locomotion. STAY_IN_STATE policy when all FSMs return valid=false. Bugs fixed: - Remove static mt19937 in mount fidget (shared state across all mounted units) — replaced with per-instance seeded RNG - Remove goto from mounted animation branch (skipped init) - Remove per-frame hasAnimation() calls (now one probe at load) - Fix VK_INDEX_TYPE_UINT16 → UINT32 in shadow pass Tests (4 new suites, all ASAN+UBSan clean): - test_locomotion_fsm: 167 assertions - test_combat_fsm: 125 cases - test_activity_fsm: 112 cases - test_anim_capability: 56 cases docs/ANIMATION_SYSTEM.md added (architecture reference).
2026-04-05 12:27:35 +03:00
// Instance data SSBO cleanup (sets freed with instanceDescPool_)
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
for (int i = 0; i < 2; i++) {
if (instanceBuffer_[i]) { vmaDestroyBuffer(alloc, instanceBuffer_[i], instanceAlloc_[i]); instanceBuffer_[i] = VK_NULL_HANDLE; }
instanceMapped_[i] = nullptr;
instanceSet_[i] = VK_NULL_HANDLE;
}
if (instanceDescPool_) { vkDestroyDescriptorPool(device, instanceDescPool_, nullptr); instanceDescPool_ = VK_NULL_HANDLE; }
feat(animation): decompose AnimationController into FSM-based architecture Replace the 2,200-line monolithic AnimationController (goto-driven, single class, untestable) with a composed FSM architecture per refactor.md. New subsystem (src/rendering/animation/ — 16 headers, 10 sources): - CharacterAnimator: FSM composer implementing ICharacterAnimator - LocomotionFSM: idle/walk/run/sprint/jump/swim/strafe - CombatFSM: melee/ranged/spell cast/stun/hit reaction/charge - ActivityFSM: emote/loot/sit-down/sitting/sit-up - MountFSM: idle/run/flight/taxi/fidget/rear-up (per-instance RNG) - AnimCapabilitySet + AnimCapabilityProbe: probe once at model load, eliminate per-frame hasAnimation() linear search - AnimationManager: registry of CharacterAnimator by GUID - EmoteRegistry: DBC-backed emote command → animId singleton - FootstepDriver, SfxStateDriver: extracted from AnimationController animation_ids.hpp/.cpp moved to animation/ subdirectory (452 named constants); all include paths updated. AnimationController retained as thin adapter (~400 LOC): collects FrameInput, delegates to CharacterAnimator, applies AnimOutput. Priority order: Mount > Stun > HitReaction > Spell > Charge > Melee/Ranged > CombatIdle > Emote > Loot > Sit > Locomotion. STAY_IN_STATE policy when all FSMs return valid=false. Bugs fixed: - Remove static mt19937 in mount fidget (shared state across all mounted units) — replaced with per-instance seeded RNG - Remove goto from mounted animation branch (skipped init) - Remove per-frame hasAnimation() calls (now one probe at load) - Fix VK_INDEX_TYPE_UINT16 → UINT32 in shadow pass Tests (4 new suites, all ASAN+UBSan clean): - test_locomotion_fsm: 167 assertions - test_combat_fsm: 125 cases - test_activity_fsm: 112 cases - test_anim_capability: 56 cases docs/ANIMATION_SYSTEM.md added (architecture reference).
2026-04-05 12:27:35 +03:00
// GPU frustum culling compute pipeline + buffers cleanup
feat(rendering): add HiZ occlusion culling & fix WMO interior shadows Implement GPU-driven Hierarchical-Z occlusion culling for M2 doodads using a depth pyramid built from the previous frame's depth buffer. The cull shader projects bounding spheres via prevViewProj (temporal reprojection) and samples the HiZ pyramid to reject hidden objects before the main render pass. Key implementation details: - Separate early compute submission (beginSingleTimeCommands + fence wait) eliminates 2-frame visibility staleness - Conservative safeguards prevent false culls: screen-edge guard, full VP row-vector AABB projection (Cauchy-Schwarz), 50% sphere inflation, depth bias, mip+1, min screen size threshold, camera motion dampening (auto-disable on fast rotations), and per-instance previouslyVisible flag tracking - Graceful fallback to frustum-only culling if HiZ init fails Fix dark WMO interiors by gating shadow map sampling on isInterior==0 in the WMO fragment shader. Interior groups (flag 0x2000) now rely solely on pre-baked MOCV vertex-color lighting + MOHD ambient color. Disable interiorDarken globally (was incorrectly darkening outdoor M2s when camera was inside a WMO). Use isInsideInteriorWMO() instead of isInsideWMO() for correct indoor detection. 
New files: - hiz_system.hpp/cpp: pyramid image management, compute pipeline, descriptors, mip-chain build dispatch, resize handling - hiz_build.comp.glsl: MAX-depth 2x2 reduction compute shader - m2_cull_hiz.comp.glsl: frustum + HiZ occlusion cull compute shader - test_indoor_shadows.cpp: 14 unit tests for shadow/interior contracts Modified: - CullUniformsGPU expanded 128->272 bytes (HiZ params, viewProj, prevViewProj) - Depth buffer images gain VK_IMAGE_USAGE_SAMPLED_BIT for HiZ reads - wmo.frag.glsl: interior branch before unlit, shadow skip for 0x2000 - Render graph: hiz_build + compute_cull disabled (run in early compute) - .gitignore: ignore compiled .spv binaries - MEGA_BONE_MAX_INSTANCES: 2048 -> 4096 Signed-off-by: Pavel Okhlopkov <pavel.okhlopkov@flant.com>
2026-04-06 16:40:59 +03:00
if (cullHiZPipeline_) { vkDestroyPipeline(device, cullHiZPipeline_, nullptr); cullHiZPipeline_ = VK_NULL_HANDLE; }
if (cullHiZPipelineLayout_) { vkDestroyPipelineLayout(device, cullHiZPipelineLayout_, nullptr); cullHiZPipelineLayout_ = VK_NULL_HANDLE; }
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
if (cullPipeline_) { vkDestroyPipeline(device, cullPipeline_, nullptr); cullPipeline_ = VK_NULL_HANDLE; }
if (cullPipelineLayout_) { vkDestroyPipelineLayout(device, cullPipelineLayout_, nullptr); cullPipelineLayout_ = VK_NULL_HANDLE; }
for (int i = 0; i < 2; i++) {
if (cullUniformBuffer_[i]) { vmaDestroyBuffer(alloc, cullUniformBuffer_[i], cullUniformAlloc_[i]); cullUniformBuffer_[i] = VK_NULL_HANDLE; }
if (cullInputBuffer_[i]) { vmaDestroyBuffer(alloc, cullInputBuffer_[i], cullInputAlloc_[i]); cullInputBuffer_[i] = VK_NULL_HANDLE; }
if (cullOutputBuffer_[i]) { vmaDestroyBuffer(alloc, cullOutputBuffer_[i], cullOutputAlloc_[i]); cullOutputBuffer_[i] = VK_NULL_HANDLE; }
cullUniformMapped_[i] = cullInputMapped_[i] = cullOutputMapped_[i] = nullptr;
cullSet_[i] = VK_NULL_HANDLE;
}
if (cullDescPool_) { vkDestroyDescriptorPool(device, cullDescPool_, nullptr); cullDescPool_ = VK_NULL_HANDLE; }
if (cullSetLayout_) { vkDestroyDescriptorSetLayout(device, cullSetLayout_, nullptr); cullSetLayout_ = VK_NULL_HANDLE; }
if (materialSetLayout_) { vkDestroyDescriptorSetLayout(device, materialSetLayout_, nullptr); materialSetLayout_ = VK_NULL_HANDLE; }
if (boneSetLayout_) { vkDestroyDescriptorSetLayout(device, boneSetLayout_, nullptr); boneSetLayout_ = VK_NULL_HANDLE; }
feat(rendering): GPU architecture + visual quality fixes M2 GPU instancing - M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max) - Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group - boneBase field indexes into mega bone SSBO via gl_InstanceIndex Indirect terrain drawing - 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer - CPU builds VkDrawIndexedIndirectCommand per visible chunk - Single VB/IB bind per frame; shadow pass reuses mega buffers - Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering GPU frustum culling (compute shader) - m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull - CullInstanceGPU SSBO input, uint visibility[] output, double-buffered - dispatchCullCompute() runs before main pass via render graph node Consolidated bone matrix SSBOs - 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones) - Eliminated per-instance descriptor sets; one megaBoneSet_ per frame - prepareRender() packs bone matrices consecutively into current frame slot Render graph / frame graph - RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort - Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes - Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull - beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd) Pipeline derivatives - PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT - M2 opaque = base; alphaTest/alpha/additive are derivatives - Applied to terrain (wireframe) and WMO (alpha-test) renderers Rendering bug fixes: - fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects - fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far 
range - fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon - fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability - fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4 - fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
2026-04-04 13:43:16 +03:00
if (instanceSetLayout_) { vkDestroyDescriptorSetLayout(device, instanceSetLayout_, nullptr); instanceSetLayout_ = VK_NULL_HANDLE; }
if (particleTexLayout_) { vkDestroyDescriptorSetLayout(device, particleTexLayout_, nullptr); particleTexLayout_ = VK_NULL_HANDLE; }
// Destroy shadow resources
destroyPipeline(shadowPipeline_);
if (shadowPipelineLayout_) { vkDestroyPipelineLayout(device, shadowPipelineLayout_, nullptr); shadowPipelineLayout_ = VK_NULL_HANDLE; }
for (auto& pool : shadowTexPool_) { if (pool) { vkDestroyDescriptorPool(device, pool, nullptr); pool = VK_NULL_HANDLE; } }
if (shadowParamsPool_) { vkDestroyDescriptorPool(device, shadowParamsPool_, nullptr); shadowParamsPool_ = VK_NULL_HANDLE; }
if (shadowParamsLayout_) { vkDestroyDescriptorSetLayout(device, shadowParamsLayout_, nullptr); shadowParamsLayout_ = VK_NULL_HANDLE; }
if (shadowParamsUBO_) { vmaDestroyBuffer(alloc, shadowParamsUBO_, shadowParamsAlloc_); shadowParamsUBO_ = VK_NULL_HANDLE; }
initialized_ = false;
}
// Releases the GPU-side resources held by `model`: vertex/index buffers,
// per-batch material descriptor sets and UBOs, and the particle/ribbon
// texture descriptor sets. Each handle is reset to VK_NULL_HANDLE after it is
// released. Does nothing when no Vulkan context is attached.
void M2Renderer::destroyModelGPU(M2ModelGPU& model) {
    if (!vkCtx_) {
        return;
    }

    VmaAllocator allocator = vkCtx_->getAllocator();

    // Geometry buffers go first.
    if (model.vertexBuffer) {
        vmaDestroyBuffer(allocator, model.vertexBuffer, model.vertexAlloc);
        model.vertexBuffer = VK_NULL_HANDLE;
    }
    if (model.indexBuffer) {
        vmaDestroyBuffer(allocator, model.indexBuffer, model.indexAlloc);
        model.indexBuffer = VK_NULL_HANDLE;
    }

    VkDevice dev = vkCtx_->getDevice();

    // Every descriptor set released here is returned to materialDescPool_.
    const auto releaseSet = [&](VkDescriptorSet& set) {
        if (!set) {
            return;
        }
        vkFreeDescriptorSets(dev, materialDescPool_, 1, &set);
        set = VK_NULL_HANDLE;
    };

    // Per-batch material state: descriptor set first, then its uniform buffer.
    for (auto& entry : model.batches) {
        releaseSet(entry.materialSet);
        if (entry.materialUBO) {
            vmaDestroyBuffer(allocator, entry.materialUBO, entry.materialUBOAlloc);
            entry.materialUBO = VK_NULL_HANDLE;
        }
    }

    // Pre-allocated particle texture descriptor sets.
    for (auto& particleSet : model.particleTexSets) {
        releaseSet(particleSet);
    }
    model.particleTexSets.clear();

    // Ribbon texture descriptor sets.
    for (auto& ribbonSet : model.ribbonTexSets) {
        releaseSet(ribbonSet);
    }
    model.ribbonTexSets.clear();
}
// Releases the per-frame-slot bone descriptor sets and bone buffers owned by
// `inst`. The instance's handles and mapped pointers are cleared right away in
// both modes; only the actual Vulkan/VMA destruction differs:
//   defer == false : destroy immediately (the original note says this is safe
//                    after vkDeviceWaitIdle).
//   defer == true  : queue the destruction via deferAfterAllFrameFences().
// Does nothing when no Vulkan context is attached.
void M2Renderer::destroyInstanceBones(M2Instance& inst, bool defer) {
    if (!vkCtx_) {
        return;
    }

    VmaAllocator allocator = vkCtx_->getAllocator();
    VkDevice dev = vkCtx_->getDevice();

    for (int slot = 0; slot < 2; ++slot) {
        // Copy the handles out before wiping the instance fields; both the
        // immediate and the deferred path work from these snapshots.
        const VkDescriptorSet set = inst.boneSet[slot];
        ::VkBuffer buffer = inst.boneBuffer[slot];
        VmaAllocation memory = inst.boneAlloc[slot];

        inst.boneMapped[slot] = nullptr;
        inst.boneBuffer[slot] = VK_NULL_HANDLE;
        inst.boneSet[slot] = VK_NULL_HANDLE;

        const bool hasSet = (set != VK_NULL_HANDLE);
        const bool hasBuffer = (buffer != VK_NULL_HANDLE);
        if (!hasSet && !hasBuffer) {
            continue;  // nothing was allocated for this slot
        }

        if (defer) {
            // This loop tears down the bone sets of ALL frame slots, so the
            // other slot's command buffer may still be in flight. The release
            // therefore has to wait on every frame fence, not only the
            // current frame's.
            VkDescriptorPool pool = boneDescPool_;
            vkCtx_->deferAfterAllFrameFences(
                [dev, allocator, pool, set, buffer, memory, hasSet, hasBuffer]() {
                    if (hasSet) {
                        vkFreeDescriptorSets(dev, pool, 1, &set);
                    }
                    if (hasBuffer) {
                        vmaDestroyBuffer(allocator, buffer, memory);
                    }
                });
            continue;
        }

        // Immediate destruction.
        if (hasSet) {
            vkFreeDescriptorSets(dev, boneDescPool_, 1, &set);
        }
        if (hasBuffer) {
            vmaDestroyBuffer(allocator, buffer, memory);
        }
    }
}
// Allocates a single descriptor set using materialSetLayout_ from
// materialDescPool_.
// Returns the new set, or VK_NULL_HANDLE (after logging an error) when the
// context, pool or layout is missing or when vkAllocateDescriptorSets fails.
VkDescriptorSet M2Renderer::allocateMaterialSet() {
    // Mirror the guards used elsewhere in this file: never dereference a null
    // context or hand a null pool/layout to Vulkan.
    if (!vkCtx_ || !materialDescPool_ || !materialSetLayout_) {
        LOG_ERROR("M2Renderer: material descriptor set allocation skipped (renderer not initialized)");
        return VK_NULL_HANDLE;
    }

    VkDescriptorSetAllocateInfo ai{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
    ai.descriptorPool = materialDescPool_;
    ai.descriptorSetCount = 1;
    ai.pSetLayouts = &materialSetLayout_;

    VkDescriptorSet set = VK_NULL_HANDLE;
    const VkResult result = vkAllocateDescriptorSets(vkCtx_->getDevice(), &ai, &set);
    if (result != VK_SUCCESS) {
        LOG_ERROR("M2Renderer: material descriptor set allocation failed (", result, ")");
        return VK_NULL_HANDLE;
    }
    return set;
}
// Allocates a single descriptor set using boneSetLayout_ from boneDescPool_.
// Returns the new set, or VK_NULL_HANDLE (after logging an error) when the
// context, pool or layout is missing or when vkAllocateDescriptorSets fails.
VkDescriptorSet M2Renderer::allocateBoneSet() {
    // Mirror the guards used elsewhere in this file: never dereference a null
    // context or hand a null pool/layout to Vulkan.
    if (!vkCtx_ || !boneDescPool_ || !boneSetLayout_) {
        LOG_ERROR("M2Renderer: bone descriptor set allocation skipped (renderer not initialized)");
        return VK_NULL_HANDLE;
    }

    VkDescriptorSetAllocateInfo ai{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
    ai.descriptorPool = boneDescPool_;
    ai.descriptorSetCount = 1;
    ai.pSetLayouts = &boneSetLayout_;

    VkDescriptorSet set = VK_NULL_HANDLE;
    const VkResult result = vkAllocateDescriptorSets(vkCtx_->getDevice(), &ai, &set);
    if (result != VK_SUCCESS) {
        LOG_ERROR("M2Renderer: bone descriptor set allocation failed (", result, ")");
        return VK_NULL_HANDLE;
    }
    return set;
}
// ---------------------------------------------------------------------------
// M2 collision mesh: build spatial grid + classify triangles
// ---------------------------------------------------------------------------
void M2ModelGPU::CollisionMesh::build() {
if (indices.size() < 3 || vertices.empty()) return;
triCount = static_cast<uint32_t>(indices.size() / 3);
// Bounding box for grid
glm::vec3 bmin(std::numeric_limits<float>::max());
glm::vec3 bmax(-std::numeric_limits<float>::max());
for (const auto& v : vertices) {
bmin = glm::min(bmin, v);
bmax = glm::max(bmax, v);
}
gridOrigin = glm::vec2(bmin.x, bmin.y);
gridCellsX = std::max(1, std::min(32, static_cast<int>(std::ceil((bmax.x - bmin.x) / CELL_SIZE))));
gridCellsY = std::max(1, std::min(32, static_cast<int>(std::ceil((bmax.y - bmin.y) / CELL_SIZE))));
cellFloorTris.resize(static_cast<size_t>(gridCellsX) * static_cast<size_t>(gridCellsY));
cellWallTris.resize(static_cast<size_t>(gridCellsX) * static_cast<size_t>(gridCellsY));
triBounds.resize(triCount);
for (uint32_t ti = 0; ti < triCount; ti++) {
uint16_t i0 = indices[ti * 3];
uint16_t i1 = indices[ti * 3 + 1];
uint16_t i2 = indices[ti * 3 + 2];
if (i0 >= vertices.size() || i1 >= vertices.size() || i2 >= vertices.size()) continue;
const auto& v0 = vertices[i0];
const auto& v1 = vertices[i1];
const auto& v2 = vertices[i2];
triBounds[ti].minZ = std::min({v0.z, v1.z, v2.z});
triBounds[ti].maxZ = std::max({v0.z, v1.z, v2.z});
glm::vec3 normal = glm::cross(v1 - v0, v2 - v0);
float normalLen = glm::length(normal);
float absNz = (normalLen > 0.001f) ? std::abs(normal.z / normalLen) : 0.0f;
bool isFloor = (absNz >= 0.35f); // ~70° max slope (relaxed for steep stairs)
bool isWall = (absNz < 0.65f);
float triMinX = std::min({v0.x, v1.x, v2.x});
float triMaxX = std::max({v0.x, v1.x, v2.x});
float triMinY = std::min({v0.y, v1.y, v2.y});
float triMaxY = std::max({v0.y, v1.y, v2.y});
int cxMin = std::clamp(static_cast<int>((triMinX - gridOrigin.x) / CELL_SIZE), 0, gridCellsX - 1);
int cxMax = std::clamp(static_cast<int>((triMaxX - gridOrigin.x) / CELL_SIZE), 0, gridCellsX - 1);
int cyMin = std::clamp(static_cast<int>((triMinY - gridOrigin.y) / CELL_SIZE), 0, gridCellsY - 1);
int cyMax = std::clamp(static_cast<int>((triMaxY - gridOrigin.y) / CELL_SIZE), 0, gridCellsY - 1);
for (int cy = cyMin; cy <= cyMax; cy++) {
for (int cx = cxMin; cx <= cxMax; cx++) {
int ci = cy * gridCellsX + cx;
if (isFloor) cellFloorTris[ci].push_back(ti);
if (isWall) cellWallTris[ci].push_back(ti);
}
}
}
}
// Collects the indices of all floor-classified triangles registered in the
// grid cells overlapped by the XY rectangle [minX, maxX] x [minY, maxY].
// Coordinates are in the same space as the collision vertices (gridOrigin is
// derived from them in build()).
//
// `out` is cleared first and, on return, holds the triangle indices sorted in
// ascending order with duplicates removed (a triangle spanning several cells
// is reported once). `out` is left empty when the grid has not been built or
// when the rectangle is inverted (max < min on either axis).
void M2ModelGPU::CollisionMesh::getFloorTrisInRange(
        float minX, float minY, float maxX, float maxY,
        std::vector<uint32_t>& out) const {
    out.clear();
    if (gridCellsX <= 0 || gridCellsY <= 0) return;

    int cxMin = std::clamp(static_cast<int>((minX - gridOrigin.x) / CELL_SIZE), 0, gridCellsX - 1);
    int cxMax = std::clamp(static_cast<int>((maxX - gridOrigin.x) / CELL_SIZE), 0, gridCellsX - 1);
    int cyMin = std::clamp(static_cast<int>((minY - gridOrigin.y) / CELL_SIZE), 0, gridCellsY - 1);
    int cyMax = std::clamp(static_cast<int>((maxY - gridOrigin.y) / CELL_SIZE), 0, gridCellsY - 1);

    // An inverted rectangle covers no cells. Without this guard the negative
    // span below would wrap to a huge size_t and make reserve() throw.
    if (cxMax < cxMin || cyMax < cyMin) return;

    const size_t cellCount = static_cast<size_t>(cxMax - cxMin + 1) *
                             static_cast<size_t>(cyMax - cyMin + 1);
    out.reserve(cellCount * 8);  // rough guess of ~8 triangles per cell

    for (int cy = cyMin; cy <= cyMax; cy++) {
        for (int cx = cxMin; cx <= cxMax; cx++) {
            const auto& cell = cellFloorTris[cy * gridCellsX + cx];
            out.insert(out.end(), cell.begin(), cell.end());
        }
    }

    // A triangle is stored in every cell it touches, so de-duplicate.
    std::sort(out.begin(), out.end());
    out.erase(std::unique(out.begin(), out.end()), out.end());
}
// Collects the indices of all wall-classified triangles registered in the
// grid cells overlapped by the XY rectangle [minX, maxX] x [minY, maxY].
// Coordinates are in the same space as the collision vertices (gridOrigin is
// derived from them in build()).
//
// `out` is cleared first and, on return, holds the triangle indices sorted in
// ascending order with duplicates removed (a triangle spanning several cells
// is reported once). `out` is left empty when the grid has not been built or
// when the rectangle is inverted (max < min on either axis).
void M2ModelGPU::CollisionMesh::getWallTrisInRange(
        float minX, float minY, float maxX, float maxY,
        std::vector<uint32_t>& out) const {
    out.clear();
    if (gridCellsX <= 0 || gridCellsY <= 0) return;

    int cxMin = std::clamp(static_cast<int>((minX - gridOrigin.x) / CELL_SIZE), 0, gridCellsX - 1);
    int cxMax = std::clamp(static_cast<int>((maxX - gridOrigin.x) / CELL_SIZE), 0, gridCellsX - 1);
    int cyMin = std::clamp(static_cast<int>((minY - gridOrigin.y) / CELL_SIZE), 0, gridCellsY - 1);
    int cyMax = std::clamp(static_cast<int>((maxY - gridOrigin.y) / CELL_SIZE), 0, gridCellsY - 1);

    // An inverted rectangle covers no cells. Without this guard the negative
    // span below would wrap to a huge size_t and make reserve() throw.
    if (cxMax < cxMin || cyMax < cyMin) return;

    const size_t cellCount = static_cast<size_t>(cxMax - cxMin + 1) *
                             static_cast<size_t>(cyMax - cyMin + 1);
    out.reserve(cellCount * 8);  // rough guess of ~8 triangles per cell

    for (int cy = cyMin; cy <= cyMax; cy++) {
        for (int cx = cxMin; cx <= cxMax; cx++) {
            const auto& cell = cellWallTris[cy * gridCellsX + cx];
            out.insert(out.end(), cell.begin(), cell.end());
        }
    }

    // A triangle is stored in every cell it touches, so de-duplicate.
    std::sort(out.begin(), out.end());
    out.erase(std::unique(out.begin(), out.end()), out.end());
}
// Reports whether a model with the given id is present in `models`.
bool M2Renderer::hasModel(uint32_t modelId) const {
    const auto entry = models.find(modelId);
    const bool present = (entry != models.end());
    return present;
}
// Flags an already-loaded model as a spell effect. Unknown ids are ignored.
//
// Spell effects MUST have bone animation for ribbons/particles to work. The
// classifier may have set disableAnimation=true based on name tokens
// (e.g. "chest" in HolySmite_Low_Chest.m2), so that flag is cleared again
// here whenever the model actually has animation data.
void M2Renderer::markModelAsSpellEffect(uint32_t modelId) {
    const auto found = models.find(modelId);
    if (found == models.end()) {
        return;
    }

    auto& gpu = found->second;
    gpu.isSpellEffect = true;

    // Nothing to undo unless animation was disabled on an animated model.
    if (!gpu.disableAnimation || !gpu.hasAnimation) {
        return;
    }

    gpu.disableAnimation = false;
    LOG_INFO("SpellEffect: re-enabled animation for '", gpu.name, "'");
}
bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
if (models.find(modelId) != models.end()) {
// Already loaded
return true;
}
if (models.size() >= modelCacheLimit_) {
if (modelLimitRejectWarnings_ < 3) {
LOG_WARNING("M2 model cache full (", models.size(), "/", modelCacheLimit_,
"), skipping model load: id=", modelId, " name=", model.name);
}
++modelLimitRejectWarnings_;
return false;
}
bool hasGeometry = !model.vertices.empty() && !model.indices.empty();
bool hasParticles = !model.particleEmitters.empty();
bool hasRibbons = !model.ribbonEmitters.empty();
if (!hasGeometry && !hasParticles && !hasRibbons) {
LOG_WARNING("M2 model has no renderable content: ", model.name);
return false;
}
M2ModelGPU gpuModel;
gpuModel.name = model.name;
// Use tight bounds from actual vertices for collision/camera occlusion.
// Header bounds in some M2s are overly conservative.
glm::vec3 tightMin(0.0f);
glm::vec3 tightMax(0.0f);
if (hasGeometry) {
tightMin = glm::vec3(std::numeric_limits<float>::max());
tightMax = glm::vec3(-std::numeric_limits<float>::max());
for (const auto& v : model.vertices) {
tightMin = glm::min(tightMin, v.position);
tightMax = glm::max(tightMax, v.position);
}
}
// Classify model from name and geometry — pure function, no GPU dependencies.
auto cls = classifyM2Model(model.name, tightMin, tightMax,
model.vertices.size(),
model.particleEmitters.size());
const bool isInvisibleTrap = cls.isInvisibleTrap;
const bool groundDetailModel = cls.isGroundDetail;
if (isInvisibleTrap) {
LOG_INFO("Loading InvisibleTrap model: ", model.name, " (will be invisible, no collision)");
}
gpuModel.isInvisibleTrap = cls.isInvisibleTrap;
gpuModel.collisionSteppedFountain = cls.collisionSteppedFountain;
gpuModel.collisionSteppedLowPlatform = cls.collisionSteppedLowPlatform;
gpuModel.collisionBridge = cls.collisionBridge;
gpuModel.collisionPlanter = cls.collisionPlanter;
gpuModel.collisionStatue = cls.collisionStatue;
gpuModel.collisionTreeTrunk = cls.collisionTreeTrunk;
gpuModel.collisionNarrowVerticalProp = cls.collisionNarrowVerticalProp;
gpuModel.collisionSmallSolidProp = cls.collisionSmallSolidProp;
gpuModel.collisionNoBlock = cls.collisionNoBlock;
gpuModel.isGroundDetail = cls.isGroundDetail;
gpuModel.isFoliageLike = cls.isFoliageLike;
gpuModel.disableAnimation = cls.disableAnimation;
gpuModel.shadowWindFoliage = cls.shadowWindFoliage;
gpuModel.isFireflyEffect = cls.isFireflyEffect;
gpuModel.isSmallFoliage = cls.isSmallFoliage;
gpuModel.isSmoke = cls.isSmoke;
gpuModel.isSpellEffect = cls.isSpellEffect;
gpuModel.isLavaModel = cls.isLavaModel;
gpuModel.isInstancePortal = cls.isInstancePortal;
gpuModel.isWaterVegetation = cls.isWaterVegetation;
gpuModel.isElvenLike = cls.isElvenLike;
gpuModel.isLanternLike = cls.isLanternLike;
gpuModel.isKoboldFlame = cls.isKoboldFlame;
gpuModel.isWaterfall = cls.isWaterfall;
gpuModel.isBrazierOrFire = cls.isBrazierOrFire;
gpuModel.isTorch = cls.isTorch;
gpuModel.ambientEmitterType = cls.ambientEmitterType;
gpuModel.boundMin = tightMin;
gpuModel.boundMax = tightMax;
gpuModel.boundRadius = model.boundRadius;
// Fallback: compute bound radius from vertex extents when M2 header reports 0
if (gpuModel.boundRadius < 0.01f && !model.vertices.empty()) {
glm::vec3 extent = tightMax - tightMin;
gpuModel.boundRadius = glm::length(extent) * 0.5f;
}
gpuModel.indexCount = static_cast<uint32_t>(model.indices.size());
gpuModel.vertexCount = static_cast<uint32_t>(model.vertices.size());
// Store bone/sequence data for animation
gpuModel.bones = model.bones;
gpuModel.sequences = model.sequences;
gpuModel.globalSequenceDurations = model.globalSequenceDurations;
gpuModel.hasAnimation = false;
for (const auto& bone : model.bones) {
if (bone.translation.hasData() || bone.rotation.hasData() || bone.scale.hasData()) {
gpuModel.hasAnimation = true;
break;
}
}
// Build collision mesh + spatial grid from M2 bounding geometry
gpuModel.collision.vertices = model.collisionVertices;
gpuModel.collision.indices = model.collisionIndices;
gpuModel.collision.build();
if (gpuModel.collision.valid()) {
core::Logger::getInstance().debug(" M2 collision mesh: ", gpuModel.collision.triCount,
" tris, grid ", gpuModel.collision.gridCellsX, "x", gpuModel.collision.gridCellsY);
}
// Identify idle variation sequences (animation ID 0 = Stand)
for (int i = 0; i < static_cast<int>(model.sequences.size()); i++) {
if (model.sequences[i].id == 0 && model.sequences[i].duration > 0) {
gpuModel.idleVariationIndices.push_back(i);
}
}
// Batch all GPU uploads (VB, IB, textures) into a single command buffer
// submission with one fence wait, instead of one fence wait per upload.
vkCtx_->beginUploadBatch();
if (hasGeometry) {
// Create VBO with interleaved vertex data
// Format: position (3), normal (3), texcoord0 (2), texcoord1 (2), boneWeights (4), boneIndices (4 as float)
const size_t floatsPerVertex = 18;
std::vector<float> vertexData;
vertexData.reserve(model.vertices.size() * floatsPerVertex);
for (const auto& v : model.vertices) {
vertexData.push_back(v.position.x);
vertexData.push_back(v.position.y);
vertexData.push_back(v.position.z);
vertexData.push_back(v.normal.x);
vertexData.push_back(v.normal.y);
vertexData.push_back(v.normal.z);
vertexData.push_back(v.texCoords[0].x);
vertexData.push_back(v.texCoords[0].y);
vertexData.push_back(v.texCoords[1].x);
vertexData.push_back(v.texCoords[1].y);
float w0 = v.boneWeights[0] / 255.0f;
float w1 = v.boneWeights[1] / 255.0f;
float w2 = v.boneWeights[2] / 255.0f;
float w3 = v.boneWeights[3] / 255.0f;
vertexData.push_back(w0);
vertexData.push_back(w1);
vertexData.push_back(w2);
vertexData.push_back(w3);
vertexData.push_back(static_cast<float>(std::min(v.boneIndices[0], uint8_t(127))));
vertexData.push_back(static_cast<float>(std::min(v.boneIndices[1], uint8_t(127))));
vertexData.push_back(static_cast<float>(std::min(v.boneIndices[2], uint8_t(127))));
vertexData.push_back(static_cast<float>(std::min(v.boneIndices[3], uint8_t(127))));
}
// Upload vertex buffer to GPU
{
auto buf = uploadBuffer(*vkCtx_,
vertexData.data(), vertexData.size() * sizeof(float),
VK_BUFFER_USAGE_VERTEX_BUFFER_BIT);
gpuModel.vertexBuffer = buf.buffer;
gpuModel.vertexAlloc = buf.allocation;
}
// Upload index buffer to GPU
{
auto buf = uploadBuffer(*vkCtx_,
model.indices.data(), model.indices.size() * sizeof(uint16_t),
VK_BUFFER_USAGE_INDEX_BUFFER_BIT);
gpuModel.indexBuffer = buf.buffer;
gpuModel.indexAlloc = buf.allocation;
}
2026-03-22 21:47:12 +03:00
if (!gpuModel.vertexBuffer || !gpuModel.indexBuffer) {
LOG_ERROR("M2Renderer::loadModel: GPU buffer upload failed for model ", modelId);
}
}
// Load ALL textures from the model into a local vector.
// textureLoadFailed[i] is true if texture[i] had a named path that failed to load.
// Such batches are hidden (batchOpacity=0) rather than rendered white.
std::vector<VkTexture*> allTextures;
std::vector<bool> textureLoadFailed;
std::vector<std::string> textureKeysLower;
if (assetManager) {
for (size_t ti = 0; ti < model.textures.size(); ti++) {
const auto& tex = model.textures[ti];
std::string texPath = tex.filename;
// Some extracted M2 texture strings contain embedded NUL + garbage suffix.
// Truncate at first NUL so valid paths like "...foo.blp\0junk" still resolve.
size_t nul = texPath.find('\0');
if (nul != std::string::npos) {
texPath.resize(nul);
}
if (!texPath.empty()) {
std::string texKey = texPath;
std::replace(texKey.begin(), texKey.end(), '/', '\\');
std::transform(texKey.begin(), texKey.end(), texKey.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
VkTexture* texPtr = loadTexture(texPath, tex.flags);
bool failed = (texPtr == whiteTexture_.get());
if (failed) {
static uint32_t loggedModelTextureFails = 0;
static bool loggedModelTextureFailSuppressed = false;
if (loggedModelTextureFails < 250) {
LOG_WARNING("M2 model ", model.name, " texture[", ti, "] failed to load: ", texPath);
++loggedModelTextureFails;
} else if (!loggedModelTextureFailSuppressed) {
LOG_WARNING("M2 model texture-failure warnings suppressed after ",
loggedModelTextureFails, " entries");
loggedModelTextureFailSuppressed = true;
}
}
if (isInvisibleTrap) {
LOG_INFO(" InvisibleTrap texture[", ti, "]: ", texPath, " -> ", (failed ? "WHITE" : "OK"));
}
allTextures.push_back(texPtr);
textureLoadFailed.push_back(failed);
textureKeysLower.push_back(std::move(texKey));
} else {
if (isInvisibleTrap) {
LOG_INFO(" InvisibleTrap texture[", ti, "]: EMPTY (using white fallback)");
}
allTextures.push_back(whiteTexture_.get());
textureLoadFailed.push_back(false); // Empty filename = intentional white (type!=0)
textureKeysLower.emplace_back();
}
}
}
static const bool kGlowDiag = envFlagEnabled("WOWEE_M2_GLOW_DIAG", false);
if (kGlowDiag) {
if (gpuModel.isLanternLike) {
for (size_t ti = 0; ti < model.textures.size(); ++ti) {
const std::string key = (ti < textureKeysLower.size()) ? textureKeysLower[ti] : std::string();
LOG_DEBUG("M2 GLOW TEX '", model.name, "' tex[", ti, "]='", key, "' flags=0x",
std::hex, model.textures[ti].flags, std::dec);
}
}
}
// Copy particle emitter data and resolve textures
gpuModel.particleEmitters = model.particleEmitters;
gpuModel.particleTextures.resize(model.particleEmitters.size(), whiteTexture_.get());
for (size_t ei = 0; ei < model.particleEmitters.size(); ei++) {
uint16_t texIdx = model.particleEmitters[ei].texture;
if (texIdx < allTextures.size() && allTextures[texIdx] != nullptr) {
gpuModel.particleTextures[ei] = allTextures[texIdx];
} else {
LOG_WARNING("M2 '", model.name, "' particle emitter[", ei,
"] texture index ", texIdx, " out of range (", allTextures.size(),
" textures) — using white fallback");
}
}
// Pre-allocate one stable descriptor set per particle emitter to avoid per-frame allocation.
// This prevents materialDescPool_ exhaustion when many emitters are active each frame.
if (particleTexLayout_ && materialDescPool_ && !model.particleEmitters.empty()) {
VkDevice device = vkCtx_->getDevice();
gpuModel.particleTexSets.resize(model.particleEmitters.size(), VK_NULL_HANDLE);
for (size_t ei = 0; ei < model.particleEmitters.size(); ei++) {
VkDescriptorSetAllocateInfo ai{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
ai.descriptorPool = materialDescPool_;
ai.descriptorSetCount = 1;
ai.pSetLayouts = &particleTexLayout_;
if (vkAllocateDescriptorSets(device, &ai, &gpuModel.particleTexSets[ei]) == VK_SUCCESS) {
VkTexture* tex = gpuModel.particleTextures[ei];
VkDescriptorImageInfo imgInfo = tex->descriptorInfo();
VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
write.dstSet = gpuModel.particleTexSets[ei];
write.dstBinding = 0;
write.descriptorCount = 1;
write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
write.pImageInfo = &imgInfo;
vkUpdateDescriptorSets(device, 1, &write, 0, nullptr);
}
}
}
// Copy ribbon emitter data and resolve textures
gpuModel.ribbonEmitters = model.ribbonEmitters;
if (!model.ribbonEmitters.empty()) {
VkDevice device = vkCtx_->getDevice();
gpuModel.ribbonTextures.resize(model.ribbonEmitters.size(), whiteTexture_.get());
gpuModel.ribbonTexSets.resize(model.ribbonEmitters.size(), VK_NULL_HANDLE);
for (size_t ri = 0; ri < model.ribbonEmitters.size(); ri++) {
// Resolve texture: ribbon textureIndex is a direct index into the
// model's texture array (NOT through the textureLookup table).
uint16_t texDirect = model.ribbonEmitters[ri].textureIndex;
if (texDirect < allTextures.size() && allTextures[texDirect] != nullptr) {
gpuModel.ribbonTextures[ri] = allTextures[texDirect];
} else {
// Fallback: try through textureLookup table
uint32_t texIdx = (texDirect < model.textureLookup.size())
? model.textureLookup[texDirect] : UINT32_MAX;
if (texIdx < allTextures.size() && allTextures[texIdx] != nullptr) {
gpuModel.ribbonTextures[ri] = allTextures[texIdx];
} else {
LOG_WARNING("M2 '", model.name, "' ribbon emitter[", ri,
"] texIndex=", texDirect, " lookup failed"
" (direct=", (texDirect < allTextures.size() ? "yes" : "OOB"),
" lookup=", texIdx,
" textures=", allTextures.size(),
") — using white fallback");
}
}
// Allocate descriptor set (reuse particleTexLayout_ = single sampler)
if (particleTexLayout_ && materialDescPool_) {
VkDescriptorSetAllocateInfo ai{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
ai.descriptorPool = materialDescPool_;
ai.descriptorSetCount = 1;
ai.pSetLayouts = &particleTexLayout_;
if (vkAllocateDescriptorSets(device, &ai, &gpuModel.ribbonTexSets[ri]) == VK_SUCCESS) {
VkTexture* tex = gpuModel.ribbonTextures[ri];
VkDescriptorImageInfo imgInfo = tex->descriptorInfo();
VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
write.dstSet = gpuModel.ribbonTexSets[ri];
write.dstBinding = 0;
write.descriptorCount = 1;
write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
write.pImageInfo = &imgInfo;
vkUpdateDescriptorSets(device, 1, &write, 0, nullptr);
}
}
}
LOG_DEBUG(" Ribbon emitters loaded: ", model.ribbonEmitters.size());
}
// Copy texture transform data for UV animation
gpuModel.textureTransforms = model.textureTransforms;
gpuModel.textureTransformLookup = model.textureTransformLookup;
gpuModel.hasTextureAnimation = false;
// Build per-batch GPU entries
if (!model.batches.empty()) {
for (const auto& batch : model.batches) {
M2ModelGPU::BatchGPU bgpu;
bgpu.indexStart = batch.indexStart;
bgpu.indexCount = batch.indexCount;
// Store texture animation index from batch
bgpu.textureAnimIndex = batch.textureAnimIndex;
if (bgpu.textureAnimIndex != 0xFFFF) {
gpuModel.hasTextureAnimation = true;
}
// Store blend mode and flags from material
if (batch.materialIndex < model.materials.size()) {
bgpu.blendMode = model.materials[batch.materialIndex].blendMode;
bgpu.materialFlags = model.materials[batch.materialIndex].flags;
if (bgpu.blendMode >= 2) gpuModel.hasTransparentBatches = true;
}
// Copy LOD level from batch
bgpu.submeshLevel = batch.submeshLevel;
// Resolve texture: batch.textureIndex → textureLookup → allTextures
VkTexture* tex = whiteTexture_.get();
bool texFailed = false;
std::string batchTexKeyLower;
if (batch.textureIndex < model.textureLookup.size()) {
uint16_t texIdx = model.textureLookup[batch.textureIndex];
if (texIdx < allTextures.size()) {
tex = allTextures[texIdx];
texFailed = (texIdx < textureLoadFailed.size()) && textureLoadFailed[texIdx];
if (texIdx < textureKeysLower.size()) {
batchTexKeyLower = textureKeysLower[texIdx];
}
}
if (texIdx < model.textures.size()) {
bgpu.texFlags = static_cast<uint8_t>(model.textures[texIdx].flags & 0x3);
}
} else if (!allTextures.empty()) {
LOG_WARNING("M2 '", model.name, "' batch textureIndex ", batch.textureIndex,
" out of range (textureLookup size=", model.textureLookup.size(),
") — falling back to texture[0]");
tex = allTextures[0];
texFailed = !textureLoadFailed.empty() && textureLoadFailed[0];
if (!textureKeysLower.empty()) {
batchTexKeyLower = textureKeysLower[0];
}
}
if (texFailed && groundDetailModel) {
static const std::string kDetailFallbackTexture = "World\\NoDXT\\Detail\\8des_detaildoodads01.blp";
VkTexture* fallbackTex = loadTexture(kDetailFallbackTexture, 0);
if (fallbackTex != nullptr && fallbackTex != whiteTexture_.get()) {
tex = fallbackTex;
texFailed = false;
}
}
bgpu.texture = tex;
const auto tcls = classifyBatchTexture(batchTexKeyLower);
const bool modelLanternFamily = gpuModel.isLanternLike;
bgpu.lanternGlowHint =
tcls.exactLanternGlowTex ||
((tcls.hasGlowToken || (modelLanternFamily && tcls.hasFlameToken)) &&
(tcls.lanternFamily || modelLanternFamily) &&
(!tcls.likelyFlame || modelLanternFamily));
bgpu.glowCardLike = bgpu.lanternGlowHint && tcls.hasGlowCardToken;
bgpu.glowTint = tcls.glowTint;
if (tex != nullptr && tex != whiteTexture_.get()) {
auto pit = texturePropsByPtr_.find(tex);
if (pit != texturePropsByPtr_.end()) {
bgpu.hasAlpha = pit->second.hasAlpha;
bgpu.colorKeyBlack = pit->second.colorKeyBlack;
}
}
// textureCoordIndex is an index into a texture coord combo table, not directly
// a UV set selector. Most batches have index=0 (UV set 0). We always use UV set 0
// since we don't have the full combo table — dual-UV effects are rare edge cases.
bgpu.textureUnit = 0;
// Start at full opacity; hide only if texture failed to load.
bgpu.batchOpacity = (texFailed && !groundDetailModel) ? 0.0f : 1.0f;
// Apply at-rest transparency and color alpha from the M2 animation tracks.
// These provide per-batch opacity for ghosts, ethereal effects, fading doodads, etc.
// Skip zero values: some animated tracks start at 0 and animate up, and baking
// that first keyframe would make the entire batch permanently invisible.
if (bgpu.batchOpacity > 0.0f) {
float animAlpha = 1.0f;
if (batch.colorIndex < model.colorAlphas.size()) {
float ca = model.colorAlphas[batch.colorIndex];
if (ca > 0.001f) animAlpha *= ca;
}
if (batch.transparencyIndex < model.textureWeights.size()) {
float tw = model.textureWeights[batch.transparencyIndex];
if (tw > 0.001f) animAlpha *= tw;
}
bgpu.batchOpacity *= animAlpha;
}
// Compute batch center and radius for glow sprite positioning
if ((bgpu.blendMode >= 3 || bgpu.colorKeyBlack) && batch.indexCount > 0) {
glm::vec3 sum(0.0f);
uint32_t counted = 0;
for (uint32_t j = batch.indexStart; j < batch.indexStart + batch.indexCount; j++) {
if (j < model.indices.size()) {
uint16_t vi = model.indices[j];
if (vi < model.vertices.size()) {
sum += model.vertices[vi].position;
counted++;
}
}
}
if (counted > 0) {
bgpu.center = sum / static_cast<float>(counted);
float maxDist = 0.0f;
for (uint32_t j = batch.indexStart; j < batch.indexStart + batch.indexCount; j++) {
if (j < model.indices.size()) {
uint16_t vi = model.indices[j];
if (vi < model.vertices.size()) {
float d = glm::length(model.vertices[vi].position - bgpu.center);
maxDist = std::max(maxDist, d);
}
}
}
bgpu.glowSize = std::max(maxDist, 0.5f);
}
}
// Optional diagnostics for glow/light batches (disabled by default).
if (kGlowDiag && gpuModel.isLanternLike) {
LOG_DEBUG("M2 GLOW DIAG '", model.name, "' batch ", gpuModel.batches.size(),
": blend=", bgpu.blendMode, " matFlags=0x",
std::hex, bgpu.materialFlags, std::dec,
" colorKey=", bgpu.colorKeyBlack ? "Y" : "N",
" hasAlpha=", bgpu.hasAlpha ? "Y" : "N",
" unlit=", (bgpu.materialFlags & 0x01) ? "Y" : "N",
" lanternHint=", bgpu.lanternGlowHint ? "Y" : "N",
" glowSize=", bgpu.glowSize,
" tex=", bgpu.texture,
" idxCount=", bgpu.indexCount);
}
gpuModel.batches.push_back(bgpu);
}
} else {
// Fallback: single batch covering all indices with first texture
M2ModelGPU::BatchGPU bgpu;
bgpu.indexStart = 0;
bgpu.indexCount = gpuModel.indexCount;
bgpu.texture = allTextures.empty() ? whiteTexture_.get() : allTextures[0];
if (bgpu.texture != nullptr && bgpu.texture != whiteTexture_.get()) {
auto pit = texturePropsByPtr_.find(bgpu.texture);
if (pit != texturePropsByPtr_.end()) {
bgpu.hasAlpha = pit->second.hasAlpha;
bgpu.colorKeyBlack = pit->second.colorKeyBlack;
}
}
gpuModel.batches.push_back(bgpu);
}
// Detect particle emitter volume models: box mesh (24 verts, 36 indices)
// with disproportionately large bounds. These are invisible bounding volumes
// that only exist to spawn particles — their mesh should never be rendered.
if (!isInvisibleTrap && !groundDetailModel &&
gpuModel.vertexCount <= 24 && gpuModel.indexCount <= 36
&& !model.particleEmitters.empty()) {
glm::vec3 size = gpuModel.boundMax - gpuModel.boundMin;
float maxDim = std::max({size.x, size.y, size.z});
if (maxDim > 5.0f) {
gpuModel.isInvisibleTrap = true;
LOG_DEBUG("M2 emitter volume hidden: '", model.name, "' size=(",
size.x, " x ", size.y, " x ", size.z, ")");
}
}
vkCtx_->endUploadBatch();
// Allocate Vulkan descriptor sets and UBOs for each batch
for (auto& bgpu : gpuModel.batches) {
// Create the per-batch M2Material UBO (binding 2): host-writable and persistently mapped.
// Only this one buffer is allocated here; the descriptor writes below cover binding 0 (texture) and binding 2 (material).
VmaAllocationInfo matAllocInfo{};
{
VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
bci.size = sizeof(M2MaterialUBO);
bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
VmaAllocationCreateInfo aci{};
aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, &bgpu.materialUBO, &bgpu.materialUBOAlloc, &matAllocInfo);
// Write initial material data (static per-batch — fadeAlpha/interiorDarken updated at draw time)
M2MaterialUBO mat{};
mat.hasTexture = (bgpu.texture != nullptr && bgpu.texture != whiteTexture_.get()) ? 1 : 0;
mat.alphaTest = (bgpu.blendMode == 1 || (bgpu.blendMode >= 2 && !bgpu.hasAlpha)) ? 1 : 0;
mat.colorKeyBlack = bgpu.colorKeyBlack ? 1 : 0;
mat.colorKeyThreshold = 0.08f;
mat.unlit = (bgpu.materialFlags & 0x01) ? 1 : 0;
mat.blendMode = bgpu.blendMode;
mat.fadeAlpha = 1.0f;
mat.interiorDarken = 0.0f;
mat.specularIntensity = 0.5f;
memcpy(matAllocInfo.pMappedData, &mat, sizeof(mat));
bgpu.materialUBOMapped = matAllocInfo.pMappedData;
}
// Allocate descriptor set and write all bindings
bgpu.materialSet = allocateMaterialSet();
if (bgpu.materialSet) {
VkTexture* batchTex = bgpu.texture ? bgpu.texture : whiteTexture_.get();
VkDescriptorImageInfo imgInfo = batchTex->descriptorInfo();
VkDescriptorBufferInfo matBufInfo{};
matBufInfo.buffer = bgpu.materialUBO;
matBufInfo.offset = 0;
matBufInfo.range = sizeof(M2MaterialUBO);
VkWriteDescriptorSet writes[2] = {};
// binding 0: texture
writes[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
writes[0].dstSet = bgpu.materialSet;
writes[0].dstBinding = 0;
writes[0].descriptorCount = 1;
writes[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
writes[0].pImageInfo = &imgInfo;
// binding 2: M2Material UBO
writes[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
writes[1].dstSet = bgpu.materialSet;
writes[1].dstBinding = 2;
writes[1].descriptorCount = 1;
writes[1].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
writes[1].pBufferInfo = &matBufInfo;
vkUpdateDescriptorSets(vkCtx_->getDevice(), 2, writes, 0, nullptr);
}
}
// Pre-compute available LOD levels to avoid per-instance batch iteration
gpuModel.availableLODs = 0;
for (const auto& b : gpuModel.batches) {
if (b.submeshLevel < 8) gpuModel.availableLODs |= (1u << b.submeshLevel);
}
models[modelId] = std::move(gpuModel);
spatialIndexDirty_ = true; // Map may have rehashed — refresh cachedModel pointers
LOG_DEBUG("Loaded M2 model: ", model.name, " (", models[modelId].vertexCount, " vertices, ",
models[modelId].indexCount / 3, " triangles, ", models[modelId].batches.size(), " batches)");
return true;
}
} // namespace rendering
} // namespace wowee