diff --git a/CMakeLists.txt b/CMakeLists.txt index f61b4024..88daaa4a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -605,6 +605,7 @@ set(WOWEE_SOURCES src/rendering/wmo_renderer.cpp src/rendering/m2_renderer.cpp src/rendering/m2_model_classifier.cpp + src/rendering/render_graph.cpp src/rendering/quest_marker_renderer.cpp src/rendering/minimap.cpp src/rendering/world_map.cpp diff --git a/assets/shaders/m2.vert.glsl b/assets/shaders/m2.vert.glsl index 6f4545c8..a5913ca2 100644 --- a/assets/shaders/m2.vert.glsl +++ b/assets/shaders/m2.vert.glsl @@ -13,19 +13,29 @@ layout(set = 0, binding = 0) uniform PerFrame { vec4 shadowParams; }; +// Phase 2.1: Per-draw push constants (batch-level data only) layout(push_constant) uniform Push { - mat4 model; - vec2 uvOffset; - int texCoordSet; - int useBones; - int isFoliage; - float fadeAlpha; + int texCoordSet; // UV set index (0 or 1) + int isFoliage; // Foliage wind animation flag + int instanceDataOffset; // Base index into InstanceSSBO for this draw group } push; layout(set = 2, binding = 0) readonly buffer BoneSSBO { mat4 bones[]; }; +// Phase 2.1: Per-instance data read via gl_InstanceIndex (GPU instancing) +struct InstanceData { + mat4 model; + vec2 uvOffset; + float fadeAlpha; + int useBones; + int boneBase; +}; +layout(set = 3, binding = 0) readonly buffer InstanceSSBO { + InstanceData instanceData[]; +}; + layout(location = 0) in vec3 aPos; layout(location = 1) in vec3 aNormal; layout(location = 2) in vec2 aTexCoord; @@ -41,15 +51,23 @@ layout(location = 4) out float ModelHeight; layout(location = 5) out float vFadeAlpha; void main() { + // Phase 2.1: Fetch per-instance data from SSBO + int instIdx = push.instanceDataOffset + gl_InstanceIndex; + mat4 model = instanceData[instIdx].model; + vec2 uvOff = instanceData[instIdx].uvOffset; + float fade = instanceData[instIdx].fadeAlpha; + int uBones = instanceData[instIdx].useBones; + int bBase = instanceData[instIdx].boneBase; + vec4 pos = vec4(aPos, 1.0); vec4 norm = 
vec4(aNormal, 0.0); - if (push.useBones != 0) { + if (uBones != 0) { ivec4 bi = ivec4(aBoneIndicesF); - mat4 skinMat = bones[bi.x] * aBoneWeights.x - + bones[bi.y] * aBoneWeights.y - + bones[bi.z] * aBoneWeights.z - + bones[bi.w] * aBoneWeights.w; + mat4 skinMat = bones[bBase + bi.x] * aBoneWeights.x + + bones[bBase + bi.y] * aBoneWeights.y + + bones[bBase + bi.z] * aBoneWeights.z + + bones[bBase + bi.w] * aBoneWeights.w; pos = skinMat * pos; norm = skinMat * norm; } @@ -57,7 +75,7 @@ void main() { // Wind animation for foliage if (push.isFoliage != 0) { float windTime = fogParams.z; - vec3 worldRef = push.model[3].xyz; + vec3 worldRef = model[3].xyz; float heightFactor = clamp(pos.z / 20.0, 0.0, 1.0); heightFactor *= heightFactor; // quadratic — base stays grounded @@ -80,15 +98,15 @@ void main() { pos.y += trunkSwayY + branchSwayY + leafFlutterY; } - vec4 worldPos = push.model * pos; + vec4 worldPos = model * pos; FragPos = worldPos.xyz; - Normal = mat3(push.model) * norm.xyz; + Normal = mat3(model) * norm.xyz; - TexCoord = (push.texCoordSet == 1 ? aTexCoord2 : aTexCoord) + push.uvOffset; + TexCoord = (push.texCoordSet == 1 ? aTexCoord2 : aTexCoord) + uvOff; - InstanceOrigin = push.model[3].xyz; + InstanceOrigin = model[3].xyz; ModelHeight = pos.z; - vFadeAlpha = push.fadeAlpha; + vFadeAlpha = fade; gl_Position = projection * view * worldPos; } diff --git a/assets/shaders/m2.vert.spv b/assets/shaders/m2.vert.spv index 8397440f..11364e67 100644 Binary files a/assets/shaders/m2.vert.spv and b/assets/shaders/m2.vert.spv differ diff --git a/assets/shaders/m2_cull.comp.glsl b/assets/shaders/m2_cull.comp.glsl new file mode 100644 index 00000000..831a521e --- /dev/null +++ b/assets/shaders/m2_cull.comp.glsl @@ -0,0 +1,76 @@ +#version 450 + +// Phase 2.3: GPU Frustum Culling for M2 doodads +// Each compute thread tests one M2 instance against 6 frustum planes. +// Input: per-instance bounding sphere + flags. +// Output: uint visibility array (1 = visible, 0 = culled). 
+ +layout(local_size_x = 64) in; + +// Per-instance cull data (uploaded from CPU each frame) +struct CullInstance { + vec4 sphere; // xyz = world position, w = padded radius + float effectiveMaxDistSq; // adaptive distance cull threshold + uint flags; // bit 0 = valid, bit 1 = smoke, bit 2 = invisibleTrap + float _pad0; + float _pad1; +}; + +layout(std140, set = 0, binding = 0) uniform CullUniforms { + vec4 frustumPlanes[6]; // xyz = normal, w = distance + vec4 cameraPos; // xyz = camera position, w = maxPossibleDistSq + uint instanceCount; + uint _pad0; + uint _pad1; + uint _pad2; +}; + +layout(std430, set = 0, binding = 1) readonly buffer CullInput { + CullInstance cullInstances[]; +}; + +layout(std430, set = 0, binding = 2) writeonly buffer CullOutput { + uint visibility[]; +}; + +void main() { + uint id = gl_GlobalInvocationID.x; + if (id >= instanceCount) return; + + CullInstance inst = cullInstances[id]; + + // Flag check: must be valid, not smoke, not invisible trap + uint f = inst.flags; + if ((f & 1u) == 0u || (f & 6u) != 0u) { + visibility[id] = 0u; + return; + } + + // Early distance rejection (loose upper bound) + vec3 toCam = inst.sphere.xyz - cameraPos.xyz; + float distSq = dot(toCam, toCam); + if (distSq > cameraPos.w) { + visibility[id] = 0u; + return; + } + + // Accurate per-instance distance cull + if (distSq > inst.effectiveMaxDistSq) { + visibility[id] = 0u; + return; + } + + // Frustum cull: sphere vs 6 planes + float radius = inst.sphere.w; + if (radius > 0.0) { + for (int i = 0; i < 6; i++) { + float d = dot(frustumPlanes[i].xyz, inst.sphere.xyz) + frustumPlanes[i].w; + if (d < -radius) { + visibility[id] = 0u; + return; + } + } + } + + visibility[id] = 1u; +} diff --git a/assets/shaders/m2_cull.comp.spv b/assets/shaders/m2_cull.comp.spv new file mode 100644 index 00000000..ef1a08fd Binary files /dev/null and b/assets/shaders/m2_cull.comp.spv differ diff --git a/include/rendering/camera.hpp b/include/rendering/camera.hpp index 
ee58c8f2..ed4732f2 100644 --- a/include/rendering/camera.hpp +++ b/include/rendering/camera.hpp @@ -51,7 +51,7 @@ private: float pitch = 0.0f; float fov = 45.0f; float aspectRatio = 16.0f / 9.0f; - float nearPlane = 0.05f; + float nearPlane = 0.5f; float farPlane = 30000.0f; // Improves depth precision vs extremely large far clip glm::mat4 viewMatrix = glm::mat4(1.0f); diff --git a/include/rendering/m2_renderer.hpp b/include/rendering/m2_renderer.hpp index dbeeeae8..0acd9972 100644 --- a/include/rendering/m2_renderer.hpp +++ b/include/rendering/m2_renderer.hpp @@ -219,12 +219,15 @@ struct M2Instance { uint8_t frameSkipCounter = 0; bool bonesDirty[2] = {false, false}; // Per-frame-index: set when bones recomputed, cleared after upload - // Per-instance bone SSBO (double-buffered) + // Per-instance bone SSBO (double-buffered) — legacy; see mega bone SSBO in M2Renderer ::VkBuffer boneBuffer[2] = {}; VmaAllocation boneAlloc[2] = {}; void* boneMapped[2] = {}; VkDescriptorSet boneSet[2] = {}; + // Mega bone SSBO offset — base bone index for this instance (set per-frame in prepareRender) + uint32_t megaBoneOffset = 0; + void updateModelMatrix(); }; @@ -292,6 +295,8 @@ public: */ /** Pre-allocate GPU resources (bone SSBOs, descriptors) on main thread before parallel render. */ void prepareRender(uint32_t frameIndex, const Camera& camera); + /** Phase 2.3: Dispatch GPU frustum culling compute shader on primary cmd before render pass. */ + void dispatchCullCompute(VkCommandBuffer cmd, uint32_t frameIndex, const Camera& camera); void render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera); /** @@ -425,6 +430,65 @@ private: VmaAllocation dummyBoneAlloc_ = VK_NULL_HANDLE; VkDescriptorSet dummyBoneSet_ = VK_NULL_HANDLE; + // Mega bone SSBO — consolidates all per-instance bone matrices into a single buffer per frame. + // Replaces per-instance bone SSBOs for fewer descriptor binds and enables GPU instancing. 
+ static constexpr uint32_t MEGA_BONE_MAX_INSTANCES = 2048; + static constexpr uint32_t MAX_BONES_PER_INSTANCE = 128; + ::VkBuffer megaBoneBuffer_[2] = {}; + VmaAllocation megaBoneAlloc_[2] = {}; + void* megaBoneMapped_[2] = {}; + VkDescriptorSet megaBoneSet_[2] = {}; + + // Phase 2.1: GPU instance data SSBO — per-instance transforms, fade, bones for instanced draws. + // Shader reads instanceData[push.instanceDataOffset + gl_InstanceIndex]. + struct M2InstanceGPU { + glm::mat4 model; // 64 bytes @ offset 0 + glm::vec2 uvOffset; // 8 bytes @ offset 64 + float fadeAlpha; // 4 bytes @ offset 72 + int32_t useBones; // 4 bytes @ offset 76 + int32_t boneBase; // 4 bytes @ offset 80 + int32_t _pad[3] = {}; // 12 bytes @ offset 84 — align to 96 (std430) + }; + static constexpr uint32_t MAX_INSTANCE_DATA = 16384; + VkDescriptorSetLayout instanceSetLayout_ = VK_NULL_HANDLE; + VkDescriptorPool instanceDescPool_ = VK_NULL_HANDLE; + ::VkBuffer instanceBuffer_[2] = {}; + VmaAllocation instanceAlloc_[2] = {}; + void* instanceMapped_[2] = {}; + VkDescriptorSet instanceSet_[2] = {}; + uint32_t instanceDataCount_ = 0; // reset each frame in render() + + // Phase 2.3: GPU Frustum Culling via Compute Shader + // Compute shader tests each M2 instance against frustum planes + distance, writes visibility[]. + // CPU reads back visibility to build sortedVisible_ without per-instance frustum/distance tests. 
+ struct CullInstanceGPU { // matches CullInstance in m2_cull.comp.glsl (32 bytes, std430) + glm::vec4 sphere; // xyz = world position, w = padded radius + float effectiveMaxDistSq; // adaptive distance cull threshold + uint32_t flags; // bit 0 = valid, bit 1 = smoke, bit 2 = invisibleTrap + float _pad[2] = {}; + }; + struct CullUniformsGPU { // matches CullUniforms in m2_cull.comp.glsl (128 bytes, std140) + glm::vec4 frustumPlanes[6]; // xyz = normal, w = distance + glm::vec4 cameraPos; // xyz = camera position, w = maxPossibleDistSq + uint32_t instanceCount; + uint32_t _pad[3] = {}; + }; + static constexpr uint32_t MAX_CULL_INSTANCES = 16384; + VkPipeline cullPipeline_ = VK_NULL_HANDLE; + VkPipelineLayout cullPipelineLayout_ = VK_NULL_HANDLE; + VkDescriptorSetLayout cullSetLayout_ = VK_NULL_HANDLE; + VkDescriptorPool cullDescPool_ = VK_NULL_HANDLE; + VkDescriptorSet cullSet_[2] = {}; // double-buffered + ::VkBuffer cullUniformBuffer_[2] = {}; // frustum planes + camera (UBO) + VmaAllocation cullUniformAlloc_[2] = {}; + void* cullUniformMapped_[2] = {}; + ::VkBuffer cullInputBuffer_[2] = {}; // per-instance bounding sphere + flags (SSBO) + VmaAllocation cullInputAlloc_[2] = {}; + void* cullInputMapped_[2] = {}; + ::VkBuffer cullOutputBuffer_[2] = {}; // uint visibility[] (SSBO, host-readable) + VmaAllocation cullOutputAlloc_[2] = {}; + void* cullOutputMapped_[2] = {}; + // Dynamic ribbon vertex buffer (CPU-written triangle strip) static constexpr size_t MAX_RIBBON_VERTS = 2048; // 9 floats each ::VkBuffer ribbonVB_ = VK_NULL_HANDLE; diff --git a/include/rendering/render_graph.hpp b/include/rendering/render_graph.hpp new file mode 100644 index 00000000..39ea34bd --- /dev/null +++ b/include/rendering/render_graph.hpp @@ -0,0 +1,117 @@ +#pragma once + +#include <vulkan/vulkan.h> +#include <cstdint> +#include <functional> +#include <string> +#include <vector> + +namespace wowee { +namespace rendering { + +// Phase 2.5: Lightweight Render Graph / Frame Graph +// Converts hardcoded pass sequence (shadow → reflection → compute 
cull → +// main → post-process → ImGui → present) into declarative graph nodes. +// Graph auto-inserts VkImageMemoryBarrier between passes. + +// Resource handle — identifies a virtual resource (image or buffer) within the graph. +struct RGResource { + uint32_t id = UINT32_MAX; + bool valid() const { return id != UINT32_MAX; } +}; + +// Image barrier descriptor for automatic synchronization between passes. +struct RGImageBarrier { + VkImage image; + VkImageLayout oldLayout; + VkImageLayout newLayout; + VkAccessFlags srcAccess; + VkAccessFlags dstAccess; + VkPipelineStageFlags srcStage; + VkPipelineStageFlags dstStage; + VkImageAspectFlags aspectMask; +}; + +// Buffer barrier descriptor for automatic synchronization between passes. +struct RGBufferBarrier { + VkBuffer buffer; + VkDeviceSize offset; + VkDeviceSize size; + VkAccessFlags srcAccess; + VkAccessFlags dstAccess; + VkPipelineStageFlags srcStage; + VkPipelineStageFlags dstStage; +}; + +// Render pass node — wraps an execution callback with declared inputs/outputs. +struct RGPass { + std::string name; + std::vector<RGResource> inputs; + std::vector<RGResource> outputs; + std::function<void(VkCommandBuffer)> execute; + bool enabled = true; // Can be dynamically disabled per-frame + + // Barriers to insert before this pass executes + std::vector<RGImageBarrier> imageBarriers; + std::vector<RGBufferBarrier> bufferBarriers; +}; + +class RenderGraph { +public: + RenderGraph() = default; + ~RenderGraph() = default; + + // Reset graph for a new frame (clears passes, keeps resource registry). + void reset(); + + // Register a virtual resource (returns handle for input/output declarations). + RGResource registerResource(const std::string& name); + + // Look up a previously registered resource by name. + RGResource findResource(const std::string& name) const; + + // Add a render pass node. 
+ // inputs: resources this pass reads from + // outputs: resources this pass writes to + // execute: callback invoked with the frame's command buffer + void addPass(const std::string& name, + const std::vector<RGResource>& inputs, + const std::vector<RGResource>& outputs, + std::function<void(VkCommandBuffer)> execute); + + // Enable/disable a pass by name (for dynamic toggling, e.g. shadows off). + void setPassEnabled(const std::string& name, bool enabled); + + // Compile: topological sort by dependency order, insert barriers. + // Must be called after all addPass() calls and before execute(). + void compile(); + + // Execute all enabled passes in compiled order on the given command buffer. + void execute(VkCommandBuffer cmd); + + // Query: get the compiled execution order (indices into passes_, for debug HUD). + const std::vector<uint32_t>& getExecutionOrder() const { return executionOrder_; } + const std::vector<RGPass>& getPasses() const { return passes_; } + +private: + // Topological sort helper (Kahn's algorithm). + void topologicalSort(); + + // Resource registry: name → id + struct ResourceEntry { + std::string name; + uint32_t id; + }; + std::vector<ResourceEntry> resources_; + uint32_t nextResourceId_ = 0; + + // Pass storage + std::vector<RGPass> passes_; + + // Compiled execution order (indices into passes_) + std::vector<uint32_t> executionOrder_; + bool compiled_ = false; +}; + +} // namespace rendering +} // namespace wowee diff --git a/include/rendering/renderer.hpp b/include/rendering/renderer.hpp index 54372da9..a4d075e9 100644 --- a/include/rendering/renderer.hpp +++ b/include/rendering/renderer.hpp @@ -56,6 +56,7 @@ class AnimationController; class LevelUpEffect; class ChargeEffect; class SwimEffects; +class RenderGraph; class Renderer { public: @@ -433,6 +434,10 @@ private: bool ghostMode_ = false; // set each frame from gameHandler->isPlayerGhost() + // Phase 2.5: Render Graph — declarative pass ordering with automatic barriers + std::unique_ptr<RenderGraph> renderGraph_; + void buildFrameGraph(game::GameHandler* gameHandler); + // CPU timing stats (last 
frame/update). double lastUpdateMs = 0.0; double lastRenderMs = 0.0; diff --git a/include/rendering/terrain_manager.hpp b/include/rendering/terrain_manager.hpp index 50c09680..59c9c4e2 100644 --- a/include/rendering/terrain_manager.hpp +++ b/include/rendering/terrain_manager.hpp @@ -346,8 +346,8 @@ private: // Streaming parameters bool streamingEnabled = true; - int loadRadius = 4; // Load tiles within this radius (9x9 grid = 81 tiles) - int unloadRadius = 7; // Unload tiles beyond this radius + int loadRadius = 6; // Load tiles within this radius (13x13 grid = 169 tiles) + int unloadRadius = 9; // Unload tiles beyond this radius float updateInterval = 0.033f; // Check streaming every 33ms (~30 fps) float timeSinceLastUpdate = 0.0f; float proactiveStreamTimer_ = 0.0f; diff --git a/include/rendering/terrain_renderer.hpp b/include/rendering/terrain_renderer.hpp index 5bc13252..24fa1955 100644 --- a/include/rendering/terrain_renderer.hpp +++ b/include/rendering/terrain_renderer.hpp @@ -60,6 +60,11 @@ struct TerrainChunkGPU { float boundingSphereRadius = 0.0f; glm::vec3 boundingSphereCenter = glm::vec3(0.0f); + // Phase 2.2: Offsets into mega buffers for indirect drawing (-1 = not in mega buffer) + int32_t megaBaseVertex = -1; + uint32_t megaFirstIndex = 0; + uint32_t vertexCount = 0; + bool isValid() const { return vertexBuffer != VK_NULL_HANDLE && indexBuffer != VK_NULL_HANDLE; } }; @@ -200,6 +205,25 @@ private: bool fogEnabled = true; int renderedChunks = 0; int culledChunks = 0; + + // Phase 2.2: Mega vertex/index buffers for indirect drawing + // All terrain chunks share a single VB + IB, eliminating per-chunk rebinds. + // Indirect draw commands are built CPU-side each frame for visible chunks. 
+ VkBuffer megaVB_ = VK_NULL_HANDLE; + VmaAllocation megaVBAlloc_ = VK_NULL_HANDLE; + void* megaVBMapped_ = nullptr; + VkBuffer megaIB_ = VK_NULL_HANDLE; + VmaAllocation megaIBAlloc_ = VK_NULL_HANDLE; + void* megaIBMapped_ = nullptr; + uint32_t megaVBUsed_ = 0; // vertices used + uint32_t megaIBUsed_ = 0; // indices used + static constexpr uint32_t MEGA_VB_MAX_VERTS = 1536 * 1024; // ~1.5M verts × 44B ≈ 66MB + static constexpr uint32_t MEGA_IB_MAX_INDICES = 6 * 1024 * 1024; // 6M indices × 4B = 24MB + + VkBuffer indirectBuffer_ = VK_NULL_HANDLE; + VmaAllocation indirectAlloc_ = VK_NULL_HANDLE; + void* indirectMapped_ = nullptr; + static constexpr uint32_t MAX_INDIRECT_DRAWS = 8192; }; } // namespace rendering diff --git a/include/rendering/vk_pipeline.hpp b/include/rendering/vk_pipeline.hpp index e95337f8..e53229e3 100644 --- a/include/rendering/vk_pipeline.hpp +++ b/include/rendering/vk_pipeline.hpp @@ -75,6 +75,10 @@ public: // Dynamic state PipelineBuilder& setDynamicStates(const std::vector<VkDynamicState>& states); + // Pipeline derivatives — hint driver to share compiled state between similar pipelines + PipelineBuilder& setFlags(VkPipelineCreateFlags flags); + PipelineBuilder& setBasePipeline(VkPipeline basePipeline); + // Build the pipeline (pass a VkPipelineCache for faster creation) VkPipeline build(VkDevice device, VkPipelineCache cache = VK_NULL_HANDLE) const; @@ -106,6 +110,8 @@ private: VkRenderPass renderPass_ = VK_NULL_HANDLE; uint32_t subpass_ = 0; std::vector<VkDynamicState> dynamicStates_; + VkPipelineCreateFlags flags_ = 0; + VkPipeline basePipelineHandle_ = VK_NULL_HANDLE; }; // Helper to create a pipeline layout from descriptor set layouts and push constant ranges diff --git a/src/core/world_loader.cpp b/src/core/world_loader.cpp index 9e90e747..4e967b18 100644 --- a/src/core/world_loader.cpp +++ b/src/core/world_loader.cpp @@ -734,9 +734,9 @@ void WorldLoader::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float // Use a small radius for the initial load (just 
immediate tiles), // then restore the full radius after entering the game. // This matches WoW's behavior: load quickly, stream the rest in-game. - const int savedLoadRadius = 4; - terrainMgr->setLoadRadius(3); // 7x7=49 tiles — prevents hitches on spawn - terrainMgr->setUnloadRadius(7); + const int savedLoadRadius = 6; + terrainMgr->setLoadRadius(4); // 9x9=81 tiles — prevents hitches on spawn + terrainMgr->setUnloadRadius(9); // Trigger tile streaming for surrounding area terrainMgr->update(*camera, 1.0f); diff --git a/src/main.cpp b/src/main.cpp index a4481a9f..7ac84715 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -111,13 +111,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) { _NSGetExecutablePath(nullptr, &bufSize); std::string exePath(bufSize, '\0'); _NSGetExecutablePath(exePath.data(), &bufSize); - chdir(dirname(exePath.data())); + if (chdir(dirname(exePath.data())) != 0) {} } #elif defined(__linux__) { char buf[4096]; ssize_t len = readlink("/proc/self/exe", buf, sizeof(buf) - 1); - if (len > 0) { buf[len] = '\0'; chdir(dirname(buf)); } + if (len > 0) { buf[len] = '\0'; if (chdir(dirname(buf)) != 0) {} } } #endif diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp index d87a6844..8fccc598 100644 --- a/src/rendering/m2_renderer.cpp +++ b/src/rendering/m2_renderer.cpp @@ -349,6 +349,20 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout vkCreateDescriptorSetLayout(device, &ci, nullptr, &boneSetLayout_); } + // Phase 2.1: Instance data set layout (set 3): binding 0 = STORAGE_BUFFER (per-instance data) + { + VkDescriptorSetLayoutBinding binding{}; + binding.binding = 0; + binding.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + binding.descriptorCount = 1; + binding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + + VkDescriptorSetLayoutCreateInfo ci{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO}; + ci.bindingCount = 1; + ci.pBindings = &binding; + 
vkCreateDescriptorSetLayout(device, &ci, nullptr, &instanceSetLayout_); + } + // Particle texture set layout (set 1 for particles): binding 0 = sampler2D { VkDescriptorSetLayoutBinding binding{}; @@ -423,19 +437,244 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout } } + // Mega bone SSBO — consolidates all animated instance bones into one buffer per frame. + // Slot 0 = identity matrix (for non-animated instances), slots 1..N = animated instances. + { + const VkDeviceSize megaSize = MEGA_BONE_MAX_INSTANCES * MAX_BONES_PER_INSTANCE * sizeof(glm::mat4); + glm::mat4 identity(1.0f); + for (int i = 0; i < 2; i++) { + VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO}; + bci.size = megaSize; + bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + VmaAllocationCreateInfo aci{}; + aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; + aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + VmaAllocationInfo allocInfo{}; + vmaCreateBuffer(ctx->getAllocator(), &bci, &aci, + &megaBoneBuffer_[i], &megaBoneAlloc_[i], &allocInfo); + megaBoneMapped_[i] = allocInfo.pMappedData; + + // Slot 0: identity matrix (for non-animated instances) + if (megaBoneMapped_[i]) { + memcpy(megaBoneMapped_[i], &identity, sizeof(identity)); + } + + megaBoneSet_[i] = allocateBoneSet(); + if (megaBoneSet_[i]) { + VkDescriptorBufferInfo bufInfo{}; + bufInfo.buffer = megaBoneBuffer_[i]; + bufInfo.offset = 0; + bufInfo.range = megaSize; + VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET}; + write.dstSet = megaBoneSet_[i]; + write.dstBinding = 0; + write.descriptorCount = 1; + write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + write.pBufferInfo = &bufInfo; + vkUpdateDescriptorSets(device, 1, &write, 0, nullptr); + } + } + } + + // Phase 2.1: Instance data SSBO — per-frame buffer holding per-instance transforms, fade, bones. + // Shader reads instanceData[push.instanceDataOffset + gl_InstanceIndex]. 
+ { + static_assert(sizeof(M2InstanceGPU) == 96, "M2InstanceGPU must be 96 bytes (std430)"); + const VkDeviceSize instBufSize = MAX_INSTANCE_DATA * sizeof(M2InstanceGPU); + + // Descriptor pool for 2 sets (double-buffered) + VkDescriptorPoolSize poolSize{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2}; + VkDescriptorPoolCreateInfo poolCi{VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO}; + poolCi.maxSets = 2; + poolCi.poolSizeCount = 1; + poolCi.pPoolSizes = &poolSize; + vkCreateDescriptorPool(device, &poolCi, nullptr, &instanceDescPool_); + + for (int i = 0; i < 2; i++) { + VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO}; + bci.size = instBufSize; + bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + VmaAllocationCreateInfo aci{}; + aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; + aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + VmaAllocationInfo allocInfo{}; + vmaCreateBuffer(ctx->getAllocator(), &bci, &aci, + &instanceBuffer_[i], &instanceAlloc_[i], &allocInfo); + instanceMapped_[i] = allocInfo.pMappedData; + + VkDescriptorSetAllocateInfo setAi{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO}; + setAi.descriptorPool = instanceDescPool_; + setAi.descriptorSetCount = 1; + setAi.pSetLayouts = &instanceSetLayout_; + vkAllocateDescriptorSets(device, &setAi, &instanceSet_[i]); + + VkDescriptorBufferInfo bufInfo{}; + bufInfo.buffer = instanceBuffer_[i]; + bufInfo.offset = 0; + bufInfo.range = instBufSize; + VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET}; + write.dstSet = instanceSet_[i]; + write.dstBinding = 0; + write.descriptorCount = 1; + write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + write.pBufferInfo = &bufInfo; + vkUpdateDescriptorSets(device, 1, &write, 0, nullptr); + } + } + + // Phase 2.3: GPU frustum culling — compute pipeline, buffers, descriptors. + // Compute shader tests each instance bounding sphere against 6 frustum planes + distance. + // Output: uint visibility[] read back by CPU to skip culled instances in sortedVisible_ build. 
+ { + static_assert(sizeof(CullInstanceGPU) == 32, "CullInstanceGPU must be 32 bytes (std430)"); + static_assert(sizeof(CullUniformsGPU) == 128, "CullUniformsGPU must be 128 bytes (std140)"); + + // Descriptor set layout: binding 0 = UBO (frustum+camera), 1 = SSBO (input), 2 = SSBO (output) + VkDescriptorSetLayoutBinding bindings[3] = {}; + bindings[0].binding = 0; + bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + bindings[0].descriptorCount = 1; + bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + bindings[1].binding = 1; + bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[1].descriptorCount = 1; + bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + bindings[2].binding = 2; + bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[2].descriptorCount = 1; + bindings[2].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + VkDescriptorSetLayoutCreateInfo layoutCi{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO}; + layoutCi.bindingCount = 3; + layoutCi.pBindings = bindings; + vkCreateDescriptorSetLayout(device, &layoutCi, nullptr, &cullSetLayout_); + + // Pipeline layout (no push constants — everything via UBO) + VkPipelineLayoutCreateInfo plCi{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO}; + plCi.setLayoutCount = 1; + plCi.pSetLayouts = &cullSetLayout_; + vkCreatePipelineLayout(device, &plCi, nullptr, &cullPipelineLayout_); + + // Load compute shader + rendering::VkShaderModule cullComp; + if (!cullComp.loadFromFile(device, "assets/shaders/m2_cull.comp.spv")) { + LOG_ERROR("M2Renderer: failed to load m2_cull.comp.spv — GPU culling disabled"); + } else { + VkComputePipelineCreateInfo cpCi{VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO}; + cpCi.stage = cullComp.stageInfo(VK_SHADER_STAGE_COMPUTE_BIT); + cpCi.layout = cullPipelineLayout_; + if (vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, &cpCi, nullptr, &cullPipeline_) != VK_SUCCESS) { + LOG_ERROR("M2Renderer: failed to create cull compute 
pipeline"); + cullPipeline_ = VK_NULL_HANDLE; + } + cullComp.destroy(); + } + + // Descriptor pool: 2 sets × 3 descriptors each (1 UBO + 2 SSBO) + VkDescriptorPoolSize poolSizes[2] = {}; + poolSizes[0] = {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 2}; + poolSizes[1] = {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 4}; // 2 input + 2 output + VkDescriptorPoolCreateInfo poolCi{VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO}; + poolCi.maxSets = 2; + poolCi.poolSizeCount = 2; + poolCi.pPoolSizes = poolSizes; + vkCreateDescriptorPool(device, &poolCi, nullptr, &cullDescPool_); + + const VkDeviceSize uniformSize = sizeof(CullUniformsGPU); + const VkDeviceSize inputSize = MAX_CULL_INSTANCES * sizeof(CullInstanceGPU); + const VkDeviceSize outputSize = MAX_CULL_INSTANCES * sizeof(uint32_t); + + for (int i = 0; i < 2; i++) { + // Uniform buffer (frustum planes + camera) + { + VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO}; + bci.size = uniformSize; + bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; + VmaAllocationCreateInfo aci{}; + aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; + aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + VmaAllocationInfo ai{}; + vmaCreateBuffer(ctx->getAllocator(), &bci, &aci, + &cullUniformBuffer_[i], &cullUniformAlloc_[i], &ai); + cullUniformMapped_[i] = ai.pMappedData; + } + // Input SSBO (per-instance cull data) + { + VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO}; + bci.size = inputSize; + bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + VmaAllocationCreateInfo aci{}; + aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; + aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + VmaAllocationInfo ai{}; + vmaCreateBuffer(ctx->getAllocator(), &bci, &aci, + &cullInputBuffer_[i], &cullInputAlloc_[i], &ai); + cullInputMapped_[i] = ai.pMappedData; + } + // Output SSBO (visibility flags — GPU writes, CPU reads) + { + VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO}; + bci.size = outputSize; + bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + 
VmaAllocationCreateInfo aci{}; + aci.usage = VMA_MEMORY_USAGE_GPU_TO_CPU; + aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + VmaAllocationInfo ai{}; + vmaCreateBuffer(ctx->getAllocator(), &bci, &aci, + &cullOutputBuffer_[i], &cullOutputAlloc_[i], &ai); + cullOutputMapped_[i] = ai.pMappedData; + } + + // Allocate and write descriptor set + VkDescriptorSetAllocateInfo setAi{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO}; + setAi.descriptorPool = cullDescPool_; + setAi.descriptorSetCount = 1; + setAi.pSetLayouts = &cullSetLayout_; + vkAllocateDescriptorSets(device, &setAi, &cullSet_[i]); + + VkDescriptorBufferInfo uboInfo{cullUniformBuffer_[i], 0, uniformSize}; + VkDescriptorBufferInfo inputInfo{cullInputBuffer_[i], 0, inputSize}; + VkDescriptorBufferInfo outputInfo{cullOutputBuffer_[i], 0, outputSize}; + + VkWriteDescriptorSet writes[3] = {}; + writes[0] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET}; + writes[0].dstSet = cullSet_[i]; + writes[0].dstBinding = 0; + writes[0].descriptorCount = 1; + writes[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + writes[0].pBufferInfo = &uboInfo; + + writes[1] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET}; + writes[1].dstSet = cullSet_[i]; + writes[1].dstBinding = 1; + writes[1].descriptorCount = 1; + writes[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + writes[1].pBufferInfo = &inputInfo; + + writes[2] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET}; + writes[2].dstSet = cullSet_[i]; + writes[2].dstBinding = 2; + writes[2].descriptorCount = 1; + writes[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + writes[2].pBufferInfo = &outputInfo; + + vkUpdateDescriptorSets(device, 3, writes, 0, nullptr); + } + } + // --- Pipeline layouts --- - // Main M2 pipeline layout: set 0 = perFrame, set 1 = material, set 2 = bones - // Push constant: mat4 model + vec2 uvOffset + int texCoordSet + int useBones = 80 bytes + // Main M2 pipeline layout: set 0 = perFrame, set 1 = material, set 2 = bones, set 3 = instances + // Push 
constant: int texCoordSet + int isFoliage + int instanceDataOffset (12 bytes) { - VkDescriptorSetLayout setLayouts[] = {perFrameLayout, materialSetLayout_, boneSetLayout_}; + VkDescriptorSetLayout setLayouts[] = {perFrameLayout, materialSetLayout_, boneSetLayout_, instanceSetLayout_}; VkPushConstantRange pushRange{}; pushRange.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; pushRange.offset = 0; - pushRange.size = 88; // mat4(64) + vec2(8) + int(4) + int(4) + int(4) + float(4) + pushRange.size = 12; // int texCoordSet + int isFoliage + int instanceDataOffset VkPipelineLayoutCreateInfo ci{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO}; - ci.setLayoutCount = 3; + ci.setLayoutCount = 4; ci.pSetLayouts = setLayouts; ci.pushConstantRangeCount = 1; ci.pPushConstantRanges = &pushRange; @@ -513,7 +752,9 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout {4, 0, VK_FORMAT_R32G32B32A32_SFLOAT, 14 * sizeof(float)}, // boneIndices (float) }; - auto buildM2Pipeline = [&](VkPipelineColorBlendAttachmentState blendState, bool depthWrite) -> VkPipeline { + // Pipeline derivatives — opaque is the base, others derive from it for shared state optimization + auto buildM2Pipeline = [&](VkPipelineColorBlendAttachmentState blendState, bool depthWrite, + VkPipelineCreateFlags flags = 0, VkPipeline basePipeline = VK_NULL_HANDLE) -> VkPipeline { return PipelineBuilder() .setShaders(m2Vert.stageInfo(VK_SHADER_STAGE_VERTEX_BIT), m2Frag.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT)) @@ -526,13 +767,19 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout .setLayout(pipelineLayout_) .setRenderPass(mainPass) .setDynamicStates({VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR}) + .setFlags(flags) + .setBasePipeline(basePipeline) .build(device, vkCtx_->getPipelineCache()); }; - opaquePipeline_ = buildM2Pipeline(PipelineBuilder::blendDisabled(), true); - alphaTestPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), true); - alphaPipeline_ = 
buildM2Pipeline(PipelineBuilder::blendAlpha(), false); - additivePipeline_ = buildM2Pipeline(PipelineBuilder::blendAdditive(), false); + opaquePipeline_ = buildM2Pipeline(PipelineBuilder::blendDisabled(), true, + VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT); + alphaTestPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), true, + VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_); + alphaPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), false, + VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_); + additivePipeline_ = buildM2Pipeline(PipelineBuilder::blendAdditive(), false, + VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_); // --- Build particle pipelines --- if (particleVert.isValid() && particleFrag.isValid()) { @@ -805,10 +1052,38 @@ void M2Renderer::shutdown() { if (dummyBoneBuffer_) { vmaDestroyBuffer(alloc, dummyBoneBuffer_, dummyBoneAlloc_); dummyBoneBuffer_ = VK_NULL_HANDLE; } // dummyBoneSet_ is freed implicitly when boneDescPool_ is destroyed dummyBoneSet_ = VK_NULL_HANDLE; + // Mega bone SSBO cleanup (sets freed implicitly with boneDescPool_) + for (int i = 0; i < 2; i++) { + if (megaBoneBuffer_[i]) { vmaDestroyBuffer(alloc, megaBoneBuffer_[i], megaBoneAlloc_[i]); megaBoneBuffer_[i] = VK_NULL_HANDLE; } + megaBoneMapped_[i] = nullptr; + megaBoneSet_[i] = VK_NULL_HANDLE; + } if (materialDescPool_) { vkDestroyDescriptorPool(device, materialDescPool_, nullptr); materialDescPool_ = VK_NULL_HANDLE; } if (boneDescPool_) { vkDestroyDescriptorPool(device, boneDescPool_, nullptr); boneDescPool_ = VK_NULL_HANDLE; } + // Phase 2.1: Instance data SSBO cleanup (sets freed with instanceDescPool_) + for (int i = 0; i < 2; i++) { + if (instanceBuffer_[i]) { vmaDestroyBuffer(alloc, instanceBuffer_[i], instanceAlloc_[i]); instanceBuffer_[i] = VK_NULL_HANDLE; } + instanceMapped_[i] = nullptr; + instanceSet_[i] = VK_NULL_HANDLE; + } + if (instanceDescPool_) { vkDestroyDescriptorPool(device, instanceDescPool_, nullptr); instanceDescPool_ = VK_NULL_HANDLE; } + 
+ // Phase 2.3: GPU frustum culling compute pipeline + buffers cleanup + if (cullPipeline_) { vkDestroyPipeline(device, cullPipeline_, nullptr); cullPipeline_ = VK_NULL_HANDLE; } + if (cullPipelineLayout_) { vkDestroyPipelineLayout(device, cullPipelineLayout_, nullptr); cullPipelineLayout_ = VK_NULL_HANDLE; } + for (int i = 0; i < 2; i++) { + if (cullUniformBuffer_[i]) { vmaDestroyBuffer(alloc, cullUniformBuffer_[i], cullUniformAlloc_[i]); cullUniformBuffer_[i] = VK_NULL_HANDLE; } + if (cullInputBuffer_[i]) { vmaDestroyBuffer(alloc, cullInputBuffer_[i], cullInputAlloc_[i]); cullInputBuffer_[i] = VK_NULL_HANDLE; } + if (cullOutputBuffer_[i]) { vmaDestroyBuffer(alloc, cullOutputBuffer_[i], cullOutputAlloc_[i]); cullOutputBuffer_[i] = VK_NULL_HANDLE; } + cullUniformMapped_[i] = cullInputMapped_[i] = cullOutputMapped_[i] = nullptr; + cullSet_[i] = VK_NULL_HANDLE; + } + if (cullDescPool_) { vkDestroyDescriptorPool(device, cullDescPool_, nullptr); cullDescPool_ = VK_NULL_HANDLE; } + if (cullSetLayout_) { vkDestroyDescriptorSetLayout(device, cullSetLayout_, nullptr); cullSetLayout_ = VK_NULL_HANDLE; } + if (materialSetLayout_) { vkDestroyDescriptorSetLayout(device, materialSetLayout_, nullptr); materialSetLayout_ = VK_NULL_HANDLE; } if (boneSetLayout_) { vkDestroyDescriptorSetLayout(device, boneSetLayout_, nullptr); boneSetLayout_ = VK_NULL_HANDLE; } + if (instanceSetLayout_) { vkDestroyDescriptorSetLayout(device, instanceSetLayout_, nullptr); instanceSetLayout_ = VK_NULL_HANDLE; } if (particleTexLayout_) { vkDestroyDescriptorSetLayout(device, particleTexLayout_, nullptr); particleTexLayout_ = VK_NULL_HANDLE; } // Destroy shadow resources @@ -2212,47 +2487,117 @@ void M2Renderer::prepareRender(uint32_t frameIndex, const Camera& camera) { if (!initialized_ || instances.empty()) return; (void)camera; // reserved for future frustum-based culling - // Pre-allocate bone SSBOs + descriptor sets on main thread (pool ops not thread-safe). 
- // Only iterate animated instances — static doodads don't need bone buffers. + // --- Mega bone SSBO: assign slots and upload all animated instance bones --- + // Slot 0 = identity (non-animated), slots 1..N = animated instances. + uint32_t nextSlot = 1; for (size_t idx : animatedInstanceIndices_) { if (idx >= instances.size()) continue; auto& instance = instances[idx]; - if (instance.boneMatrices.empty()) continue; + if (instance.boneMatrices.empty()) { + instance.megaBoneOffset = 0; // Use identity slot + continue; + } - if (!instance.boneBuffer[frameIndex]) { - VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO}; - bci.size = 128 * sizeof(glm::mat4); - bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - VmaAllocationCreateInfo aci{}; - aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; - aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; - VmaAllocationInfo allocInfo{}; - vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, - &instance.boneBuffer[frameIndex], &instance.boneAlloc[frameIndex], &allocInfo); - instance.boneMapped[frameIndex] = allocInfo.pMappedData; + if (nextSlot >= MEGA_BONE_MAX_INSTANCES) { + instance.megaBoneOffset = 0; // Overflow — use identity + continue; + } - // Force dirty so current boneMatrices get copied into this - // newly-allocated buffer during render (prevents garbage/zero - // bones when the other frame index already cleared bonesDirty). 
- instance.bonesDirty[frameIndex] = true; + instance.megaBoneOffset = nextSlot * MAX_BONES_PER_INSTANCE; - instance.boneSet[frameIndex] = allocateBoneSet(); - if (instance.boneSet[frameIndex]) { - VkDescriptorBufferInfo bufInfo{}; - bufInfo.buffer = instance.boneBuffer[frameIndex]; - bufInfo.offset = 0; - bufInfo.range = bci.size; - VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET}; - write.dstSet = instance.boneSet[frameIndex]; - write.dstBinding = 0; - write.descriptorCount = 1; - write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - write.pBufferInfo = &bufInfo; - vkUpdateDescriptorSets(vkCtx_->getDevice(), 1, &write, 0, nullptr); + // Upload bone matrices to mega buffer + if (megaBoneMapped_[frameIndex]) { + int numBones = std::min(static_cast(instance.boneMatrices.size()), + static_cast(MAX_BONES_PER_INSTANCE)); + auto* dst = static_cast(megaBoneMapped_[frameIndex]) + instance.megaBoneOffset; + memcpy(dst, instance.boneMatrices.data(), numBones * sizeof(glm::mat4)); + } + + nextSlot++; + } +} + +// Phase 2.3: Dispatch GPU frustum culling compute shader. +// Called on the primary command buffer BEFORE the render pass begins so that +// compute dispatch and memory barrier complete before secondary command buffers +// read the visibility output in render(). +void M2Renderer::dispatchCullCompute(VkCommandBuffer cmd, uint32_t frameIndex, const Camera& camera) { + if (!cullPipeline_ || instances.empty()) return; + + const uint32_t numInstances = std::min(static_cast(instances.size()), MAX_CULL_INSTANCES); + + // --- Compute per-instance adaptive distances (same formula as old CPU cull) --- + const float targetRenderDist = (instances.size() > 2000) ? 300.0f + : (instances.size() > 1000) ? 500.0f + : 1000.0f; + const float shrinkRate = 0.005f; + const float growRate = 0.05f; + float blendRate = (targetRenderDist < smoothedRenderDist_) ? 
shrinkRate : growRate; + smoothedRenderDist_ = glm::mix(smoothedRenderDist_, targetRenderDist, blendRate); + const float maxRenderDistance = smoothedRenderDist_; + const float maxRenderDistanceSq = maxRenderDistance * maxRenderDistance; + const float maxPossibleDistSq = maxRenderDistanceSq * 4.0f; // 2x safety margin + + // --- Upload frustum planes + camera (UBO, binding 0) --- + const glm::mat4 vp = camera.getProjectionMatrix() * camera.getViewMatrix(); + Frustum frustum; + frustum.extractFromMatrix(vp); + const glm::vec3 camPos = camera.getPosition(); + + if (cullUniformMapped_[frameIndex]) { + auto* ubo = static_cast(cullUniformMapped_[frameIndex]); + for (int i = 0; i < 6; i++) { + const auto& p = frustum.getPlane(static_cast(i)); + ubo->frustumPlanes[i] = glm::vec4(p.normal, p.distance); + } + ubo->cameraPos = glm::vec4(camPos, maxPossibleDistSq); + ubo->instanceCount = numInstances; + } + + // --- Upload per-instance cull data (SSBO, binding 1) --- + if (cullInputMapped_[frameIndex]) { + auto* input = static_cast(cullInputMapped_[frameIndex]); + for (uint32_t i = 0; i < numInstances; i++) { + const auto& inst = instances[i]; + float worldRadius = inst.cachedBoundRadius * inst.scale; + float cullRadius = worldRadius; + if (inst.cachedDisableAnimation) { + cullRadius = std::max(cullRadius, 3.0f); } + float effectiveMaxDistSq = maxRenderDistanceSq * std::max(1.0f, cullRadius / 12.0f); + if (inst.cachedDisableAnimation) effectiveMaxDistSq *= 2.6f; + if (inst.cachedIsGroundDetail) effectiveMaxDistSq *= 0.9f; + + float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f); + + uint32_t flags = 0; + if (inst.cachedIsValid) flags |= 1u; + if (inst.cachedIsSmoke) flags |= 2u; + if (inst.cachedIsInvisibleTrap) flags |= 4u; + + input[i].sphere = glm::vec4(inst.position, paddedRadius); + input[i].effectiveMaxDistSq = effectiveMaxDistSq; + input[i].flags = flags; } } + + // --- Dispatch compute shader --- + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, 
cullPipeline_); + vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, + cullPipelineLayout_, 0, 1, &cullSet_[frameIndex], 0, nullptr); + + const uint32_t groupCount = (numInstances + 63) / 64; + vkCmdDispatch(cmd, groupCount, 1, 1); + + // --- Memory barrier: compute writes → host reads --- + VkMemoryBarrier barrier{VK_STRUCTURE_TYPE_MEMORY_BARRIER}; + barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT; + vkCmdPipelineBarrier(cmd, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_HOST_BIT, + 0, 1, &barrier, 0, nullptr, 0, nullptr); } void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera) { @@ -2267,71 +2612,86 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const LOG_INFO("M2 render: ", instances.size(), " instances, ", models.size(), " models"); } - // Build frustum for culling - const glm::mat4 view = camera.getViewMatrix(); - const glm::mat4 projection = camera.getProjectionMatrix(); - Frustum frustum; - frustum.extractFromMatrix(projection * view); - // Reuse persistent buffers (clear instead of reallocating) glowSprites_.clear(); lastDrawCallCount = 0; - // Adaptive render distance: smoothed to prevent pop-in/pop-out flickering - const float targetRenderDist = (instances.size() > 2000) ? 300.0f - : (instances.size() > 1000) ? 500.0f - : 1000.0f; - // Smooth transitions: shrink slowly (avoid popping out nearby objects) - const float shrinkRate = 0.005f; // very slow decrease - const float growRate = 0.05f; // faster increase - float blendRate = (targetRenderDist < smoothedRenderDist_) ? shrinkRate : growRate; - smoothedRenderDist_ = glm::mix(smoothedRenderDist_, targetRenderDist, blendRate); - const float maxRenderDistance = smoothedRenderDist_; - const float maxRenderDistanceSq = maxRenderDistance * maxRenderDistance; + // Phase 2.3: GPU cull results — dispatchCullCompute() already updated smoothedRenderDist_. 
+ // Use the cached value (set by dispatchCullCompute or fallback below). + const uint32_t frameIndex = vkCtx_->getCurrentFrame(); + const uint32_t numInstances = std::min(static_cast(instances.size()), MAX_CULL_INSTANCES); + const uint32_t* visibility = static_cast(cullOutputMapped_[frameIndex]); + const bool gpuCullAvailable = (cullPipeline_ != VK_NULL_HANDLE && visibility != nullptr); + + // If GPU culling was not dispatched, fallback: compute distances on CPU + float maxRenderDistanceSq; + if (!gpuCullAvailable) { + const float targetRenderDist = (instances.size() > 2000) ? 300.0f + : (instances.size() > 1000) ? 500.0f + : 1000.0f; + const float shrinkRate = 0.005f; + const float growRate = 0.05f; + float blendRate = (targetRenderDist < smoothedRenderDist_) ? shrinkRate : growRate; + smoothedRenderDist_ = glm::mix(smoothedRenderDist_, targetRenderDist, blendRate); + maxRenderDistanceSq = smoothedRenderDist_ * smoothedRenderDist_; + } else { + maxRenderDistanceSq = smoothedRenderDist_ * smoothedRenderDist_; + } + const float fadeStartFraction = 0.75f; const glm::vec3 camPos = camera.getPosition(); - // Build sorted visible instance list: cull then sort by modelId to batch VAO binds - // Reuse persistent vector to avoid allocation + // Build sorted visible instance list sortedVisible_.clear(); - // Reserve based on expected visible count (roughly 30% of total instances in dense areas) const size_t expectedVisible = std::min(instances.size() / 3, size_t(600)); if (sortedVisible_.capacity() < expectedVisible) { sortedVisible_.reserve(expectedVisible); } - // Early distance rejection: max possible render distance (tight but safe upper bound) - const float maxPossibleDistSq = maxRenderDistance * maxRenderDistance * 4.0f; // 2x safety margin (reduced from 4x) + // Phase 2.3: GPU frustum culling — build frustum only for CPU fallback path + Frustum frustum; + if (!gpuCullAvailable) { + const glm::mat4 vp = camera.getProjectionMatrix() * camera.getViewMatrix(); + 
frustum.extractFromMatrix(vp); + } + const float maxPossibleDistSq = maxRenderDistanceSq * 4.0f; - for (uint32_t i = 0; i < static_cast(instances.size()); ++i) { + for (uint32_t i = 0; i < numInstances; ++i) { const auto& instance = instances[i]; - // Use cached model flags — no hash lookup needed - if (!instance.cachedIsValid || instance.cachedIsSmoke || instance.cachedIsInvisibleTrap) continue; + if (gpuCullAvailable) { + // Phase 2.3: GPU already tested flags + distance + frustum + if (!visibility[i]) continue; + } else { + // CPU fallback: same culling logic as before Phase 2.3 + if (!instance.cachedIsValid || instance.cachedIsSmoke || instance.cachedIsInvisibleTrap) continue; + glm::vec3 toCam = instance.position - camPos; + float distSqTest = glm::dot(toCam, toCam); + if (distSqTest > maxPossibleDistSq) continue; + + float worldRadius = instance.cachedBoundRadius * instance.scale; + float cullRadius = worldRadius; + if (instance.cachedDisableAnimation) cullRadius = std::max(cullRadius, 3.0f); + float effDistSq = maxRenderDistanceSq * std::max(1.0f, cullRadius / 12.0f); + if (instance.cachedDisableAnimation) effDistSq *= 2.6f; + if (instance.cachedIsGroundDetail) effDistSq *= 0.9f; + if (distSqTest > effDistSq) continue; + + float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f); + if (cullRadius > 0.0f && !frustum.intersectsSphere(instance.position, paddedRadius)) continue; + } + + // Compute distSq + effectiveMaxDistSq for sorting and fade alpha (cheap for visible-only) glm::vec3 toCam = instance.position - camPos; float distSq = glm::dot(toCam, toCam); - if (distSq > maxPossibleDistSq) continue; - float worldRadius = instance.cachedBoundRadius * instance.scale; float cullRadius = worldRadius; - if (instance.cachedDisableAnimation) { - cullRadius = std::max(cullRadius, 3.0f); - } + if (instance.cachedDisableAnimation) cullRadius = std::max(cullRadius, 3.0f); float effectiveMaxDistSq = maxRenderDistanceSq * std::max(1.0f, cullRadius / 12.0f); - 
if (instance.cachedDisableAnimation) { - effectiveMaxDistSq *= 2.6f; - } - if (instance.cachedIsGroundDetail) { - effectiveMaxDistSq *= 0.75f; - } - - if (distSq > effectiveMaxDistSq) continue; - - // Frustum cull with padding - float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f); - if (cullRadius > 0.0f && !frustum.intersectsSphere(instance.position, paddedRadius)) continue; + if (instance.cachedDisableAnimation) effectiveMaxDistSq *= 2.6f; + if (instance.cachedIsGroundDetail) effectiveMaxDistSq *= 0.9f; sortedVisible_.push_back({i, instance.modelId, distSq, effectiveMaxDistSq}); } @@ -2351,17 +2711,12 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const // State tracking VkPipeline currentPipeline = VK_NULL_HANDLE; VkDescriptorSet currentMaterialSet = VK_NULL_HANDLE; - VkDescriptorSet currentBoneSet = VK_NULL_HANDLE; - uint32_t frameIndex = vkCtx_->getCurrentFrame(); - // Push constants struct matching m2.vert.glsl push_constant block + // Phase 2.1: Push constants now carry per-batch data only; per-instance data is in instance SSBO. struct M2PushConstants { - glm::mat4 model; - glm::vec2 uvOffset; - int texCoordSet; - int useBones; - int isFoliage; - float fadeAlpha; + int32_t texCoordSet; // UV set index (0 or 1) + int32_t isFoliage; // Foliage wind animation flag + int32_t instanceDataOffset; // Base index into instance SSBO for this draw group }; // Validate per-frame descriptor set before any Vulkan commands @@ -2377,311 +2732,338 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const // Start with opaque pipeline vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, opaquePipeline_); currentPipeline = opaquePipeline_; - bool opaquePass = true; // Pass 1 = opaque, pass 2 = transparent (set below for second pass) // Bind dummy bone set (set 2) so non-animated draws have a valid binding. - // Animated instances override this with their real bone set per-instance. 
- if (dummyBoneSet_) { + // Phase 2.4: Bind mega bone SSBO instead — all instances index into one buffer via boneBase. + if (megaBoneSet_[frameIndex]) { + vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, + pipelineLayout_, 2, 1, &megaBoneSet_[frameIndex], 0, nullptr); + } else if (dummyBoneSet_) { vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout_, 2, 1, &dummyBoneSet_, 0, nullptr); } - for (const auto& entry : sortedVisible_) { - if (entry.index >= instances.size()) continue; - auto& instance = instances[entry.index]; + // Phase 2.1: Bind instance data SSBO (set 3) — per-instance transforms, fade, bones + if (instanceSet_[frameIndex]) { + vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, + pipelineLayout_, 3, 1, &instanceSet_[frameIndex], 0, nullptr); + } - // Bind vertex + index buffers once per model group - if (entry.modelId != currentModelId) { - currentModelId = entry.modelId; - currentModelValid = false; - auto mdlIt = models.find(currentModelId); - if (mdlIt == models.end()) continue; - currentModel = &mdlIt->second; - if (!currentModel->vertexBuffer || !currentModel->indexBuffer) continue; - currentModelValid = true; - VkDeviceSize offset = 0; - vkCmdBindVertexBuffers(cmd, 0, 1, ¤tModel->vertexBuffer, &offset); - vkCmdBindIndexBuffer(cmd, currentModel->indexBuffer, 0, VK_INDEX_TYPE_UINT16); - } - if (!currentModelValid) continue; + // Phase 2.1: Reset instance SSBO write cursor for this frame + instanceDataCount_ = 0; + auto* instSSBO = static_cast(instanceMapped_[frameIndex]); - const M2ModelGPU& model = *currentModel; + // ===================================================================== + // Phase 2.1: Opaque pass — instanced draws grouped by (modelId, LOD) + // ===================================================================== + // sortedVisible_ is already sorted by modelId so consecutive entries share + // the same vertex/index buffer. 
Within each model group we sub-group by + // targetLOD to guarantee all instances in one vkCmdDrawIndexed use the + // same batch set. Per-instance data (model matrix, fade, bones) is + // written to the instance SSBO; the shader reads it via gl_InstanceIndex. + { + struct PendingInstance { + uint32_t instanceIdx; + float fadeAlpha; + bool useBones; + uint16_t targetLOD; + }; + std::vector pending; + pending.reserve(128); - // Distance-based fade alpha for smooth pop-in (squared-distance, no sqrt) - float fadeAlpha = 1.0f; - float fadeFrac = model.disableAnimation ? 0.55f : fadeStartFraction; - float fadeStartDistSq = entry.effectiveMaxDistSq * fadeFrac * fadeFrac; - if (entry.distSq > fadeStartDistSq) { - fadeAlpha = std::clamp((entry.effectiveMaxDistSq - entry.distSq) / - (entry.effectiveMaxDistSq - fadeStartDistSq), 0.0f, 1.0f); - } + size_t visStart = 0; + while (visStart < sortedVisible_.size()) { + // Find group of consecutive entries with same modelId + uint32_t groupModelId = sortedVisible_[visStart].modelId; + size_t groupEnd = visStart; + while (groupEnd < sortedVisible_.size() && sortedVisible_[groupEnd].modelId == groupModelId) + groupEnd++; - float instanceFadeAlpha = fadeAlpha; - if (model.isGroundDetail) { - instanceFadeAlpha *= 0.82f; - } - if (model.isInstancePortal) { - // Render mesh at low alpha + emit glow sprite at center - instanceFadeAlpha *= 0.12f; - if (entry.distSq < 400.0f * 400.0f) { - glm::vec3 center = glm::vec3(instance.modelMatrix * glm::vec4(0.0f, 0.0f, 0.0f, 1.0f)); - GlowSprite gs; - gs.worldPos = center; - gs.color = glm::vec4(0.35f, 0.5f, 1.0f, 1.1f); - gs.size = instance.scale * 5.0f; - glowSprites_.push_back(gs); - GlowSprite halo = gs; - halo.color.a *= 0.3f; - halo.size *= 2.2f; - glowSprites_.push_back(halo); - } - } - - // Upload bone matrices to SSBO if model has skeletal animation. 
- // Skip animated instances entirely until bones are computed + buffers allocated - // to prevent bind-pose/T-pose flash on first appearance. - bool modelNeedsAnimation = model.hasAnimation && !model.disableAnimation; - if (modelNeedsAnimation && instance.boneMatrices.empty()) { - continue; // Bones not yet computed — skip to avoid bind-pose flash - } - bool needsBones = modelNeedsAnimation && !instance.boneMatrices.empty(); - if (needsBones && (!instance.boneBuffer[frameIndex] || !instance.boneSet[frameIndex])) { - continue; // Bone buffers not yet allocated — skip to avoid bind-pose flash - } - bool useBones = needsBones; - if (useBones) { - // Upload bone matrices only when recomputed (per-frame-index tracking - // ensures both double-buffered SSBOs get the latest bone data) - if (instance.bonesDirty[frameIndex] && instance.boneMapped[frameIndex]) { - int numBones = std::min(static_cast(instance.boneMatrices.size()), 128); - memcpy(instance.boneMapped[frameIndex], instance.boneMatrices.data(), - numBones * sizeof(glm::mat4)); - instance.bonesDirty[frameIndex] = false; - } - - // Bind bone descriptor set (set 2) — skip if already bound - if (instance.boneSet[frameIndex] && instance.boneSet[frameIndex] != currentBoneSet) { - vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, - pipelineLayout_, 2, 1, &instance.boneSet[frameIndex], 0, nullptr); - currentBoneSet = instance.boneSet[frameIndex]; - } - } - - // LOD selection based on squared distance (avoid sqrt) - uint16_t desiredLOD = 0; - if (entry.distSq > 150.0f * 150.0f) desiredLOD = 3; - else if (entry.distSq > 80.0f * 80.0f) desiredLOD = 2; - else if (entry.distSq > 40.0f * 40.0f) desiredLOD = 1; - - uint16_t targetLOD = desiredLOD; - if (desiredLOD > 0 && !(model.availableLODs & (1u << desiredLOD))) { - targetLOD = 0; - } - - const bool foliageLikeModel = model.isFoliageLike; - // Particle-dominant spell effects: mesh is emission geometry, render dim - const bool particleDominantEffect = 
model.isSpellEffect && - !model.particleEmitters.empty() && model.batches.size() <= 2; - for (const auto& batch : model.batches) { - if (batch.indexCount == 0) continue; - if (!model.isGroundDetail && batch.submeshLevel != targetLOD) continue; - if (batch.batchOpacity < 0.01f) continue; - - // Two-pass gate: pass 1 = opaque/cutout only, pass 2 = transparent/additive only. - // Alpha-test (blendMode==1) and spell effects that force-additive are handled - // by their effective blend mode below; gate on raw blendMode here. - { - const bool rawTransparent = (batch.blendMode >= 2) || model.isSpellEffect; - if (opaquePass && rawTransparent) continue; // skip transparent in opaque pass - if (!opaquePass && !rawTransparent) continue; // skip opaque in transparent pass - } - - const bool koboldFlameCard = batch.colorKeyBlack && model.isKoboldFlame; - const bool smallCardLikeBatch = - (batch.glowSize <= 1.35f) || - (batch.lanternGlowHint && batch.glowSize <= 6.0f); - const bool batchUnlit = (batch.materialFlags & 0x01) != 0; - const bool elvenLikeModel = model.isElvenLike; - const bool lanternLikeModel = model.isLanternLike; - const bool shouldUseGlowSprite = - !koboldFlameCard && - (elvenLikeModel || (lanternLikeModel && batch.lanternGlowHint)) && - !model.isSpellEffect && - smallCardLikeBatch && - (batch.lanternGlowHint || - (batch.blendMode >= 3) || - (batch.colorKeyBlack && batchUnlit && batch.blendMode >= 1)); - if (shouldUseGlowSprite) { - if (entry.distSq < 180.0f * 180.0f) { - glm::vec3 worldPos = glm::vec3(instance.modelMatrix * glm::vec4(batch.center, 1.0f)); - GlowSprite gs; - gs.worldPos = worldPos; - if (batch.glowTint == 1 || elvenLikeModel) { - gs.color = glm::vec4(0.48f, 0.72f, 1.0f, 1.05f); - } else if (batch.glowTint == 2) { - gs.color = glm::vec4(1.0f, 0.28f, 0.22f, 1.10f); - } else { - gs.color = glm::vec4(1.0f, 0.82f, 0.46f, 1.15f); - } - gs.size = batch.glowSize * instance.scale * 1.45f; - glowSprites_.push_back(gs); - GlowSprite halo = gs; - 
halo.color.a *= 0.42f; - halo.size *= 1.8f; - glowSprites_.push_back(halo); - } - const bool cardLikeSkipMesh = - (batch.blendMode >= 3) || - batch.colorKeyBlack || - ((batch.materialFlags & 0x01) != 0); - const bool lanternGlowCardSkip = - lanternLikeModel && - batch.lanternGlowHint && - smallCardLikeBatch && - cardLikeSkipMesh; - if (lanternGlowCardSkip || (cardLikeSkipMesh && !lanternLikeModel)) { - continue; - } - } - - // Compute UV offset for texture animation - glm::vec2 uvOffset(0.0f, 0.0f); - if (batch.textureAnimIndex != 0xFFFF && model.hasTextureAnimation) { - uint16_t lookupIdx = batch.textureAnimIndex; - if (lookupIdx < model.textureTransformLookup.size()) { - uint16_t transformIdx = model.textureTransformLookup[lookupIdx]; - if (transformIdx < model.textureTransforms.size()) { - const auto& tt = model.textureTransforms[transformIdx]; - glm::vec3 trans = interpVec3(tt.translation, - instance.currentSequenceIndex, instance.animTime, - glm::vec3(0.0f), model.globalSequenceDurations); - uvOffset = glm::vec2(trans.x, trans.y); - } - } - } - // Lava M2 models: fallback UV scroll if no texture animation. - // Uses kLavaAnimStart (file-scope) for consistent timing across passes. - if (model.isLavaModel && uvOffset == glm::vec2(0.0f)) { - float t = std::chrono::duration(std::chrono::steady_clock::now() - kLavaAnimStart).count(); - uvOffset = glm::vec2(t * 0.03f, -t * 0.08f); - } - - // Foliage/card-like batches render more stably as cutout (depth-write on) - // instead of alpha-blended sorting. 
- const bool foliageCutout = - foliageLikeModel && - !model.isSpellEffect && - batch.blendMode <= 3; - const bool forceCutout = - !model.isSpellEffect && - (model.isGroundDetail || - foliageCutout || - batch.blendMode == 1 || - (batch.blendMode >= 2 && !batch.hasAlpha) || - batch.colorKeyBlack); - - // Select pipeline based on blend mode - uint8_t effectiveBlendMode = batch.blendMode; - if (model.isSpellEffect) { - // Effect models: force additive blend for opaque/cutout batches - // so the mesh renders as a transparent glow, not a solid object - if (effectiveBlendMode <= 1) { - effectiveBlendMode = 3; // additive - } else if (effectiveBlendMode == 4 || effectiveBlendMode == 5) { - effectiveBlendMode = 3; - } - } - if (forceCutout) { - effectiveBlendMode = 1; - } - - VkPipeline desiredPipeline; - if (forceCutout) { - // Use opaque pipeline + shader discard for stable foliage cards. - desiredPipeline = opaquePipeline_; - } else { - switch (effectiveBlendMode) { - case 0: desiredPipeline = opaquePipeline_; break; - case 1: desiredPipeline = alphaTestPipeline_; break; - case 2: desiredPipeline = alphaPipeline_; break; - default: desiredPipeline = additivePipeline_; break; - } - } - if (desiredPipeline != currentPipeline) { - vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, desiredPipeline); - currentPipeline = desiredPipeline; - } - - // Update material UBO with per-draw dynamic values (interiorDarken, forceCutout overrides) - // Note: fadeAlpha is in push constants (per-draw) to avoid shared-UBO race - if (batch.materialUBOMapped) { - auto* mat = static_cast(batch.materialUBOMapped); - mat->interiorDarken = insideInterior ? 1.0f : 0.0f; - if (batch.colorKeyBlack) { - mat->colorKeyThreshold = (effectiveBlendMode == 4 || effectiveBlendMode == 5) ? 0.7f : 0.08f; - } - if (forceCutout) { - mat->alphaTest = model.isGroundDetail ? 3 : (foliageCutout ? 
2 : 1); - if (model.isGroundDetail) { - mat->unlit = 0; - } - } - } - - // Bind material descriptor set (set 1) — skip batch if missing - // to avoid inheriting a stale descriptor set from a prior renderer - if (!batch.materialSet) continue; - if (batch.materialSet != currentMaterialSet) { - vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, - pipelineLayout_, 1, 1, &batch.materialSet, 0, nullptr); - currentMaterialSet = batch.materialSet; - } - - // Push constants - M2PushConstants pc; - pc.model = instance.modelMatrix; - pc.uvOffset = uvOffset; - pc.texCoordSet = static_cast(batch.textureUnit); - pc.useBones = useBones ? 1 : 0; - pc.isFoliage = model.shadowWindFoliage ? 1 : 0; - pc.fadeAlpha = instanceFadeAlpha; - // Particle-dominant effects: mesh is emission geometry, don't render - if (particleDominantEffect && batch.blendMode <= 1) { + auto mdlIt = models.find(groupModelId); + if (mdlIt == models.end() || !mdlIt->second.vertexBuffer || !mdlIt->second.indexBuffer) { + visStart = groupEnd; continue; } - vkCmdPushConstants(cmd, pipelineLayout_, VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(pc), &pc); - vkCmdDrawIndexed(cmd, batch.indexCount, 1, batch.indexStart, 0, 0); - lastDrawCallCount++; + const M2ModelGPU& model = mdlIt->second; + + bool modelNeedsAnimation = model.hasAnimation && !model.disableAnimation; + const bool foliageLikeModel = model.isFoliageLike; + const bool particleDominantEffect = model.isSpellEffect && + !model.particleEmitters.empty() && model.batches.size() <= 2; + + // Collect per-instance data for this model group + pending.clear(); + for (size_t vi = visStart; vi < groupEnd; vi++) { + const auto& entry = sortedVisible_[vi]; + if (entry.index >= instances.size()) continue; + auto& instance = instances[entry.index]; + + // Distance-based fade alpha + float fadeFrac = model.disableAnimation ? 
0.55f : fadeStartFraction; + float fadeStartDistSq = entry.effectiveMaxDistSq * fadeFrac * fadeFrac; + float fadeAlpha = 1.0f; + if (entry.distSq > fadeStartDistSq) { + fadeAlpha = std::clamp((entry.effectiveMaxDistSq - entry.distSq) / + (entry.effectiveMaxDistSq - fadeStartDistSq), 0.0f, 1.0f); + } + float instanceFadeAlpha = fadeAlpha; + if (model.isGroundDetail) instanceFadeAlpha *= 0.82f; + if (model.isInstancePortal) { + instanceFadeAlpha *= 0.12f; + if (entry.distSq < 400.0f * 400.0f) { + glm::vec3 center = glm::vec3(instance.modelMatrix * glm::vec4(0.0f, 0.0f, 0.0f, 1.0f)); + GlowSprite gs; + gs.worldPos = center; + gs.color = glm::vec4(0.35f, 0.5f, 1.0f, 1.1f); + gs.size = instance.scale * 5.0f; + glowSprites_.push_back(gs); + GlowSprite halo = gs; + halo.color.a *= 0.3f; + halo.size *= 2.2f; + glowSprites_.push_back(halo); + } + } + + // Bone readiness check + if (modelNeedsAnimation && instance.boneMatrices.empty()) continue; + bool needsBones = modelNeedsAnimation && !instance.boneMatrices.empty(); + if (needsBones && instance.megaBoneOffset == 0) continue; + + // LOD selection + uint16_t desiredLOD = 0; + if (entry.distSq > 150.0f * 150.0f) desiredLOD = 3; + else if (entry.distSq > 80.0f * 80.0f) desiredLOD = 2; + else if (entry.distSq > 40.0f * 40.0f) desiredLOD = 1; + uint16_t targetLOD = desiredLOD; + if (desiredLOD > 0 && !(model.availableLODs & (1u << desiredLOD))) targetLOD = 0; + + pending.push_back({entry.index, instanceFadeAlpha, needsBones, targetLOD}); + } + + if (pending.empty()) { visStart = groupEnd; continue; } + + // Sort by targetLOD so each sub-group occupies a contiguous SSBO range + std::sort(pending.begin(), pending.end(), + [](const PendingInstance& a, const PendingInstance& b) { return a.targetLOD < b.targetLOD; }); + + // Bind vertex/index buffers once per model group + VkDeviceSize vbOffset = 0; + vkCmdBindVertexBuffers(cmd, 0, 1, &model.vertexBuffer, &vbOffset); + vkCmdBindIndexBuffer(cmd, model.indexBuffer, 0, 
VK_INDEX_TYPE_UINT16);
+
+            // Write base instance data to SSBO (uvOffset=0 — overridden for tex-anim batches)
+            uint32_t baseSSBOOffset = instanceDataCount_;
+            for (const auto& p : pending) {
+                if (instanceDataCount_ >= MAX_INSTANCE_DATA) break;
+                auto& inst = instances[p.instanceIdx];
+                auto& e = instSSBO[instanceDataCount_];
+                e.model = inst.modelMatrix;
+                e.uvOffset = glm::vec2(0.0f);
+                e.fadeAlpha = p.fadeAlpha;
+                e.useBones = p.useBones ? 1 : 0;
+                e.boneBase = p.useBones ? static_cast<int32_t>(inst.megaBoneOffset) : 0;
+                std::memset(e._pad, 0, sizeof(e._pad));
+                instanceDataCount_++;
+            }
+
+            // The SSBO may fill up mid-group (the loop above breaks early). Drop the
+            // instances that were not written so the LOD sub-groups below never draw
+            // or copy entries past the range that was actually written.
+            const size_t writtenCount = instanceDataCount_ - baseSSBOOffset;
+            while (pending.size() > writtenCount) pending.pop_back();
+            if (pending.empty()) { visStart = groupEnd; continue; }
+
+            // Process LOD sub-groups within this model group
+            size_t lodIdx = 0;
+            while (lodIdx < pending.size()) {
+                uint16_t lod = pending[lodIdx].targetLOD;
+                size_t lodEnd = lodIdx + 1;
+                while (lodEnd < pending.size() && pending[lodEnd].targetLOD == lod) lodEnd++;
+                uint32_t groupSize = static_cast<uint32_t>(lodEnd - lodIdx);
+                uint32_t groupSSBOOffset = baseSSBOOffset + static_cast<uint32_t>(lodIdx);
+
+                for (size_t bi = 0; bi < model.batches.size(); bi++) {
+                    const auto& batch = model.batches[bi];
+                    if (batch.indexCount == 0) continue;
+                    if (!model.isGroundDetail && batch.submeshLevel != lod) continue;
+                    if (batch.batchOpacity < 0.01f) continue;
+
+                    // Opaque gate — skip transparent batches
+                    const bool rawTransparent = (batch.blendMode >= 2) || model.isSpellEffect;
+                    if (rawTransparent) continue;
+
+                    // Particle-dominant effects: emission geometry — skip opaque
+                    if (particleDominantEffect && batch.blendMode <= 1) continue;
+
+                    // Glow sprite check (per model+batch, sprites generated per instance)
+                    const bool koboldFlameCard = batch.colorKeyBlack && model.isKoboldFlame;
+                    const bool smallCardLikeBatch =
+                        (batch.glowSize <= 1.35f) ||
+                        (batch.lanternGlowHint && batch.glowSize <= 6.0f);
+                    const bool batchUnlit = (batch.materialFlags & 0x01) != 0;
+                    const bool shouldUseGlowSprite =
+                        !koboldFlameCard &&
+                        (model.isElvenLike || (model.isLanternLike && batch.lanternGlowHint)) &&
+                        !model.isSpellEffect &&
+ smallCardLikeBatch && + (batch.lanternGlowHint || + (batch.blendMode >= 3) || + (batch.colorKeyBlack && batchUnlit && batch.blendMode >= 1)); + if (shouldUseGlowSprite) { + // Generate glow sprites for each instance in the group + for (size_t j = lodIdx; j < lodEnd; j++) { + auto& inst = instances[pending[j].instanceIdx]; + float distSq = sortedVisible_[visStart].distSq; // approximate with group + if (distSq < 180.0f * 180.0f) { + glm::vec3 worldPos = glm::vec3(inst.modelMatrix * glm::vec4(batch.center, 1.0f)); + GlowSprite gs; + gs.worldPos = worldPos; + if (batch.glowTint == 1 || model.isElvenLike) + gs.color = glm::vec4(0.48f, 0.72f, 1.0f, 1.05f); + else if (batch.glowTint == 2) + gs.color = glm::vec4(1.0f, 0.28f, 0.22f, 1.10f); + else + gs.color = glm::vec4(1.0f, 0.82f, 0.46f, 1.15f); + gs.size = batch.glowSize * inst.scale * 1.45f; + glowSprites_.push_back(gs); + GlowSprite halo = gs; + halo.color.a *= 0.42f; + halo.size *= 1.8f; + glowSprites_.push_back(halo); + } + } + const bool cardLikeSkipMesh = + (batch.blendMode >= 3) || batch.colorKeyBlack || batchUnlit; + const bool lanternGlowCardSkip = + model.isLanternLike && batch.lanternGlowHint && + smallCardLikeBatch && cardLikeSkipMesh; + if (lanternGlowCardSkip || (cardLikeSkipMesh && !model.isLanternLike)) + continue; + } + + // Handle texture animation: if this batch has per-instance uvOffset, + // write a separate SSBO range with the correct offsets. 
+ bool hasBatchTexAnim = (batch.textureAnimIndex != 0xFFFF && model.hasTextureAnimation) + || model.isLavaModel; + uint32_t drawOffset = groupSSBOOffset; + if (hasBatchTexAnim && instanceDataCount_ + groupSize <= MAX_INSTANCE_DATA) { + drawOffset = instanceDataCount_; + for (size_t j = lodIdx; j < lodEnd; j++) { + auto& inst = instances[pending[j].instanceIdx]; + glm::vec2 uvOffset(0.0f); + if (batch.textureAnimIndex != 0xFFFF && model.hasTextureAnimation) { + uint16_t lookupIdx = batch.textureAnimIndex; + if (lookupIdx < model.textureTransformLookup.size()) { + uint16_t transformIdx = model.textureTransformLookup[lookupIdx]; + if (transformIdx < model.textureTransforms.size()) { + const auto& tt = model.textureTransforms[transformIdx]; + glm::vec3 trans = interpVec3(tt.translation, + inst.currentSequenceIndex, inst.animTime, + glm::vec3(0.0f), model.globalSequenceDurations); + uvOffset = glm::vec2(trans.x, trans.y); + } + } + } + if (model.isLavaModel && uvOffset == glm::vec2(0.0f)) { + float t = std::chrono::duration( + std::chrono::steady_clock::now() - kLavaAnimStart).count(); + uvOffset = glm::vec2(t * 0.03f, -t * 0.08f); + } + // Copy base entry and override uvOffset + instSSBO[instanceDataCount_] = instSSBO[groupSSBOOffset + (j - lodIdx)]; + instSSBO[instanceDataCount_].uvOffset = uvOffset; + instanceDataCount_++; + } + } + + // Pipeline selection (per-model/batch, not per-instance) + const bool foliageCutout = foliageLikeModel && !model.isSpellEffect && batch.blendMode <= 3; + const bool forceCutout = + !model.isSpellEffect && + (model.isGroundDetail || foliageCutout || + batch.blendMode == 1 || + (batch.blendMode >= 2 && !batch.hasAlpha) || + batch.colorKeyBlack); + + uint8_t effectiveBlendMode = batch.blendMode; + if (model.isSpellEffect) { + if (effectiveBlendMode <= 1) effectiveBlendMode = 3; + else if (effectiveBlendMode == 4 || effectiveBlendMode == 5) effectiveBlendMode = 3; + } + if (forceCutout) effectiveBlendMode = 1; + + VkPipeline 
desiredPipeline; + if (forceCutout) { + desiredPipeline = opaquePipeline_; + } else { + switch (effectiveBlendMode) { + case 0: desiredPipeline = opaquePipeline_; break; + case 1: desiredPipeline = alphaTestPipeline_; break; + case 2: desiredPipeline = alphaPipeline_; break; + default: desiredPipeline = additivePipeline_; break; + } + } + if (desiredPipeline != currentPipeline) { + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, desiredPipeline); + currentPipeline = desiredPipeline; + } + + // Update material UBO + if (batch.materialUBOMapped) { + auto* mat = static_cast(batch.materialUBOMapped); + mat->interiorDarken = insideInterior ? 1.0f : 0.0f; + if (batch.colorKeyBlack) + mat->colorKeyThreshold = (effectiveBlendMode == 4 || effectiveBlendMode == 5) ? 0.7f : 0.08f; + if (forceCutout) { + mat->alphaTest = model.isGroundDetail ? 3 : (foliageCutout ? 2 : 1); + if (model.isGroundDetail) mat->unlit = 0; + } + } + + // Bind material descriptor set (set 1) + if (!batch.materialSet) continue; + if (batch.materialSet != currentMaterialSet) { + vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, + pipelineLayout_, 1, 1, &batch.materialSet, 0, nullptr); + currentMaterialSet = batch.materialSet; + } + + // Push constants + instanced draw + M2PushConstants pc; + pc.texCoordSet = static_cast(batch.textureUnit); + pc.isFoliage = model.shadowWindFoliage ? 1 : 0; + pc.instanceDataOffset = static_cast(drawOffset); + vkCmdPushConstants(cmd, pipelineLayout_, VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(pc), &pc); + vkCmdDrawIndexed(cmd, batch.indexCount, groupSize, batch.indexStart, 0, 0); + lastDrawCallCount++; + } + + lodIdx = lodEnd; + } + + visStart = groupEnd; } } - // Pass 2: transparent/additive batches — sort back-to-front by distance so - // overlapping transparent geometry composites in the correct painter's order. 
- opaquePass = false; + // ===================================================================== + // Pass 2: Transparent/additive batches — back-to-front per instance + // ===================================================================== + // Transparent geometry must be drawn individually per instance in back-to- + // front order for correct alpha compositing. Each draw writes one + // M2InstanceGPU entry and issues a single-instance indexed draw. std::sort(sortedVisible_.begin(), sortedVisible_.end(), [](const VisibleEntry& a, const VisibleEntry& b) { return a.distSq > b.distSq; }); currentModelId = UINT32_MAX; currentModel = nullptr; currentModelValid = false; - // Reset state so the first transparent bind always sets explicitly currentPipeline = opaquePipeline_; currentMaterialSet = VK_NULL_HANDLE; - currentBoneSet = VK_NULL_HANDLE; for (const auto& entry : sortedVisible_) { if (entry.index >= instances.size()) continue; auto& instance = instances[entry.index]; - // Quick skip: if model has no transparent batches at all, skip it entirely + // Quick skip: if model has no transparent batches at all if (entry.modelId != currentModelId) { auto mdlIt = models.find(entry.modelId); if (mdlIt == models.end()) continue; if (!mdlIt->second.hasTransparentBatches && !mdlIt->second.isSpellEffect) continue; } - // Reuse the same rendering logic as pass 1 (via fallthrough — the batch gate - // `!opaquePass && !rawTransparent → continue` handles opaque skipping) if (entry.modelId != currentModelId) { currentModelId = entry.modelId; currentModelValid = false; @@ -2690,15 +3072,15 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const currentModel = &mdlIt->second; if (!currentModel->vertexBuffer || !currentModel->indexBuffer) continue; currentModelValid = true; - VkDeviceSize offset = 0; - vkCmdBindVertexBuffers(cmd, 0, 1, ¤tModel->vertexBuffer, &offset); + VkDeviceSize vbOff = 0; + vkCmdBindVertexBuffers(cmd, 0, 1, ¤tModel->vertexBuffer, 
&vbOff); vkCmdBindIndexBuffer(cmd, currentModel->indexBuffer, 0, VK_INDEX_TYPE_UINT16); } if (!currentModelValid) continue; const M2ModelGPU& model = *currentModel; - // Distance-based fade alpha (same as pass 1) + // Fade alpha float fadeAlpha = 1.0f; float fadeFrac = model.disableAnimation ? 0.55f : fadeStartFraction; float fadeStartDistSq = entry.effectiveMaxDistSq * fadeFrac * fadeFrac; @@ -2713,13 +3095,7 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const bool modelNeedsAnimation = model.hasAnimation && !model.disableAnimation; if (modelNeedsAnimation && instance.boneMatrices.empty()) continue; bool needsBones = modelNeedsAnimation && !instance.boneMatrices.empty(); - if (needsBones && (!instance.boneBuffer[frameIndex] || !instance.boneSet[frameIndex])) continue; - bool useBones = needsBones; - if (useBones && instance.boneSet[frameIndex] && instance.boneSet[frameIndex] != currentBoneSet) { - vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, - pipelineLayout_, 2, 1, &instance.boneSet[frameIndex], 0, nullptr); - currentBoneSet = instance.boneSet[frameIndex]; - } + if (needsBones && instance.megaBoneOffset == 0) continue; uint16_t desiredLOD = 0; if (entry.distSq > 150.0f * 150.0f) desiredLOD = 3; @@ -2742,7 +3118,7 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const if (!rawTransparent) continue; } - // Skip glow sprites (handled after loop) + // Skip glow sprites (handled in opaque pass) const bool batchUnlit = (batch.materialFlags & 0x01) != 0; const bool koboldFlameCard = batch.colorKeyBlack && model.isKoboldFlame; const bool smallCardLikeBatch = @@ -2766,7 +3142,10 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const continue; } - glm::vec2 uvOffset(0.0f, 0.0f); + if (particleDominantEffect) continue; // emission-only mesh + + // Compute UV offset for this instance + batch + glm::vec2 uvOffset(0.0f); if (batch.textureAnimIndex != 0xFFFF && 
model.hasTextureAnimation) { uint16_t lookupIdx = batch.textureAnimIndex; if (lookupIdx < model.textureTransformLookup.size()) { @@ -2785,6 +3164,19 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const uvOffset = glm::vec2(t * 0.03f, -t * 0.08f); } + // Write single instance entry to SSBO + if (instanceDataCount_ >= MAX_INSTANCE_DATA) continue; + uint32_t drawOffset = instanceDataCount_; + auto& e = instSSBO[instanceDataCount_]; + e.model = instance.modelMatrix; + e.uvOffset = uvOffset; + e.fadeAlpha = instanceFadeAlpha; + e.useBones = needsBones ? 1 : 0; + e.boneBase = needsBones ? static_cast(instance.megaBoneOffset) : 0; + std::memset(e._pad, 0, sizeof(e._pad)); + instanceDataCount_++; + + // Pipeline selection uint8_t effectiveBlendMode = batch.blendMode; if (model.isSpellEffect) { if (effectiveBlendMode <= 1) effectiveBlendMode = 3; @@ -2815,14 +3207,11 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const currentMaterialSet = batch.materialSet; } + // Push constants + single-instance draw M2PushConstants pc; - pc.model = instance.modelMatrix; - pc.uvOffset = uvOffset; - pc.texCoordSet = static_cast(batch.textureUnit); - pc.useBones = useBones ? 1 : 0; + pc.texCoordSet = static_cast(batch.textureUnit); pc.isFoliage = model.shadowWindFoliage ? 
1 : 0; - pc.fadeAlpha = instanceFadeAlpha; - if (particleDominantEffect) continue; // emission-only mesh + pc.instanceDataOffset = static_cast(drawOffset); vkCmdPushConstants(cmd, pipelineLayout_, VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(pc), &pc); vkCmdDrawIndexed(cmd, batch.indexCount, 1, batch.indexStart, 0, 0); lastDrawCallCount++; @@ -4842,7 +5231,9 @@ void M2Renderer::recreatePipelines() { {4, 0, VK_FORMAT_R32G32B32A32_SFLOAT, 14 * sizeof(float)}, // boneIndices (float) }; - auto buildM2Pipeline = [&](VkPipelineColorBlendAttachmentState blendState, bool depthWrite) -> VkPipeline { + // Pipeline derivatives — opaque is the base, others derive from it for shared state optimization + auto buildM2Pipeline = [&](VkPipelineColorBlendAttachmentState blendState, bool depthWrite, + VkPipelineCreateFlags flags = 0, VkPipeline basePipeline = VK_NULL_HANDLE) -> VkPipeline { return PipelineBuilder() .setShaders(m2Vert.stageInfo(VK_SHADER_STAGE_VERTEX_BIT), m2Frag.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT)) @@ -4855,13 +5246,19 @@ void M2Renderer::recreatePipelines() { .setLayout(pipelineLayout_) .setRenderPass(mainPass) .setDynamicStates({VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR}) + .setFlags(flags) + .setBasePipeline(basePipeline) .build(device, vkCtx_->getPipelineCache()); }; - opaquePipeline_ = buildM2Pipeline(PipelineBuilder::blendDisabled(), true); - alphaTestPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), true); - alphaPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), false); - additivePipeline_ = buildM2Pipeline(PipelineBuilder::blendAdditive(), false); + opaquePipeline_ = buildM2Pipeline(PipelineBuilder::blendDisabled(), true, + VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT); + alphaTestPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), true, + VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_); + alphaPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), false, + VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_); + 
additivePipeline_ = buildM2Pipeline(PipelineBuilder::blendAdditive(), false, + VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_); // --- Particle pipelines --- if (particleVert.isValid() && particleFrag.isValid()) { diff --git a/src/rendering/render_graph.cpp b/src/rendering/render_graph.cpp new file mode 100644 index 00000000..d36d20bc --- /dev/null +++ b/src/rendering/render_graph.cpp @@ -0,0 +1,194 @@ +#include "rendering/render_graph.hpp" +#include "core/logger.hpp" +#include +#include +#include + +namespace wowee { +namespace rendering { + +void RenderGraph::reset() { + passes_.clear(); + executionOrder_.clear(); + compiled_ = false; + // Keep resource registry — resources are stable across frames +} + +RGResource RenderGraph::registerResource(const std::string& name) { + // Check for duplicate + for (const auto& r : resources_) { + if (r.name == name) return {r.id}; + } + uint32_t id = nextResourceId_++; + resources_.push_back({name, id}); + return {id}; +} + +RGResource RenderGraph::findResource(const std::string& name) const { + for (const auto& r : resources_) { + if (r.name == name) return {r.id}; + } + return {}; // invalid +} + +void RenderGraph::addPass(const std::string& name, + const std::vector& inputs, + const std::vector& outputs, + std::function execute) { + RGPass pass; + pass.name = name; + pass.inputs = inputs; + pass.outputs = outputs; + pass.execute = std::move(execute); + pass.enabled = true; + passes_.push_back(std::move(pass)); +} + +void RenderGraph::setPassEnabled(const std::string& name, bool enabled) { + for (auto& pass : passes_) { + if (pass.name == name) { + pass.enabled = enabled; + return; + } + } +} + +void RenderGraph::compile() { + topologicalSort(); + compiled_ = true; +} + +void RenderGraph::topologicalSort() { + const uint32_t n = static_cast(passes_.size()); + if (n == 0) { executionOrder_.clear(); return; } + + // Build adjacency: if pass A outputs resource R and pass B inputs resource R, + // then A must execute before 
B (edge A → B). + // Map: resource id → index of pass that produces it + std::unordered_map producer; + for (uint32_t i = 0; i < n; ++i) { + for (const auto& out : passes_[i].outputs) { + producer[out.id] = i; + } + } + + // Build in-degree and adjacency list + std::vector inDegree(n, 0); + std::vector> adj(n); + + for (uint32_t i = 0; i < n; ++i) { + for (const auto& inp : passes_[i].inputs) { + auto it = producer.find(inp.id); + if (it != producer.end() && it->second != i) { + adj[it->second].push_back(i); + inDegree[i]++; + } + } + } + + // Kahn's algorithm + std::queue queue; + for (uint32_t i = 0; i < n; ++i) { + if (inDegree[i] == 0) queue.push(i); + } + + executionOrder_.clear(); + executionOrder_.reserve(n); + + while (!queue.empty()) { + uint32_t u = queue.front(); + queue.pop(); + executionOrder_.push_back(u); + for (uint32_t v : adj[u]) { + if (--inDegree[v] == 0) queue.push(v); + } + } + + // If not all passes are in the order, there's a cycle — fall back to insertion order + if (executionOrder_.size() != n) { + LOG_WARNING("RenderGraph: dependency cycle detected, falling back to insertion order"); + executionOrder_.clear(); + for (uint32_t i = 0; i < n; ++i) executionOrder_.push_back(i); + } +} + +void RenderGraph::execute(VkCommandBuffer cmd) { + if (!compiled_) { + LOG_WARNING("RenderGraph::execute called without compile()"); + compile(); + } + + for (uint32_t idx : executionOrder_) { + const auto& pass = passes_[idx]; + if (!pass.enabled) continue; + + // Insert image barriers declared for this pass + if (!pass.imageBarriers.empty()) { + std::vector barriers; + barriers.reserve(pass.imageBarriers.size()); + + VkPipelineStageFlags srcStages = 0; + VkPipelineStageFlags dstStages = 0; + + for (const auto& b : pass.imageBarriers) { + VkImageMemoryBarrier ib{}; + ib.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + ib.oldLayout = b.oldLayout; + ib.newLayout = b.newLayout; + ib.srcAccessMask = b.srcAccess; + ib.dstAccessMask = b.dstAccess; + 
ib.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + ib.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + ib.image = b.image; + ib.subresourceRange = {b.aspectMask, 0, 1, 0, 1}; + barriers.push_back(ib); + srcStages |= b.srcStage; + dstStages |= b.dstStage; + } + + vkCmdPipelineBarrier(cmd, + srcStages, dstStages, + 0, + 0, nullptr, + 0, nullptr, + static_cast(barriers.size()), barriers.data()); + } + + // Insert buffer barriers declared for this pass + if (!pass.bufferBarriers.empty()) { + std::vector barriers; + barriers.reserve(pass.bufferBarriers.size()); + + VkPipelineStageFlags srcStages = 0; + VkPipelineStageFlags dstStages = 0; + + for (const auto& b : pass.bufferBarriers) { + VkBufferMemoryBarrier bb{}; + bb.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + bb.srcAccessMask = b.srcAccess; + bb.dstAccessMask = b.dstAccess; + bb.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + bb.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + bb.buffer = b.buffer; + bb.offset = b.offset; + bb.size = b.size; + barriers.push_back(bb); + srcStages |= b.srcStage; + dstStages |= b.dstStage; + } + + vkCmdPipelineBarrier(cmd, + srcStages, dstStages, + 0, + 0, nullptr, + static_cast(barriers.size()), barriers.data(), + 0, nullptr); + } + + // Execute the pass + pass.execute(cmd); + } +} + +} // namespace rendering +} // namespace wowee diff --git a/src/rendering/renderer.cpp b/src/rendering/renderer.cpp index 31f4c68c..1daf09cf 100644 --- a/src/rendering/renderer.cpp +++ b/src/rendering/renderer.cpp @@ -61,6 +61,7 @@ #include "rendering/spell_visual_system.hpp" #include "rendering/post_process_pipeline.hpp" #include "rendering/animation_controller.hpp" +#include "rendering/render_graph.hpp" #include #include #include @@ -458,7 +459,9 @@ void Renderer::updatePerFrameUBO() { } currentFrameData.lightSpaceMatrix = lightSpaceMatrix; - currentFrameData.shadowParams = glm::vec4(shadowsEnabled ? 
1.0f : 0.0f, 0.8f, 0.0f, 0.0f); + // Scale shadow bias proportionally to ortho extent to avoid acne at close range / gaps at far range + float shadowBias = 0.8f * (shadowDistance_ / 300.0f); + currentFrameData.shadowParams = glm::vec4(shadowsEnabled ? 1.0f : 0.0f, shadowBias, 0.0f, 0.0f); // Player water ripple data: pack player XY into shadowParams.zw, ripple strength into fogParams.w if (cameraController) { @@ -563,6 +566,15 @@ bool Renderer::initialize(core::Window* win) { postProcessPipeline_ = std::make_unique(); postProcessPipeline_->initialize(vkCtx); + // Phase 2.5: Create render graph and register virtual resources + renderGraph_ = std::make_unique(); + renderGraph_->registerResource("shadow_depth"); + renderGraph_->registerResource("reflection_texture"); + renderGraph_->registerResource("cull_visibility"); + renderGraph_->registerResource("scene_color"); + renderGraph_->registerResource("scene_depth"); + renderGraph_->registerResource("final_image"); + LOG_INFO("Renderer initialized"); return true; } @@ -674,6 +686,10 @@ void Renderer::shutdown() { postProcessPipeline_->shutdown(); postProcessPipeline_.reset(); } + + // Phase 2.5: Destroy render graph + renderGraph_.reset(); + destroyPerFrameResources(); zoneManager.reset(); @@ -839,36 +855,19 @@ void Renderer::beginFrame() { // FSR2 jitter pattern (§4.3 — delegates to PostProcessPipeline) if (postProcessPipeline_ && camera) postProcessPipeline_->applyJitter(camera.get()); + // Compute fresh shadow matrix BEFORE UBO update so shaders get current-frame data. 
+ lightSpaceMatrix = computeLightSpaceMatrix(); + // Update per-frame UBO with current camera/lighting state updatePerFrameUBO(); - // --- Off-screen pre-passes (before main render pass) --- - // Minimap composite (renders 3x3 tile grid into 768x768 render target) - if (minimap && minimap->isEnabled() && camera) { - glm::vec3 minimapCenter = camera->getPosition(); - if (cameraController && cameraController->isThirdPerson()) - minimapCenter = characterPosition; - minimap->compositePass(currentCmd, minimapCenter); + // --- Off-screen pre-passes (Phase 2.5: render graph) --- + // Build frame graph: registers pre-passes as graph nodes with dependencies. + // compile() topologically sorts; execute() runs them with auto barriers. + buildFrameGraph(nullptr); + if (renderGraph_) { + renderGraph_->execute(currentCmd); } - // World map composite (renders zone tiles into 1024x768 render target) - if (worldMap) { - worldMap->compositePass(currentCmd); - } - - // Character preview composite passes - for (auto* preview : activePreviews_) { - if (preview && preview->isModelLoaded()) { - preview->compositePass(currentCmd, vkCtx->getCurrentFrame()); - } - } - - // Shadow pre-pass (before main render pass) - if (shadowsEnabled && shadowDepthImage[0] != VK_NULL_HANDLE) { - renderShadowPass(); - } - - // Water reflection pre-pass (renders scene from mirrored camera into 512x512 texture) - renderReflectionPass(); // --- Begin render pass --- // Select framebuffer: PP off-screen target or swapchain (§4.3 — PostProcessPipeline) @@ -3063,17 +3062,10 @@ void Renderer::renderShadowPass() { // Shadows render every frame — throttling causes visible flicker on player/NPCs - // Compute and store light space matrix; write to per-frame UBO - lightSpaceMatrix = computeLightSpaceMatrix(); + // lightSpaceMatrix was already computed at frame start (before updatePerFrameUBO). // Zero matrix means character position isn't set yet — skip shadow pass entirely. 
if (lightSpaceMatrix == glm::mat4(0.0f)) return; uint32_t frame = vkCtx->getCurrentFrame(); - auto* ubo = reinterpret_cast(perFrameUBOMapped[frame]); - if (ubo) { - ubo->lightSpaceMatrix = lightSpaceMatrix; - ubo->shadowParams.x = shadowsEnabled ? 1.0f : 0.0f; - ubo->shadowParams.y = 0.8f; - } // Barrier 1: transition this frame's shadow map into writable depth layout. VkImageMemoryBarrier b1{}; @@ -3147,5 +3139,69 @@ void Renderer::renderShadowPass() { shadowDepthLayout_[frame] = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; } +// Phase 2.5: Build the per-frame render graph for off-screen pre-passes. +// Declares passes as graph nodes with input/output dependencies. +// compile() performs topological sort; execute() runs them with auto barriers. +void Renderer::buildFrameGraph(game::GameHandler* gameHandler) { + (void)gameHandler; + if (!renderGraph_) return; + + renderGraph_->reset(); + + auto shadowDepth = renderGraph_->findResource("shadow_depth"); + auto reflTex = renderGraph_->findResource("reflection_texture"); + auto cullVis = renderGraph_->findResource("cull_visibility"); + + // Minimap composites (no dependencies — standalone off-screen render target) + renderGraph_->addPass("minimap_composite", {}, {}, + [this](VkCommandBuffer cmd) { + if (minimap && minimap->isEnabled() && camera) { + glm::vec3 minimapCenter = camera->getPosition(); + if (cameraController && cameraController->isThirdPerson()) + minimapCenter = characterPosition; + minimap->compositePass(cmd, minimapCenter); + } + }); + + // World map composite (standalone) + renderGraph_->addPass("worldmap_composite", {}, {}, + [this](VkCommandBuffer cmd) { + if (worldMap) worldMap->compositePass(cmd); + }); + + // Character preview composites (standalone) + renderGraph_->addPass("preview_composite", {}, {}, + [this](VkCommandBuffer cmd) { + uint32_t frame = vkCtx->getCurrentFrame(); + for (auto* preview : activePreviews_) { + if (preview && preview->isModelLoaded()) + preview->compositePass(cmd, frame); + 
} + }); + + // Shadow pre-pass → outputs shadow_depth + renderGraph_->addPass("shadow_pass", {}, {shadowDepth}, + [this](VkCommandBuffer) { + if (shadowsEnabled && shadowDepthImage[0] != VK_NULL_HANDLE) + renderShadowPass(); + }); + renderGraph_->setPassEnabled("shadow_pass", shadowsEnabled && shadowDepthImage[0] != VK_NULL_HANDLE); + + // Reflection pre-pass → outputs reflection_texture (reads scene, so after shadow) + renderGraph_->addPass("reflection_pass", {shadowDepth}, {reflTex}, + [this](VkCommandBuffer) { + renderReflectionPass(); + }); + + // GPU frustum cull compute → outputs cull_visibility + renderGraph_->addPass("compute_cull", {}, {cullVis}, + [this](VkCommandBuffer cmd) { + if (m2Renderer && camera) + m2Renderer->dispatchCullCompute(cmd, vkCtx->getCurrentFrame(), *camera); + }); + + renderGraph_->compile(); +} + } // namespace rendering } // namespace wowee diff --git a/src/rendering/terrain_renderer.cpp b/src/rendering/terrain_renderer.cpp index 0de9698a..458714a5 100644 --- a/src/rendering/terrain_renderer.cpp +++ b/src/rendering/terrain_renderer.cpp @@ -128,7 +128,7 @@ bool TerrainRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameL vertexAttribs[3] = { 3, 0, VK_FORMAT_R32G32_SFLOAT, static_cast(offsetof(pipeline::TerrainVertex, layerUV)) }; - // --- Build fill pipeline --- + // --- Build fill pipeline (base for derivatives — shared state optimization) --- VkRenderPass mainPass = vkCtx->getImGuiRenderPass(); pipeline = PipelineBuilder() @@ -143,6 +143,7 @@ bool TerrainRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameL .setLayout(pipelineLayout) .setRenderPass(mainPass) .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }) + .setFlags(VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT) .build(device, vkCtx->getPipelineCache()); if (!pipeline) { @@ -152,7 +153,7 @@ bool TerrainRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameL return false; } - // --- Build wireframe pipeline --- + // 
--- Build wireframe pipeline (derivative of fill) --- wireframePipeline = PipelineBuilder() .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT), fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT)) @@ -165,6 +166,8 @@ bool TerrainRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameL .setLayout(pipelineLayout) .setRenderPass(mainPass) .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }) + .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT) + .setBasePipeline(pipeline) .build(device, vkCtx->getPipelineCache()); if (!wireframePipeline) { @@ -190,6 +193,64 @@ bool TerrainRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameL envSizeMBOrDefault("WOWEE_TERRAIN_TEX_CACHE_MB", 4096) * 1024ull * 1024ull; LOG_INFO("Terrain texture cache budget: ", textureCacheBudgetBytes_ / (1024 * 1024), " MB"); + // Phase 2.2: Allocate mega vertex/index buffers and indirect draw buffer. + // All terrain chunks share these buffers, eliminating per-chunk VB/IB rebinds. 
+ { + VmaAllocator allocator = vkCtx->getAllocator(); + + // Mega vertex buffer (host-visible for direct write during chunk upload) + VkBufferCreateInfo vbCI{}; + vbCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + vbCI.size = static_cast(MEGA_VB_MAX_VERTS) * sizeof(pipeline::TerrainVertex); + vbCI.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT; + VmaAllocationCreateInfo vbAllocCI{}; + vbAllocCI.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; + vbAllocCI.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + VmaAllocationInfo vbInfo{}; + if (vmaCreateBuffer(allocator, &vbCI, &vbAllocCI, + &megaVB_, &megaVBAlloc_, &vbInfo) == VK_SUCCESS) { + megaVBMapped_ = vbInfo.pMappedData; + } else { + LOG_WARNING("TerrainRenderer: mega VB allocation failed, per-chunk fallback"); + } + + // Mega index buffer + VkBufferCreateInfo ibCI{}; + ibCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + ibCI.size = static_cast(MEGA_IB_MAX_INDICES) * sizeof(uint32_t); + ibCI.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT; + VmaAllocationCreateInfo ibAllocCI{}; + ibAllocCI.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; + ibAllocCI.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + VmaAllocationInfo ibInfo{}; + if (vmaCreateBuffer(allocator, &ibCI, &ibAllocCI, + &megaIB_, &megaIBAlloc_, &ibInfo) == VK_SUCCESS) { + megaIBMapped_ = ibInfo.pMappedData; + } else { + LOG_WARNING("TerrainRenderer: mega IB allocation failed, per-chunk fallback"); + } + + // Indirect draw command buffer + VkBufferCreateInfo indCI{}; + indCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + indCI.size = MAX_INDIRECT_DRAWS * sizeof(VkDrawIndexedIndirectCommand); + indCI.usage = VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT; + VmaAllocationCreateInfo indAllocCI{}; + indAllocCI.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; + indAllocCI.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + VmaAllocationInfo indInfo{}; + if (vmaCreateBuffer(allocator, &indCI, &indAllocCI, + &indirectBuffer_, &indirectAlloc_, &indInfo) == VK_SUCCESS) { + indirectMapped_ = indInfo.pMappedData; + } else { + 
LOG_WARNING("TerrainRenderer: indirect buffer allocation failed"); + } + + LOG_INFO("Terrain mega buffers: VB=", vbCI.size / (1024*1024), "MB IB=", + ibCI.size / (1024*1024), "MB indirect=", + indCI.size / 1024, "KB"); + } + LOG_INFO("Terrain renderer initialized (Vulkan)"); return true; } @@ -232,7 +293,7 @@ void TerrainRenderer::recreatePipelines() { VkRenderPass mainPass = vkCtx->getImGuiRenderPass(); - // Rebuild fill pipeline + // Rebuild fill pipeline (base for derivatives — shared state optimization) pipeline = PipelineBuilder() .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT), fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT)) @@ -245,13 +306,14 @@ void TerrainRenderer::recreatePipelines() { .setLayout(pipelineLayout) .setRenderPass(mainPass) .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }) + .setFlags(VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT) .build(device, vkCtx->getPipelineCache()); if (!pipeline) { LOG_ERROR("TerrainRenderer::recreatePipelines: failed to create fill pipeline"); } - // Rebuild wireframe pipeline + // Rebuild wireframe pipeline (derivative of fill) wireframePipeline = PipelineBuilder() .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT), fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT)) @@ -264,6 +326,8 @@ void TerrainRenderer::recreatePipelines() { .setLayout(pipelineLayout) .setRenderPass(mainPass) .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }) + .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT) + .setBasePipeline(pipeline) .build(device, vkCtx->getPipelineCache()); if (!wireframePipeline) { @@ -311,6 +375,13 @@ void TerrainRenderer::shutdown() { if (shadowParamsLayout_) { vkDestroyDescriptorSetLayout(device, shadowParamsLayout_, nullptr); shadowParamsLayout_ = VK_NULL_HANDLE; } if (shadowParamsUBO_) { vmaDestroyBuffer(allocator, shadowParamsUBO_, shadowParamsAlloc_); shadowParamsUBO_ = VK_NULL_HANDLE; shadowParamsAlloc_ = VK_NULL_HANDLE; } + // Phase 2.2: Destroy 
mega buffers and indirect draw buffer + if (megaVB_) { vmaDestroyBuffer(allocator, megaVB_, megaVBAlloc_); megaVB_ = VK_NULL_HANDLE; megaVBAlloc_ = VK_NULL_HANDLE; megaVBMapped_ = nullptr; } + if (megaIB_) { vmaDestroyBuffer(allocator, megaIB_, megaIBAlloc_); megaIB_ = VK_NULL_HANDLE; megaIBAlloc_ = VK_NULL_HANDLE; megaIBMapped_ = nullptr; } + if (indirectBuffer_) { vmaDestroyBuffer(allocator, indirectBuffer_, indirectAlloc_); indirectBuffer_ = VK_NULL_HANDLE; indirectAlloc_ = VK_NULL_HANDLE; indirectMapped_ = nullptr; } + megaVBUsed_ = 0; + megaIBUsed_ = 0; + vkCtx = nullptr; } @@ -537,6 +608,7 @@ TerrainChunkGPU TerrainRenderer::uploadChunk(const pipeline::ChunkMesh& chunk) { gpuChunk.worldY = chunk.worldY; gpuChunk.worldZ = chunk.worldZ; gpuChunk.indexCount = static_cast(chunk.indices.size()); + gpuChunk.vertexCount = static_cast(chunk.vertices.size()); VkDeviceSize vbSize = chunk.vertices.size() * sizeof(pipeline::TerrainVertex); AllocatedBuffer vb = uploadBuffer(*vkCtx, chunk.vertices.data(), vbSize, @@ -550,6 +622,25 @@ TerrainChunkGPU TerrainRenderer::uploadChunk(const pipeline::ChunkMesh& chunk) { gpuChunk.indexBuffer = ib.buffer; gpuChunk.indexAlloc = ib.allocation; + // Phase 2.2: Also copy into mega buffers for indirect drawing + uint32_t vertCount = static_cast(chunk.vertices.size()); + uint32_t idxCount = static_cast(chunk.indices.size()); + if (megaVBMapped_ && megaIBMapped_ && + megaVBUsed_ + vertCount <= MEGA_VB_MAX_VERTS && + megaIBUsed_ + idxCount <= MEGA_IB_MAX_INDICES) { + // Copy vertices + auto* vbDst = static_cast(megaVBMapped_) + megaVBUsed_; + std::memcpy(vbDst, chunk.vertices.data(), vertCount * sizeof(pipeline::TerrainVertex)); + // Copy indices + auto* ibDst = static_cast(megaIBMapped_) + megaIBUsed_; + std::memcpy(ibDst, chunk.indices.data(), idxCount * sizeof(uint32_t)); + + gpuChunk.megaBaseVertex = static_cast(megaVBUsed_); + gpuChunk.megaFirstIndex = megaIBUsed_; + megaVBUsed_ += vertCount; + megaIBUsed_ += idxCount; + } + return 
gpuChunk; } @@ -789,6 +880,15 @@ void TerrainRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, c renderedChunks = 0; culledChunks = 0; + // Phase 2.2: Use mega VB + IB when available. + // Bind mega buffers once, then use direct draws with base vertex/index offsets. + const bool useMegaBuffers = (megaVB_ && megaIB_); + if (useMegaBuffers) { + VkDeviceSize megaOffset = 0; + vkCmdBindVertexBuffers(cmd, 0, 1, &megaVB_, &megaOffset); + vkCmdBindIndexBuffer(cmd, megaIB_, 0, VK_INDEX_TYPE_UINT32); + } + for (const auto& chunk : chunks) { if (!chunk.isValid() || !chunk.materialSet) continue; @@ -808,11 +908,17 @@ void TerrainRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, c vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 1, 1, &chunk.materialSet, 0, nullptr); - VkDeviceSize offset = 0; - vkCmdBindVertexBuffers(cmd, 0, 1, &chunk.vertexBuffer, &offset); - vkCmdBindIndexBuffer(cmd, chunk.indexBuffer, 0, VK_INDEX_TYPE_UINT32); - - vkCmdDrawIndexed(cmd, chunk.indexCount, 1, 0, 0, 0); + if (useMegaBuffers && chunk.megaBaseVertex >= 0) { + // Direct draw from mega buffer — single VB/IB already bound + vkCmdDrawIndexed(cmd, chunk.indexCount, 1, + chunk.megaFirstIndex, chunk.megaBaseVertex, 0); + } else { + // Fallback: per-chunk VB/IB bind + direct draw + VkDeviceSize offset = 0; + vkCmdBindVertexBuffers(cmd, 0, 1, &chunk.vertexBuffer, &offset); + vkCmdBindIndexBuffer(cmd, chunk.indexBuffer, 0, VK_INDEX_TYPE_UINT32); + vkCmdDrawIndexed(cmd, chunk.indexCount, 1, 0, 0, 0); + } renderedChunks++; } @@ -986,6 +1092,14 @@ void TerrainRenderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& lightSp vkCmdPushConstants(cmd, shadowPipelineLayout_, VK_SHADER_STAGE_VERTEX_BIT, 0, 128, &push); + // Phase 2.2: Bind mega buffers once for shadow pass (same as opaque) + const bool useMegaShadow = (megaVB_ && megaIB_); + if (useMegaShadow) { + VkDeviceSize megaOffset = 0; + vkCmdBindVertexBuffers(cmd, 0, 1, &megaVB_, 
&megaOffset); + vkCmdBindIndexBuffer(cmd, megaIB_, 0, VK_INDEX_TYPE_UINT32); + } + for (const auto& chunk : chunks) { if (!chunk.isValid()) continue; @@ -995,10 +1109,14 @@ void TerrainRenderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& lightSp float combinedRadius = shadowRadius + chunk.boundingSphereRadius; if (distSq > combinedRadius * combinedRadius) continue; - VkDeviceSize offset = 0; - vkCmdBindVertexBuffers(cmd, 0, 1, &chunk.vertexBuffer, &offset); - vkCmdBindIndexBuffer(cmd, chunk.indexBuffer, 0, VK_INDEX_TYPE_UINT16); - vkCmdDrawIndexed(cmd, chunk.indexCount, 1, 0, 0, 0); + if (useMegaShadow && chunk.megaBaseVertex >= 0) { + vkCmdDrawIndexed(cmd, chunk.indexCount, 1, chunk.megaFirstIndex, chunk.megaBaseVertex, 0); + } else { + VkDeviceSize offset = 0; + vkCmdBindVertexBuffers(cmd, 0, 1, &chunk.vertexBuffer, &offset); + vkCmdBindIndexBuffer(cmd, chunk.indexBuffer, 0, VK_INDEX_TYPE_UINT32); + vkCmdDrawIndexed(cmd, chunk.indexCount, 1, 0, 0, 0); + } } } diff --git a/src/rendering/vk_context.cpp b/src/rendering/vk_context.cpp index c2a37415..4a5d6366 100644 --- a/src/rendering/vk_context.cpp +++ b/src/rendering/vk_context.cpp @@ -334,7 +334,7 @@ bool VkContext::selectPhysicalDevice() { VkPhysicalDeviceProperties props; vkGetPhysicalDeviceProperties(physicalDevice, &props); - uint32_t apiVersion = props.apiVersion; + (void)props.apiVersion; // Available if needed for version checks gpuVendorId_ = props.vendorID; std::strncpy(gpuName_, props.deviceName, sizeof(gpuName_) - 1); gpuName_[sizeof(gpuName_) - 1] = '\0'; diff --git a/src/rendering/vk_pipeline.cpp b/src/rendering/vk_pipeline.cpp index 2a95bd8b..e5c32e6c 100644 --- a/src/rendering/vk_pipeline.cpp +++ b/src/rendering/vk_pipeline.cpp @@ -111,6 +111,17 @@ PipelineBuilder& PipelineBuilder::setDynamicStates(const std::vector(offsetof(WMOVertexData, tangent)) }; - // --- Build opaque pipeline --- + // --- Build opaque pipeline (base for derivatives — shared state optimization) --- VkRenderPass 
mainPass = vkCtx_->getImGuiRenderPass(); opaquePipeline_ = PipelineBuilder() @@ -184,6 +184,7 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou .setLayout(pipelineLayout_) .setRenderPass(mainPass) .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }) + .setFlags(VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT) .build(device, vkCtx_->getPipelineCache()); if (!opaquePipeline_) { @@ -193,7 +194,7 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou return false; } - // --- Build transparent pipeline --- + // --- Build transparent pipeline (derivative of opaque) --- transparentPipeline_ = PipelineBuilder() .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT), fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT)) @@ -206,13 +207,15 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou .setLayout(pipelineLayout_) .setRenderPass(mainPass) .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }) + .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT) + .setBasePipeline(opaquePipeline_) .build(device, vkCtx_->getPipelineCache()); if (!transparentPipeline_) { core::Logger::getInstance().warning("WMORenderer: transparent pipeline not available"); } - // --- Build glass pipeline (alpha blend WITH depth write for windows) --- + // --- Build glass pipeline (derivative — alpha blend WITH depth write for windows) --- glassPipeline_ = PipelineBuilder() .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT), fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT)) @@ -225,9 +228,11 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou .setLayout(pipelineLayout_) .setRenderPass(mainPass) .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }) + .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT) + .setBasePipeline(opaquePipeline_) .build(device, vkCtx_->getPipelineCache()); - // --- Build wireframe pipeline --- + // --- 
Build wireframe pipeline (derivative of opaque) --- wireframePipeline_ = PipelineBuilder() .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT), fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT)) @@ -240,6 +245,8 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou .setLayout(pipelineLayout_) .setRenderPass(mainPass) .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }) + .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT) + .setBasePipeline(opaquePipeline_) .build(device, vkCtx_->getPipelineCache()); if (!wireframePipeline_) { @@ -1434,7 +1441,7 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const if (doDistanceCull) { glm::vec3 closestPoint = glm::clamp(camPos, gMin, gMax); float distSq = glm::dot(closestPoint - camPos, closestPoint - camPos); - if (distSq > 250000.0f) { + if (distSq > 1440000.0f) { // 1200 units — matches terrain view distance result.distanceCulled++; continue; } @@ -3733,6 +3740,7 @@ void WMORenderer::recreatePipelines() { VkRenderPass mainPass = vkCtx_->getImGuiRenderPass(); + // Pipeline derivatives — opaque is the base, others derive for shared state optimization opaquePipeline_ = PipelineBuilder() .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT), fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT)) @@ -3745,6 +3753,7 @@ void WMORenderer::recreatePipelines() { .setLayout(pipelineLayout_) .setRenderPass(mainPass) .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }) + .setFlags(VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT) .build(device, vkCtx_->getPipelineCache()); transparentPipeline_ = PipelineBuilder() @@ -3759,6 +3768,8 @@ void WMORenderer::recreatePipelines() { .setLayout(pipelineLayout_) .setRenderPass(mainPass) .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }) + .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT) + .setBasePipeline(opaquePipeline_) .build(device, vkCtx_->getPipelineCache()); glassPipeline_ 
= PipelineBuilder() @@ -3773,6 +3784,8 @@ void WMORenderer::recreatePipelines() { .setLayout(pipelineLayout_) .setRenderPass(mainPass) .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }) + .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT) + .setBasePipeline(opaquePipeline_) .build(device, vkCtx_->getPipelineCache()); wireframePipeline_ = PipelineBuilder() @@ -3787,6 +3800,8 @@ void WMORenderer::recreatePipelines() { .setLayout(pipelineLayout_) .setRenderPass(mainPass) .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }) + .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT) + .setBasePipeline(opaquePipeline_) .build(device, vkCtx_->getPipelineCache()); vertShader.destroy(); diff --git a/tools/asset_extract/extractor.cpp b/tools/asset_extract/extractor.cpp index 3c61bef3..d79d4671 100644 --- a/tools/asset_extract/extractor.cpp +++ b/tools/asset_extract/extractor.cpp @@ -537,20 +537,6 @@ static std::vector discoverArchives(const std::string& mpqDir, return result; } -// Read a text file into a vector of lines (for external listfile loading) -static std::vector readLines(const std::string& path) { - std::vector lines; - std::ifstream f(path); - if (!f) return lines; - std::string line; - while (std::getline(f, line)) { - // Trim trailing \r - if (!line.empty() && line.back() == '\r') line.pop_back(); - if (!line.empty()) lines.push_back(std::move(line)); - } - return lines; -} - // Extract the (listfile) from an MPQ archive into a set of filenames static void extractInternalListfile(HANDLE hMpq, std::set& out) { HANDLE hFile = nullptr; @@ -595,14 +581,9 @@ bool Extractor::enumerateFiles(const Options& opts, std::cout << "Found " << archives.size() << " MPQ archives\n"; - // Load external listfile into memory once (avoids repeated file I/O) - std::vector externalEntries; - std::vector externalPtrs; - if (!opts.listFile.empty()) { - externalEntries = readLines(opts.listFile); - externalPtrs.reserve(externalEntries.size()); - for (const 
auto& e : externalEntries) externalPtrs.push_back(e.c_str()); - std::cout << " Loaded external listfile: " << externalEntries.size() << " entries\n"; + const bool haveExternalListFile = !opts.listFile.empty(); + if (haveExternalListFile) { + std::cout << " Using external listfile: " << opts.listFile << "\n"; } const auto wantedDbcs = buildWantedDbcSet(opts); @@ -616,12 +597,11 @@ bool Extractor::enumerateFiles(const Options& opts, continue; } - // Inject external listfile entries into archive's in-memory name table. - // SFileAddListFileEntries is fast — it only hashes the names against the - // archive's hash table, no file I/O involved. - if (!externalPtrs.empty()) { - SFileAddListFileEntries(hMpq, externalPtrs.data(), - static_cast(externalPtrs.size())); + // Inject external listfile into archive's in-memory name table. + // SFileAddListFile reads the file and hashes names against the + // archive's hash table. + if (haveExternalListFile) { + SFileAddListFile(hMpq, opts.listFile.c_str()); } if (opts.verbose) {