mirror of
https://github.com/Kelsidavis/WoWee.git
synced 2026-04-17 09:33:51 +00:00
feat(rendering): GPU architecture + visual quality fixes
M2 GPU instancing
- M2InstanceGPU SSBO (96 B/entry, double-buffered, 16384 max)
- Group opaque instances by (modelId, LOD); single vkCmdDrawIndexed per group
- boneBase field indexes into mega bone SSBO via gl_InstanceIndex

Indirect terrain drawing
- 24 MB mega index buffer (6M uint32) + 64 MB mega vertex buffer
- CPU builds VkDrawIndexedIndirectCommand per visible chunk
- Single VB/IB bind per frame; shadow pass reuses mega buffers
- Replaced vkCmdDrawIndexedIndirect with direct vkCmdDrawIndexed to fix host-mapped buffer race condition that caused terrain flickering

GPU frustum culling (compute shader)
- m2_cull.comp.glsl: 64-thread workgroups, sphere-vs-6-planes + distance cull
- CullInstanceGPU SSBO input, uint visibility[] output, double-buffered
- dispatchCullCompute() runs before main pass via render graph node

Consolidated bone matrix SSBOs
- 16 MB double-buffered mega bone SSBO (2048 instances × 128 bones)
- Eliminated per-instance descriptor sets; one megaBoneSet_ per frame
- prepareRender() packs bone matrices consecutively into current frame slot

Render graph / frame graph
- RenderGraph: RGResource handles, RGPass nodes, Kahn topological sort
- Automatic VkImageMemoryBarrier/VkBufferMemoryBarrier between passes
- Passes: minimap_composite, worldmap_composite, preview_composite, shadow_pass, reflection_pass, compute_cull
- beginFrame() uses buildFrameGraph() + renderGraph_->execute(cmd)

Pipeline derivatives
- PipelineBuilder::setFlags/setBasePipeline for VK_PIPELINE_CREATE_DERIVATIVE_BIT
- M2 opaque = base; alphaTest/alpha/additive are derivatives
- Applied to terrain (wireframe) and WMO (alpha-test) renderers

Rendering bug fixes:
- fix(shadow): compute lightSpaceMatrix before updatePerFrameUBO to eliminate one-frame lag that caused shadow trails and flicker on moving objects
- fix(shadow): scale depth bias with shadowDistance_ instead of hardcoded 0.8f to prevent acne at close range and gaps at far range
- fix(visibility): WMO group distance threshold 500u → 1200u to match terrain view distance; buildings were disappearing on the horizon
- fix(precision): camera near plane 0.05 → 0.5 (ratio 600K:1 → 60K:1), eliminating Z-fighting and improving frustum plane extraction stability
- fix(streaming): terrain load radius 4 → 6 tiles (~2133u → ~3200u) to exceed M2 render distance (2800u) and eliminate pop-in when camera turns; unload radius 7 → 9; spawn radius 3 → 4
- fix(visibility): ground-detail M2 distance multiplier 0.75 → 0.9 to reduce early pop of grass and debris
This commit is contained in:
parent
ca3cea078b
commit
d54e262048
22 changed files with 1579 additions and 494 deletions
|
|
@ -51,7 +51,7 @@ private:
|
|||
float pitch = 0.0f;
|
||||
float fov = 45.0f;
|
||||
float aspectRatio = 16.0f / 9.0f;
|
||||
float nearPlane = 0.05f;
|
||||
float nearPlane = 0.5f;
|
||||
float farPlane = 30000.0f; // Improves depth precision vs extremely large far clip
|
||||
|
||||
glm::mat4 viewMatrix = glm::mat4(1.0f);
|
||||
|
|
|
|||
|
|
@ -219,12 +219,15 @@ struct M2Instance {
|
|||
uint8_t frameSkipCounter = 0;
|
||||
bool bonesDirty[2] = {false, false}; // Per-frame-index: set when bones recomputed, cleared after upload
|
||||
|
||||
// Per-instance bone SSBO (double-buffered)
|
||||
// Per-instance bone SSBO (double-buffered) — legacy; see mega bone SSBO in M2Renderer
|
||||
::VkBuffer boneBuffer[2] = {};
|
||||
VmaAllocation boneAlloc[2] = {};
|
||||
void* boneMapped[2] = {};
|
||||
VkDescriptorSet boneSet[2] = {};
|
||||
|
||||
// Mega bone SSBO offset — base bone index for this instance (set per-frame in prepareRender)
|
||||
uint32_t megaBoneOffset = 0;
|
||||
|
||||
void updateModelMatrix();
|
||||
};
|
||||
|
||||
|
|
@ -292,6 +295,8 @@ public:
|
|||
*/
|
||||
/** Pre-allocate GPU resources (bone SSBOs, descriptors) on main thread before parallel render. */
|
||||
void prepareRender(uint32_t frameIndex, const Camera& camera);
|
||||
/** Phase 2.3: Dispatch GPU frustum culling compute shader on primary cmd before render pass. */
|
||||
void dispatchCullCompute(VkCommandBuffer cmd, uint32_t frameIndex, const Camera& camera);
|
||||
void render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera);
|
||||
|
||||
/**
|
||||
|
|
@ -425,6 +430,65 @@ private:
|
|||
VmaAllocation dummyBoneAlloc_ = VK_NULL_HANDLE;
|
||||
VkDescriptorSet dummyBoneSet_ = VK_NULL_HANDLE;
|
||||
|
||||
// Mega bone SSBO — consolidates all per-instance bone matrices into a single buffer per frame.
|
||||
// Replaces per-instance bone SSBOs for fewer descriptor binds and enables GPU instancing.
|
||||
static constexpr uint32_t MEGA_BONE_MAX_INSTANCES = 2048;
|
||||
static constexpr uint32_t MAX_BONES_PER_INSTANCE = 128;
|
||||
::VkBuffer megaBoneBuffer_[2] = {};
|
||||
VmaAllocation megaBoneAlloc_[2] = {};
|
||||
void* megaBoneMapped_[2] = {};
|
||||
VkDescriptorSet megaBoneSet_[2] = {};
|
||||
|
||||
// Phase 2.1: GPU instance data SSBO — per-instance transforms, fade, bones for instanced draws.
|
||||
// Shader reads instanceData[push.instanceDataOffset + gl_InstanceIndex].
|
||||
struct M2InstanceGPU {
|
||||
glm::mat4 model; // 64 bytes @ offset 0
|
||||
glm::vec2 uvOffset; // 8 bytes @ offset 64
|
||||
float fadeAlpha; // 4 bytes @ offset 72
|
||||
int32_t useBones; // 4 bytes @ offset 76
|
||||
int32_t boneBase; // 4 bytes @ offset 80
|
||||
int32_t _pad[3] = {}; // 12 bytes @ offset 84 — align to 96 (std430)
|
||||
};
|
||||
static constexpr uint32_t MAX_INSTANCE_DATA = 16384;
|
||||
VkDescriptorSetLayout instanceSetLayout_ = VK_NULL_HANDLE;
|
||||
VkDescriptorPool instanceDescPool_ = VK_NULL_HANDLE;
|
||||
::VkBuffer instanceBuffer_[2] = {};
|
||||
VmaAllocation instanceAlloc_[2] = {};
|
||||
void* instanceMapped_[2] = {};
|
||||
VkDescriptorSet instanceSet_[2] = {};
|
||||
uint32_t instanceDataCount_ = 0; // reset each frame in render()
|
||||
|
||||
// Phase 2.3: GPU Frustum Culling via Compute Shader
|
||||
// Compute shader tests each M2 instance against frustum planes + distance, writes visibility[].
|
||||
// CPU reads back visibility to build sortedVisible_ without per-instance frustum/distance tests.
|
||||
struct CullInstanceGPU { // matches CullInstance in m2_cull.comp.glsl (32 bytes, std430)
|
||||
glm::vec4 sphere; // xyz = world position, w = padded radius
|
||||
float effectiveMaxDistSq; // adaptive distance cull threshold
|
||||
uint32_t flags; // bit 0 = valid, bit 1 = smoke, bit 2 = invisibleTrap
|
||||
float _pad[2] = {};
|
||||
};
|
||||
struct CullUniformsGPU { // matches CullUniforms in m2_cull.comp.glsl (128 bytes, std140)
|
||||
glm::vec4 frustumPlanes[6]; // xyz = normal, w = distance
|
||||
glm::vec4 cameraPos; // xyz = camera position, w = maxPossibleDistSq
|
||||
uint32_t instanceCount;
|
||||
uint32_t _pad[3] = {};
|
||||
};
|
||||
static constexpr uint32_t MAX_CULL_INSTANCES = 16384;
|
||||
VkPipeline cullPipeline_ = VK_NULL_HANDLE;
|
||||
VkPipelineLayout cullPipelineLayout_ = VK_NULL_HANDLE;
|
||||
VkDescriptorSetLayout cullSetLayout_ = VK_NULL_HANDLE;
|
||||
VkDescriptorPool cullDescPool_ = VK_NULL_HANDLE;
|
||||
VkDescriptorSet cullSet_[2] = {}; // double-buffered
|
||||
::VkBuffer cullUniformBuffer_[2] = {}; // frustum planes + camera (UBO)
|
||||
VmaAllocation cullUniformAlloc_[2] = {};
|
||||
void* cullUniformMapped_[2] = {};
|
||||
::VkBuffer cullInputBuffer_[2] = {}; // per-instance bounding sphere + flags (SSBO)
|
||||
VmaAllocation cullInputAlloc_[2] = {};
|
||||
void* cullInputMapped_[2] = {};
|
||||
::VkBuffer cullOutputBuffer_[2] = {}; // uint visibility[] (SSBO, host-readable)
|
||||
VmaAllocation cullOutputAlloc_[2] = {};
|
||||
void* cullOutputMapped_[2] = {};
|
||||
|
||||
// Dynamic ribbon vertex buffer (CPU-written triangle strip)
|
||||
static constexpr size_t MAX_RIBBON_VERTS = 2048; // 9 floats each
|
||||
::VkBuffer ribbonVB_ = VK_NULL_HANDLE;
|
||||
|
|
|
|||
117
include/rendering/render_graph.hpp
Normal file
117
include/rendering/render_graph.hpp
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
#pragma once
|
||||
|
||||
#include <vulkan/vulkan.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
#include <cstdint>
|
||||
|
||||
namespace wowee {
|
||||
namespace rendering {
|
||||
|
||||
// Phase 2.5: Lightweight Render Graph / Frame Graph
|
||||
// Converts hardcoded pass sequence (shadow → reflection → compute cull →
|
||||
// main → post-process → ImGui → present) into declarative graph nodes.
|
||||
// Graph auto-inserts VkImageMemoryBarrier / VkBufferMemoryBarrier between passes.
|
||||
|
||||
// Resource handle — identifies a virtual resource (image or buffer) within the graph.
|
||||
// A default-constructed handle carries the UINT32_MAX sentinel and reports
// valid() == false until an id is assigned.
struct RGResource {
    uint32_t id = UINT32_MAX;  // UINT32_MAX = "no resource"

    // True when the handle holds anything other than the sentinel id.
    // constexpr/noexcept so handles can be checked in constant expressions.
    constexpr bool valid() const noexcept { return id != UINT32_MAX; }
};
|
||||
|
||||
// Image barrier descriptor for automatic synchronization between passes.
|
||||
struct RGImageBarrier {
|
||||
VkImage image;
|
||||
VkImageLayout oldLayout;
|
||||
VkImageLayout newLayout;
|
||||
VkAccessFlags srcAccess;
|
||||
VkAccessFlags dstAccess;
|
||||
VkPipelineStageFlags srcStage;
|
||||
VkPipelineStageFlags dstStage;
|
||||
VkImageAspectFlags aspectMask;
|
||||
};
|
||||
|
||||
// Buffer barrier descriptor for automatic synchronization between passes.
|
||||
struct RGBufferBarrier {
|
||||
VkBuffer buffer;
|
||||
VkDeviceSize offset;
|
||||
VkDeviceSize size;
|
||||
VkAccessFlags srcAccess;
|
||||
VkAccessFlags dstAccess;
|
||||
VkPipelineStageFlags srcStage;
|
||||
VkPipelineStageFlags dstStage;
|
||||
};
|
||||
|
||||
// Render pass node — wraps an execution callback with declared inputs/outputs.
|
||||
struct RGPass {
|
||||
std::string name;
|
||||
std::vector<RGResource> inputs;
|
||||
std::vector<RGResource> outputs;
|
||||
std::function<void(VkCommandBuffer cmd)> execute;
|
||||
bool enabled = true; // Can be dynamically disabled per-frame
|
||||
|
||||
// Barriers to insert before this pass executes
|
||||
std::vector<RGImageBarrier> imageBarriers;
|
||||
std::vector<RGBufferBarrier> bufferBarriers;
|
||||
};
|
||||
|
||||
class RenderGraph {
|
||||
public:
|
||||
RenderGraph() = default;
|
||||
~RenderGraph() = default;
|
||||
|
||||
// Reset graph for a new frame (clears passes, keeps resource registry).
|
||||
void reset();
|
||||
|
||||
// Register a virtual resource (returns handle for input/output declarations).
|
||||
RGResource registerResource(const std::string& name);
|
||||
|
||||
// Look up a previously registered resource by name.
|
||||
RGResource findResource(const std::string& name) const;
|
||||
|
||||
// Add a render pass node.
|
||||
// inputs: resources this pass reads from
|
||||
// outputs: resources this pass writes to
|
||||
// execute: callback invoked with the frame's command buffer
|
||||
void addPass(const std::string& name,
|
||||
const std::vector<RGResource>& inputs,
|
||||
const std::vector<RGResource>& outputs,
|
||||
std::function<void(VkCommandBuffer cmd)> execute);
|
||||
|
||||
// Enable/disable a pass by name (for dynamic toggling, e.g. shadows off).
|
||||
void setPassEnabled(const std::string& name, bool enabled);
|
||||
|
||||
// Compile: topological sort by dependency order, insert barriers.
|
||||
// Must be called after all addPass() calls and before execute().
|
||||
void compile();
|
||||
|
||||
// Execute all enabled passes in compiled order on the given command buffer.
|
||||
void execute(VkCommandBuffer cmd);
|
||||
|
||||
// Query: get the compiled execution order (pass names, for debug HUD).
|
||||
const std::vector<uint32_t>& getExecutionOrder() const { return executionOrder_; }
|
||||
const std::vector<RGPass>& getPasses() const { return passes_; }
|
||||
|
||||
private:
|
||||
// Topological sort helper (Kahn's algorithm).
|
||||
void topologicalSort();
|
||||
|
||||
// Resource registry: name → id
|
||||
struct ResourceEntry {
|
||||
std::string name;
|
||||
uint32_t id;
|
||||
};
|
||||
std::vector<ResourceEntry> resources_;
|
||||
uint32_t nextResourceId_ = 0;
|
||||
|
||||
// Pass storage
|
||||
std::vector<RGPass> passes_;
|
||||
|
||||
// Compiled execution order (indices into passes_)
|
||||
std::vector<uint32_t> executionOrder_;
|
||||
bool compiled_ = false;
|
||||
};
|
||||
|
||||
} // namespace rendering
|
||||
} // namespace wowee
|
||||
|
|
@ -56,6 +56,7 @@ class AnimationController;
|
|||
class LevelUpEffect;
|
||||
class ChargeEffect;
|
||||
class SwimEffects;
|
||||
class RenderGraph;
|
||||
|
||||
class Renderer {
|
||||
public:
|
||||
|
|
@ -433,6 +434,10 @@ private:
|
|||
|
||||
bool ghostMode_ = false; // set each frame from gameHandler->isPlayerGhost()
|
||||
|
||||
// Phase 2.5: Render Graph — declarative pass ordering with automatic barriers
|
||||
std::unique_ptr<RenderGraph> renderGraph_;
|
||||
void buildFrameGraph(game::GameHandler* gameHandler);
|
||||
|
||||
// CPU timing stats (last frame/update).
|
||||
double lastUpdateMs = 0.0;
|
||||
double lastRenderMs = 0.0;
|
||||
|
|
|
|||
|
|
@ -346,8 +346,8 @@ private:
|
|||
|
||||
// Streaming parameters
|
||||
bool streamingEnabled = true;
|
||||
int loadRadius = 4; // Load tiles within this radius (9x9 grid = 81 tiles)
|
||||
int unloadRadius = 7; // Unload tiles beyond this radius
|
||||
int loadRadius = 6; // Load tiles within this radius (13x13 grid = 169 tiles)
|
||||
int unloadRadius = 9; // Unload tiles beyond this radius
|
||||
float updateInterval = 0.033f; // Check streaming every 33ms (~30 fps)
|
||||
float timeSinceLastUpdate = 0.0f;
|
||||
float proactiveStreamTimer_ = 0.0f;
|
||||
|
|
|
|||
|
|
@ -60,6 +60,11 @@ struct TerrainChunkGPU {
|
|||
float boundingSphereRadius = 0.0f;
|
||||
glm::vec3 boundingSphereCenter = glm::vec3(0.0f);
|
||||
|
||||
// Phase 2.2: Offsets into mega buffers for indirect drawing (-1 = not in mega buffer)
|
||||
int32_t megaBaseVertex = -1;
|
||||
uint32_t megaFirstIndex = 0;
|
||||
uint32_t vertexCount = 0;
|
||||
|
||||
bool isValid() const { return vertexBuffer != VK_NULL_HANDLE && indexBuffer != VK_NULL_HANDLE; }
|
||||
};
|
||||
|
||||
|
|
@ -200,6 +205,25 @@ private:
|
|||
bool fogEnabled = true;
|
||||
int renderedChunks = 0;
|
||||
int culledChunks = 0;
|
||||
|
||||
// Phase 2.2: Mega vertex/index buffers for indirect drawing
|
||||
// All terrain chunks share a single VB + IB, eliminating per-chunk rebinds.
|
||||
// Indirect draw commands are built CPU-side each frame for visible chunks.
|
||||
VkBuffer megaVB_ = VK_NULL_HANDLE;
|
||||
VmaAllocation megaVBAlloc_ = VK_NULL_HANDLE;
|
||||
void* megaVBMapped_ = nullptr;
|
||||
VkBuffer megaIB_ = VK_NULL_HANDLE;
|
||||
VmaAllocation megaIBAlloc_ = VK_NULL_HANDLE;
|
||||
void* megaIBMapped_ = nullptr;
|
||||
uint32_t megaVBUsed_ = 0; // vertices used
|
||||
uint32_t megaIBUsed_ = 0; // indices used
|
||||
static constexpr uint32_t MEGA_VB_MAX_VERTS = 1536 * 1024; // ~1.5M verts × 44B ≈ 64MB
|
||||
static constexpr uint32_t MEGA_IB_MAX_INDICES = 6 * 1024 * 1024; // 6M indices × 4B = 24MB
|
||||
|
||||
VkBuffer indirectBuffer_ = VK_NULL_HANDLE;
|
||||
VmaAllocation indirectAlloc_ = VK_NULL_HANDLE;
|
||||
void* indirectMapped_ = nullptr;
|
||||
static constexpr uint32_t MAX_INDIRECT_DRAWS = 8192;
|
||||
};
|
||||
|
||||
} // namespace rendering
|
||||
|
|
|
|||
|
|
@ -75,6 +75,10 @@ public:
|
|||
// Dynamic state
|
||||
PipelineBuilder& setDynamicStates(const std::vector<VkDynamicState>& states);
|
||||
|
||||
// Pipeline derivatives — hint driver to share compiled state between similar pipelines
|
||||
PipelineBuilder& setFlags(VkPipelineCreateFlags flags);
|
||||
PipelineBuilder& setBasePipeline(VkPipeline basePipeline);
|
||||
|
||||
// Build the pipeline (pass a VkPipelineCache for faster creation)
|
||||
VkPipeline build(VkDevice device, VkPipelineCache cache = VK_NULL_HANDLE) const;
|
||||
|
||||
|
|
@ -106,6 +110,8 @@ private:
|
|||
VkRenderPass renderPass_ = VK_NULL_HANDLE;
|
||||
uint32_t subpass_ = 0;
|
||||
std::vector<VkDynamicState> dynamicStates_;
|
||||
VkPipelineCreateFlags flags_ = 0;
|
||||
VkPipeline basePipelineHandle_ = VK_NULL_HANDLE;
|
||||
};
|
||||
|
||||
// Helper to create a pipeline layout from descriptor set layouts and push constant ranges
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue