diff --git a/assets/shaders/fsr2_accumulate.comp.glsl b/assets/shaders/fsr2_accumulate.comp.glsl
new file mode 100644
index 00000000..7fb0cb27
--- /dev/null
+++ b/assets/shaders/fsr2_accumulate.comp.glsl
@@ -0,0 +1,85 @@
+#version 450
+
+layout(local_size_x = 8, local_size_y = 8) in;
+
+layout(set = 0, binding = 0) uniform sampler2D sceneColor;
+layout(set = 0, binding = 1) uniform sampler2D depthBuffer; // declared but not sampled by this shader
+layout(set = 0, binding = 2) uniform sampler2D motionVectors;
+layout(set = 0, binding = 3) uniform sampler2D historyInput;
+layout(set = 0, binding = 4, rgba16f) uniform writeonly image2D historyOutput;
+
+layout(push_constant) uniform PushConstants {
+    vec4 internalSize; // xy = internal resolution, zw = 1/internal
+    vec4 displaySize; // xy = display resolution, zw = 1/display
+    vec4 jitterOffset; // xy = current jitter (NDC-space), zw = unused; not read by this shader
+    vec4 params; // x = resetHistory (1=reset), y = sharpness (not read by this shader), zw = unused
+} pc;
+
+vec3 rgbToYCoCg(vec3 rgb) { // RGB -> (Y, Co, Cg)
+    float y = 0.25 * rgb.r + 0.5 * rgb.g + 0.25 * rgb.b;
+    float co = 0.5 * rgb.r - 0.5 * rgb.b;
+    float cg = -0.25 * rgb.r + 0.5 * rgb.g - 0.25 * rgb.b;
+    return vec3(y, co, cg);
+}
+
+vec3 yCoCgToRgb(vec3 ycocg) { // algebraic inverse of rgbToYCoCg
+    float y = ycocg.x;
+    float co = ycocg.y;
+    float cg = ycocg.z;
+    return vec3(y + co - cg, y + cg, y - co - cg);
+}
+
+void main() { // one invocation per display-resolution output pixel
+    ivec2 outPixel = ivec2(gl_GlobalInvocationID.xy);
+    ivec2 outSize = ivec2(pc.displaySize.xy);
+    if (outPixel.x >= outSize.x || outPixel.y >= outSize.y) return; // invocation lies outside the output image
+
+    vec2 outUV = (vec2(outPixel) + 0.5) * pc.displaySize.zw; // pixel center in [0,1] UV
+    vec3 currentColor = texture(sceneColor, outUV).rgb;
+
+    if (pc.params.x > 0.5) { // history reset: store the current color unblended
+        imageStore(historyOutput, outPixel, vec4(currentColor, 1.0));
+        return;
+    }
+
+    vec2 motion = texture(motionVectors, outUV).rg; // UV-space offset added to reach the history sample
+    vec2 historyUV = outUV + motion;
+
+    float historyValid = (historyUV.x >= 0.0 && historyUV.x <= 1.0 &&
+                          historyUV.y >= 0.0 && historyUV.y <= 1.0) ? 1.0 : 0.0; // 0.0 when the reprojected UV is outside [0,1]
+
+    vec3 historyColor = texture(historyInput, historyUV).rgb;
+
+    // Neighborhood clamping in YCoCg space (3x3 taps -> mean +/- gamma * stddev box)
+    vec2 texelSize = pc.internalSize.zw; // taps are spaced one internal-resolution texel apart
+    vec3 s0 = rgbToYCoCg(currentColor);
+    vec3 s1 = rgbToYCoCg(texture(sceneColor, outUV + vec2(-texelSize.x, 0.0)).rgb);
+    vec3 s2 = rgbToYCoCg(texture(sceneColor, outUV + vec2( texelSize.x, 0.0)).rgb);
+    vec3 s3 = rgbToYCoCg(texture(sceneColor, outUV + vec2(0.0, -texelSize.y)).rgb);
+    vec3 s4 = rgbToYCoCg(texture(sceneColor, outUV + vec2(0.0, texelSize.y)).rgb);
+    vec3 s5 = rgbToYCoCg(texture(sceneColor, outUV + vec2(-texelSize.x, -texelSize.y)).rgb);
+    vec3 s6 = rgbToYCoCg(texture(sceneColor, outUV + vec2( texelSize.x, -texelSize.y)).rgb);
+    vec3 s7 = rgbToYCoCg(texture(sceneColor, outUV + vec2(-texelSize.x, texelSize.y)).rgb);
+    vec3 s8 = rgbToYCoCg(texture(sceneColor, outUV + vec2( texelSize.x, texelSize.y)).rgb);
+
+    vec3 m1 = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8; // first moment (sum of samples)
+    vec3 m2 = s0*s0 + s1*s1 + s2*s2 + s3*s3 + s4*s4 + s5*s5 + s6*s6 + s7*s7 + s8*s8; // second moment (sum of squares)
+    vec3 mean = m1 / 9.0;
+    vec3 variance = max(m2 / 9.0 - mean * mean, vec3(0.0)); // E[x^2] - E[x]^2, kept >= 0 before the sqrt
+    vec3 stddev = sqrt(variance);
+
+    float gamma = 1.5; // box half-width in standard deviations
+    vec3 boxMin = mean - gamma * stddev;
+    vec3 boxMax = mean + gamma * stddev;
+
+    vec3 historyYCoCg = rgbToYCoCg(historyColor);
+    vec3 clampedHistory = clamp(historyYCoCg, boxMin, boxMax);
+    historyColor = yCoCgToRgb(clampedHistory);
+
+    float clampDist = length(historyYCoCg - clampedHistory); // how far the clamp moved the history sample
+    float blendFactor = mix(0.05, 0.30, clamp(clampDist * 2.0, 0.0, 1.0)); // weight of the current color: 0.05 .. 0.30
+    blendFactor = mix(blendFactor, 1.0, 1.0 - historyValid); // invalid history -> output the current color only
+
+    vec3 result = mix(historyColor, currentColor, blendFactor);
+    imageStore(historyOutput, outPixel, vec4(result, 1.0));
+}
diff --git a/assets/shaders/fsr2_accumulate.comp.spv b/assets/shaders/fsr2_accumulate.comp.spv
new file mode 100644
index 00000000..47529d75
Binary files /dev/null and b/assets/shaders/fsr2_accumulate.comp.spv differ
diff --git a/assets/shaders/fsr2_motion.comp.glsl
b/assets/shaders/fsr2_motion.comp.glsl
new file mode 100644
index 00000000..b0b39375
--- /dev/null
+++ b/assets/shaders/fsr2_motion.comp.glsl
@@ -0,0 +1,35 @@
+#version 450
+
+layout(local_size_x = 8, local_size_y = 8) in;
+
+layout(set = 0, binding = 0) uniform sampler2D depthBuffer;
+layout(set = 0, binding = 1, rg16f) uniform writeonly image2D motionVectors;
+
+layout(push_constant) uniform PushConstants {
+    mat4 reprojMatrix; // prevUnjitteredVP * inverse(currentUnjitteredVP)
+    vec4 resolution; // xy = internal size, zw = 1/internal size
+} pc;
+
+void main() {
+    ivec2 pixelCoord = ivec2(gl_GlobalInvocationID.xy);
+    ivec2 imgSize = ivec2(pc.resolution.xy);
+    if (pixelCoord.x >= imgSize.x || pixelCoord.y >= imgSize.y) return;
+
+    // Sample depth (Vulkan: 0 = near, 1 = far)
+    float depth = texelFetch(depthBuffer, pixelCoord, 0).r;
+
+    // Pixel center in UV [0,1] and NDC [-1,1]
+    vec2 uv = (vec2(pixelCoord) + 0.5) * pc.resolution.zw;
+    vec2 ndc = uv * 2.0 - 1.0;
+
+    // Clip-to-clip reprojection: current unjittered clip → previous unjittered clip
+    vec4 clipPos = vec4(ndc, depth, 1.0);
+    vec4 prevClip = pc.reprojMatrix * clipPos;
+    vec2 prevNdc = prevClip.xy / prevClip.w; // NOTE(review): no guard for prevClip.w <= 0 (point behind the previous camera) - confirm callers reset history in that case
+    vec2 prevUV = prevNdc * 0.5 + 0.5;
+
+    // Motion = previous position - current position (both unjittered, in UV space)
+    vec2 motion = prevUV - uv;
+
+    imageStore(motionVectors, pixelCoord, vec4(motion, 0.0, 0.0));
+}
diff --git a/assets/shaders/fsr2_motion.comp.spv b/assets/shaders/fsr2_motion.comp.spv
new file mode 100644
index 00000000..faa3d836
Binary files /dev/null and b/assets/shaders/fsr2_motion.comp.spv differ
diff --git a/assets/shaders/fsr2_sharpen.frag.glsl b/assets/shaders/fsr2_sharpen.frag.glsl
new file mode 100644
index 00000000..2c649d22
--- /dev/null
+++ b/assets/shaders/fsr2_sharpen.frag.glsl
@@ -0,0 +1,48 @@
+#version 450
+
+layout(location = 0) in vec2 TexCoord;
+layout(location = 0) out vec4 FragColor;
+
+layout(set = 0, binding = 0) uniform sampler2D inputImage;
+
+layout(push_constant) uniform PushConstants {
+    vec4 params; // x = 1/width, y = 1/height, z = sharpness (0-2), w = unused
+} pc;
+
+void main() {
+    // Undo the vertex shader Y flip (postprocess.vert flips for Vulkan overlay,
+    // but we need standard UV coords for texture sampling)
+    vec2 tc = vec2(TexCoord.x, 1.0 - TexCoord.y);
+
+    vec2 texelSize = pc.params.xy;
+    float sharpness = pc.params.z;
+
+    // RCAS: Robust Contrast-Adaptive Sharpening
+    // 5-tap cross pattern
+    vec3 center = texture(inputImage, tc).rgb;
+    vec3 north = texture(inputImage, tc + vec2(0.0, -texelSize.y)).rgb;
+    vec3 south = texture(inputImage, tc + vec2(0.0, texelSize.y)).rgb;
+    vec3 west = texture(inputImage, tc + vec2(-texelSize.x, 0.0)).rgb;
+    vec3 east = texture(inputImage, tc + vec2( texelSize.x, 0.0)).rgb;
+
+    // Compute local contrast (min/max of neighborhood, center included)
+    vec3 minRGB = min(center, min(min(north, south), min(west, east)));
+    vec3 maxRGB = max(center, max(max(north, south), max(west, east)));
+
+    // Adaptive sharpening weight based on local contrast
+    // High contrast = less sharpening (prevent ringing)
+    vec3 range = maxRGB - minRGB;
+
+    // Sharpening amount: falls off linearly with the largest per-channel range
+    float maxRange = max(range.r, max(range.g, range.b));
+    float w = clamp(1.0 - maxRange * 2.0, 0.0, 1.0) * sharpness * 0.25;
+
+    // Apply sharpening via unsharp mask
+    vec3 avg = (north + south + west + east) * 0.25;
+    vec3 sharpened = center + (center - avg) * w;
+
+    // Clamp to prevent ringing artifacts
+    sharpened = clamp(sharpened, minRGB, maxRGB);
+
+    FragColor = vec4(sharpened, 1.0);
+}
diff --git a/assets/shaders/fsr2_sharpen.frag.spv b/assets/shaders/fsr2_sharpen.frag.spv
new file mode 100644
index 00000000..f9d2394c
Binary files /dev/null and b/assets/shaders/fsr2_sharpen.frag.spv differ
diff --git a/assets/shaders/fsr_easu.frag.glsl b/assets/shaders/fsr_easu.frag.glsl
new file mode
100644
index 00000000..20e5ed32
--- /dev/null
+++ b/assets/shaders/fsr_easu.frag.glsl
@@ -0,0 +1,90 @@
+#version 450
+// FSR 1.0 EASU (Edge Adaptive Spatial Upsampling) — Fragment Shader
+// Based on AMD FidelityFX Super Resolution 1.0
+// Implements edge-adaptive bilinear upsampling (weights are pulled toward the nearest texel on edges)
+
+layout(set = 0, binding = 0) uniform sampler2D uInput;
+
+layout(push_constant) uniform FSRConstants {
+    vec4 con0; // inputSize.xy, 1/inputSize.xy
+    vec4 con1; // inputSize.xy / outputSize.xy, 0.5 * inputSize.xy / outputSize.xy
+    vec4 con2; // outputSize.xy, 1/outputSize.xy
+    vec4 con3; // sharpness, 0, 0, 0
+} fsr;
+
+layout(location = 0) in vec2 TexCoord;
+layout(location = 0) out vec4 outColor;
+
+// Fetch a texel with offset (in input pixels)
+vec3 fsrFetch(vec2 p, vec2 off) {
+    return textureLod(uInput, (p + off + 0.5) * fsr.con0.zw, 0.0).rgb;
+}
+
+void main() {
+    // Undo the vertex shader Y flip (postprocess.vert flips for Vulkan overlay,
+    // but we need standard UV coords for texture sampling)
+    vec2 tc = vec2(TexCoord.x, 1.0 - TexCoord.y);
+
+    // Map output pixel to input space
+    vec2 pp = tc * fsr.con2.xy; // output pixel position
+    vec2 ip = pp * fsr.con1.xy - 0.5; // input pixel position (centered)
+    vec2 fp = floor(ip);
+    vec2 ff = ip - fp;
+
+    // 12-tap footprint: 4x4 texels around the pixel without the four corners
+    //   b c
+    // e f g h
+    // i j k l
+    //   n o
+    vec3 b = fsrFetch(fp, vec2( 0, -1));
+    vec3 c = fsrFetch(fp, vec2( 1, -1));
+    vec3 e = fsrFetch(fp, vec2(-1, 0));
+    vec3 f = fsrFetch(fp, vec2( 0, 0));
+    vec3 g = fsrFetch(fp, vec2( 1, 0));
+    vec3 h = fsrFetch(fp, vec2( 2, 0));
+    vec3 i = fsrFetch(fp, vec2(-1, 1));
+    vec3 j = fsrFetch(fp, vec2( 0, 1));
+    vec3 k = fsrFetch(fp, vec2( 1, 1));
+    vec3 l = fsrFetch(fp, vec2( 2, 1));
+    vec3 n = fsrFetch(fp, vec2( 0, 2));
+    vec3 o = fsrFetch(fp, vec2( 1, 2));
+
+    // Luma of the taps that feed the edge metric (green channel as a perceptual approximation)
+    float bL = b.g, cL = c.g, eL = e.g, fL = f.g;
+    float gL = g.g, iL = i.g, jL = j.g, kL = k.g;
+
+    // Length of the edge in each direction
+    float lenH = abs(eL - fL) + abs(fL - gL) + abs(iL - jL) + abs(jL - kL);
+    float lenV = abs(bL - fL) + abs(fL - jL) + abs(cL - gL) + abs(gL - kL);
+
+    // Bilinear weights
+    float w1 = (1.0 - ff.x) * (1.0 - ff.y);
+    float w2 = ff.x * (1.0 - ff.y);
+    float w3 = (1.0 - ff.x) * ff.y;
+    float w4 = ff.x * ff.y;
+
+    // Edge-aware sharpening: boost weights along edges
+    float sharpness = fsr.con3.x;
+    float edgeStr = max(abs(lenH - lenV) / (lenH + lenV + 1e-7), 0.0);
+    float sharp = mix(0.0, sharpness, edgeStr);
+
+    // Sharpen bilinear by pulling toward nearest texel
+    float maxW = max(max(w1, w2), max(w3, w4));
+    w1 = mix(w1, float(w1 == maxW), sharp * 0.25);
+    w2 = mix(w2, float(w2 == maxW), sharp * 0.25);
+    w3 = mix(w3, float(w3 == maxW), sharp * 0.25);
+    w4 = mix(w4, float(w4 == maxW), sharp * 0.25);
+
+    // Normalize
+    float wSum = w1 + w2 + w3 + w4;
+    w1 /= wSum; w2 /= wSum; w3 /= wSum; w4 /= wSum;
+
+    // Final color: weighted blend of the 4 nearest texels with edge awareness
+    vec3 color = f * w1 + g * w2 + j * w3 + k * w4;
+
+    // Optional: blend in some of the surrounding texels for anti-aliasing
+    float aa = 0.125 * edgeStr;
+    color = mix(color, (b + c + e + h + i + l + n + o) / 8.0, aa * 0.15);
+
+    outColor = vec4(clamp(color, 0.0, 1.0), 1.0);
+}
diff --git a/assets/shaders/fsr_easu.frag.spv b/assets/shaders/fsr_easu.frag.spv
new file mode 100644
index 00000000..5ddc2ea8
Binary files /dev/null and b/assets/shaders/fsr_easu.frag.spv differ
diff --git a/assets/shaders/fsr_rcas.frag.glsl b/assets/shaders/fsr_rcas.frag.glsl
new file mode 100644
index 00000000..a2d0e599
--- /dev/null
+++ b/assets/shaders/fsr_rcas.frag.glsl
@@ -0,0 +1,43 @@
+#version 450
+// FSR 1.0 RCAS (Robust Contrast Adaptive Sharpening) — Fragment Shader
+// Based on AMD FidelityFX Super Resolution 1.0
+// Applies contrast-adaptive sharpening after EASU upscaling
+
+layout(set = 0, binding = 0) uniform sampler2D uInput;
+
+layout(push_constant) uniform RCASConstants {
+    vec4 con0; // 1/outputSize.xy, outputSize.xy
+    vec4 con1; // sharpness (x), 0, 0, 0
+} rcas;
+
+layout(location = 0) in vec2 TexCoord;
+layout(location = 0) out vec4 outColor;
+
+void main() {
+    // Fetch center and 4-neighborhood. NOTE(review): TexCoord is sampled as-is, with no Y-flip undo as in fsr_easu.frag.glsl - confirm intended
+    vec2 texelSize = rcas.con0.xy;
+    vec3 c = texture(uInput, TexCoord).rgb;
+    vec3 n = texture(uInput, TexCoord + vec2( 0, -texelSize.y)).rgb;
+    vec3 s = texture(uInput, TexCoord + vec2( 0, texelSize.y)).rgb;
+    vec3 w = texture(uInput, TexCoord + vec2(-texelSize.x, 0)).rgb;
+    vec3 e = texture(uInput, TexCoord + vec2( texelSize.x, 0)).rgb;
+
+    // Luma of the 4-neighbour ring (green channel approximation)
+    float nL = n.g, sL = s.g, wL = w.g, eL = e.g;
+
+    // Min/max of the ring (the center texel is not part of the contrast estimate)
+    float minL = min(min(nL, sL), min(wL, eL));
+    float maxL = max(max(nL, sL), max(wL, eL));
+
+    // Contrast adaptive sharpening weight
+    // Higher contrast = less sharpening to avoid ringing
+    float contrast = maxL - minL;
+    float sharpness = rcas.con1.x;
+    float w0 = sharpness * (1.0 - smoothstep(0.0, 0.3, contrast));
+
+    // Apply sharpening: center + w0 * (center - average_neighbors)
+    vec3 avg = (n + s + w + e) * 0.25;
+    vec3 sharpened = c + w0 * (c - avg);
+
+    outColor = vec4(clamp(sharpened, 0.0, 1.0), 1.0);
+}
diff --git a/assets/shaders/fsr_rcas.frag.spv b/assets/shaders/fsr_rcas.frag.spv
new file mode 100644
index 00000000..336e7843
Binary files /dev/null and b/assets/shaders/fsr_rcas.frag.spv differ
diff --git a/assets/shaders/wmo.frag.glsl b/assets/shaders/wmo.frag.glsl
index c04e1a93..a4bae057 100644
--- a/assets/shaders/wmo.frag.glsl
+++ b/assets/shaders/wmo.frag.glsl
@@ -149,21 +149,21 @@ void
main() { vec3 norm = vertexNormal; if (enableNormalMap != 0 && lodFactor < 0.99 && normalMapStrength > 0.001) { vec3 mapNormal = texture(uNormalHeightMap, finalUV).rgb * 2.0 - 1.0; - // Scale XY by strength to control effect intensity - mapNormal.xy *= normalMapStrength; mapNormal = normalize(mapNormal); vec3 worldNormal = normalize(TBN * mapNormal); if (!gl_FrontFacing) worldNormal = -worldNormal; - // Blend: strength + LOD both contribute to fade toward vertex normal - float blendFactor = max(lodFactor, 1.0 - normalMapStrength); - norm = normalize(mix(worldNormal, vertexNormal, blendFactor)); + // Linear blend: strength controls how much normal map detail shows, + // LOD fades out at distance. Both multiply for smooth falloff. + float blend = clamp(normalMapStrength, 0.0, 1.0) * (1.0 - lodFactor); + norm = normalize(mix(vertexNormal, worldNormal, blend)); } vec3 result; - // Sample shadow map — skip for interior WMO groups (no sun indoors) + // Sample shadow map for all WMO groups (interior groups with 0x2000 flag + // include covered outdoor areas like archways/streets that should receive shadows) float shadow = 1.0; - if (shadowParams.x > 0.5 && isInterior == 0) { + if (shadowParams.x > 0.5) { vec3 ldir = normalize(-lightDir.xyz); float normalOffset = SHADOW_TEXEL * 2.0 * (1.0 - abs(dot(norm, ldir))); vec3 biasedPos = FragPos + norm * normalOffset; diff --git a/assets/shaders/wmo.frag.spv b/assets/shaders/wmo.frag.spv index 2453f0ff..524dbd1e 100644 Binary files a/assets/shaders/wmo.frag.spv and b/assets/shaders/wmo.frag.spv differ diff --git a/include/core/application.hpp b/include/core/application.hpp index 84b89f32..4d10acc7 100644 --- a/include/core/application.hpp +++ b/include/core/application.hpp @@ -215,7 +215,7 @@ private: std::future future; }; std::vector asyncCreatureLoads_; - void processAsyncCreatureResults(); + void processAsyncCreatureResults(bool unlimited = false); static constexpr int MAX_ASYNC_CREATURE_LOADS = 4; // concurrent background 
loads std::unordered_set deadCreatureGuids_; // GUIDs that should spawn in corpse/death pose std::unordered_map displayIdModelCache_; // displayId → modelId (model caching) @@ -236,6 +236,11 @@ private: std::optional pendingWorldEntry_; // Deferred world entry during loading float taxiLandingClampTimer_ = 0.0f; float worldEntryMovementGraceTimer_ = 0.0f; + + // Hearth teleport: freeze player until terrain loads at destination + bool hearthTeleportPending_ = false; + glm::vec3 hearthTeleportPos_{0.0f}; // render coords + float hearthTeleportTimer_ = 0.0f; // timeout safety float facingSendCooldown_ = 0.0f; // Rate-limits MSG_MOVE_SET_FACING float lastSentCanonicalYaw_ = 1000.0f; // Sentinel — triggers first send float taxiStreamCooldown_ = 0.0f; @@ -373,7 +378,7 @@ private: std::unordered_set pendingPlayerSpawnGuids_; void processPlayerSpawnQueue(); std::unordered_set creaturePermanentFailureGuids_; - void processCreatureSpawnQueue(); + void processCreatureSpawnQueue(bool unlimited = false); struct PendingGameObjectSpawn { uint64_t guid; diff --git a/include/game/game_handler.hpp b/include/game/game_handler.hpp index 8a3ee441..3af2f59a 100644 --- a/include/game/game_handler.hpp +++ b/include/game/game_handler.hpp @@ -565,6 +565,8 @@ public: void unstuck(); void setUnstuckGyCallback(UnstuckCallback cb) { unstuckGyCallback_ = std::move(cb); } void unstuckGy(); + void setUnstuckHearthCallback(UnstuckCallback cb) { unstuckHearthCallback_ = std::move(cb); } + void unstuckHearth(); using BindPointCallback = std::function; void setBindPointCallback(BindPointCallback cb) { bindPointCallback_ = std::move(cb); } @@ -1445,6 +1447,7 @@ private: WorldEntryCallback worldEntryCallback_; UnstuckCallback unstuckCallback_; UnstuckCallback unstuckGyCallback_; + UnstuckCallback unstuckHearthCallback_; BindPointCallback bindPointCallback_; CreatureSpawnCallback creatureSpawnCallback_; CreatureDespawnCallback creatureDespawnCallback_; diff --git a/include/rendering/camera.hpp 
b/include/rendering/camera.hpp index 0464007f..99a4879a 100644 --- a/include/rendering/camera.hpp +++ b/include/rendering/camera.hpp @@ -23,9 +23,16 @@ public: const glm::vec3& getPosition() const { return position; } const glm::mat4& getViewMatrix() const { return viewMatrix; } const glm::mat4& getProjectionMatrix() const { return projectionMatrix; } + const glm::mat4& getUnjitteredProjectionMatrix() const { return unjitteredProjectionMatrix; } glm::mat4 getViewProjectionMatrix() const { return projectionMatrix * viewMatrix; } + glm::mat4 getUnjitteredViewProjectionMatrix() const { return unjitteredProjectionMatrix * viewMatrix; } float getAspectRatio() const { return aspectRatio; } + // Sub-pixel jitter for temporal upscaling (FSR 2) + void setJitter(float jx, float jy); + void clearJitter(); + glm::vec2 getJitter() const { return jitterOffset; } + glm::vec3 getForward() const; glm::vec3 getRight() const; glm::vec3 getUp() const; @@ -46,6 +53,8 @@ private: glm::mat4 viewMatrix = glm::mat4(1.0f); glm::mat4 projectionMatrix = glm::mat4(1.0f); + glm::mat4 unjitteredProjectionMatrix = glm::mat4(1.0f); + glm::vec2 jitterOffset = glm::vec2(0.0f); // NDC jitter (applied to projection) }; } // namespace rendering diff --git a/include/rendering/character_renderer.hpp b/include/rendering/character_renderer.hpp index 83cb3e7f..7a01c0d7 100644 --- a/include/rendering/character_renderer.hpp +++ b/include/rendering/character_renderer.hpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include namespace wowee { namespace pipeline { class AssetManager; } @@ -64,6 +66,8 @@ public: void update(float deltaTime, const glm::vec3& cameraPos = glm::vec3(0.0f)); + /** Pre-allocate GPU resources (bone SSBOs, descriptors) on main thread before parallel render. 
*/ + void prepareRender(uint32_t frameIndex); void render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera); void recreatePipelines(); bool initializeShadow(VkRenderPass shadowRenderPass); @@ -304,15 +308,23 @@ private: std::unique_ptr generateNormalHeightMap( const uint8_t* pixels, uint32_t width, uint32_t height, float& outVariance); - // Deferred normal map generation — avoids stalling loadModel - struct PendingNormalMap { + // Background normal map generation — CPU work on thread pool, GPU upload on main thread + struct NormalMapResult { std::string cacheKey; - std::vector pixels; // RGBA pixel data + std::vector pixels; // RGBA normal map output uint32_t width, height; + float variance; }; - std::deque pendingNormalMaps_; + // Completed results ready for GPU upload (populated by background threads) + std::mutex normalMapResultsMutex_; + std::deque completedNormalMaps_; + std::atomic pendingNormalMapCount_{0}; // in-flight background tasks + + // Pure CPU normal map generation (thread-safe, no GPU access) + static NormalMapResult generateNormalHeightMapCPU( + std::string cacheKey, std::vector pixels, uint32_t width, uint32_t height); public: - void processPendingNormalMaps(int budget = 2); + void processPendingNormalMaps(int budget = 4); private: // Normal mapping / POM settings diff --git a/include/rendering/loading_screen.hpp b/include/rendering/loading_screen.hpp index 5f119676..afd134b9 100644 --- a/include/rendering/loading_screen.hpp +++ b/include/rendering/loading_screen.hpp @@ -24,6 +24,10 @@ public: // Render the loading screen with progress bar and status text (pure ImGui) void render(); + // Draw loading screen as ImGui overlay (call within an existing ImGui frame). + // Used during warmup to overlay loading screen on top of the rendered world. 
+ void renderOverlay(); + void setProgress(float progress) { loadProgress = progress; } void setStatus(const std::string& status) { statusText = status; } diff --git a/include/rendering/m2_renderer.hpp b/include/rendering/m2_renderer.hpp index 1c35e34b..ee7d6ebf 100644 --- a/include/rendering/m2_renderer.hpp +++ b/include/rendering/m2_renderer.hpp @@ -122,6 +122,7 @@ struct M2ModelGPU { bool isKoboldFlame = false; // Model name matches kobold+(candle/torch/mine) (precomputed) bool isLavaModel = false; // Model name contains lava/molten/magma (UV scroll fallback) bool hasTextureAnimation = false; // True if any batch has UV animation + uint8_t availableLODs = 0; // Bitmask: bit N set if any batch has submeshLevel==N // Particle emitter data (kept from M2Model) std::vector particleEmitters; @@ -193,6 +194,7 @@ struct M2Instance { // Frame-skip optimization (update distant animations less frequently) uint8_t frameSkipCounter = 0; + bool bonesDirty[2] = {false, false}; // Per-frame-index: set when bones recomputed, cleared after upload // Per-instance bone SSBO (double-buffered) ::VkBuffer boneBuffer[2] = {}; @@ -265,6 +267,8 @@ public: /** * Render all visible instances (Vulkan) */ + /** Pre-allocate GPU resources (bone SSBOs, descriptors) on main thread before parallel render. */ + void prepareRender(uint32_t frameIndex, const Camera& camera); void render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera); /** @@ -471,9 +475,7 @@ private: static constexpr float SPATIAL_CELL_SIZE = 64.0f; std::unordered_map, GridCellHash> spatialGrid; std::unordered_map instanceIndexById; - mutable std::vector candidateScratch; - mutable std::unordered_set candidateIdScratch; - mutable std::vector collisionTriScratch_; + // Collision scratch buffers are thread_local (see m2_renderer.cpp) for thread-safety. // Collision query profiling (per frame). 
mutable double queryTimeMs = 0.0; diff --git a/include/rendering/renderer.hpp b/include/rendering/renderer.hpp index ab14021c..0058fbdd 100644 --- a/include/rendering/renderer.hpp +++ b/include/rendering/renderer.hpp @@ -4,10 +4,12 @@ #include #include #include +#include #include #include #include #include "rendering/vk_frame_data.hpp" +#include "rendering/vk_utils.hpp" #include "rendering/sky_system.hpp" namespace wowee { @@ -244,7 +246,7 @@ private: glm::vec3 shadowCenter = glm::vec3(0.0f); bool shadowCenterInitialized = false; bool shadowsEnabled = true; - float shadowDistance_ = 72.0f; // Shadow frustum half-extent (default: 72 units) + float shadowDistance_ = 300.0f; // Shadow frustum half-extent (default: 300 units) uint32_t shadowFrameCounter_ = 0; @@ -255,10 +257,20 @@ public: void setShadowsEnabled(bool enabled) { shadowsEnabled = enabled; } bool areShadowsEnabled() const { return shadowsEnabled; } - void setShadowDistance(float dist) { shadowDistance_ = glm::clamp(dist, 40.0f, 200.0f); } + void setShadowDistance(float dist) { shadowDistance_ = glm::clamp(dist, 40.0f, 500.0f); } float getShadowDistance() const { return shadowDistance_; } void setMsaaSamples(VkSampleCountFlagBits samples); + // FSR (FidelityFX Super Resolution) upscaling + void setFSREnabled(bool enabled); + bool isFSREnabled() const { return fsr_.enabled; } + void setFSRQuality(float scaleFactor); // 0.50=Perf, 0.59=Balanced, 0.67=Quality, 0.77=UltraQuality + void setFSRSharpness(float sharpness); // 0.0 - 2.0 + float getFSRScaleFactor() const { return fsr_.scaleFactor; } + float getFSRSharpness() const { return fsr_.sharpness; } + void setFSR2Enabled(bool enabled); + bool isFSR2Enabled() const { return fsr2_.enabled; } + void setWaterRefractionEnabled(bool enabled); bool isWaterRefractionEnabled() const; @@ -312,7 +324,7 @@ private: VmaAllocation selCircleIdxAlloc = VK_NULL_HANDLE; int selCircleVertCount = 0; void initSelectionCircle(); - void renderSelectionCircle(const glm::mat4& view, 
const glm::mat4& projection); + void renderSelectionCircle(const glm::mat4& view, const glm::mat4& projection, VkCommandBuffer overrideCmd = VK_NULL_HANDLE); glm::vec3 selCirclePos{0.0f}; glm::vec3 selCircleColor{1.0f, 0.0f, 0.0f}; float selCircleRadius = 1.5f; @@ -322,7 +334,95 @@ private: VkPipeline overlayPipeline = VK_NULL_HANDLE; VkPipelineLayout overlayPipelineLayout = VK_NULL_HANDLE; void initOverlayPipeline(); - void renderOverlay(const glm::vec4& color); + void renderOverlay(const glm::vec4& color, VkCommandBuffer overrideCmd = VK_NULL_HANDLE); + + // FSR 1.0 upscaling state + struct FSRState { + bool enabled = false; + bool needsRecreate = false; + float scaleFactor = 0.77f; // Ultra Quality default + float sharpness = 0.5f; + uint32_t internalWidth = 0; + uint32_t internalHeight = 0; + + // Off-screen scene target (reduced resolution) + AllocatedImage sceneColor{}; // 1x color (non-MSAA render target / MSAA resolve target) + AllocatedImage sceneDepth{}; // Depth (matches current MSAA sample count) + AllocatedImage sceneMsaaColor{}; // MSAA color target (only when MSAA > 1x) + AllocatedImage sceneDepthResolve{}; // Depth resolve (only when MSAA + depth resolve) + VkFramebuffer sceneFramebuffer = VK_NULL_HANDLE; + VkSampler sceneSampler = VK_NULL_HANDLE; + + // Upscale pipeline + VkPipeline pipeline = VK_NULL_HANDLE; + VkPipelineLayout pipelineLayout = VK_NULL_HANDLE; + VkDescriptorSetLayout descSetLayout = VK_NULL_HANDLE; + VkDescriptorPool descPool = VK_NULL_HANDLE; + VkDescriptorSet descSet = VK_NULL_HANDLE; + }; + FSRState fsr_; + bool initFSRResources(); + void destroyFSRResources(); + void renderFSRUpscale(); + + // FSR 2.2 temporal upscaling state + struct FSR2State { + bool enabled = false; + bool needsRecreate = false; + float scaleFactor = 0.77f; + float sharpness = 0.5f; + uint32_t internalWidth = 0; + uint32_t internalHeight = 0; + + // Off-screen scene targets (internal resolution, no MSAA — FSR2 replaces AA) + AllocatedImage sceneColor{}; + 
AllocatedImage sceneDepth{}; + VkFramebuffer sceneFramebuffer = VK_NULL_HANDLE; + + // Samplers + VkSampler linearSampler = VK_NULL_HANDLE; // For color + VkSampler nearestSampler = VK_NULL_HANDLE; // For depth / motion vectors + + // Motion vector buffer (internal resolution) + AllocatedImage motionVectors{}; + + // History buffers (display resolution, ping-pong) + AllocatedImage history[2]{}; + uint32_t currentHistory = 0; // Output index (0 or 1) + + // Compute pipelines + VkPipeline motionVecPipeline = VK_NULL_HANDLE; + VkPipelineLayout motionVecPipelineLayout = VK_NULL_HANDLE; + VkDescriptorSetLayout motionVecDescSetLayout = VK_NULL_HANDLE; + VkDescriptorPool motionVecDescPool = VK_NULL_HANDLE; + VkDescriptorSet motionVecDescSet = VK_NULL_HANDLE; + + VkPipeline accumulatePipeline = VK_NULL_HANDLE; + VkPipelineLayout accumulatePipelineLayout = VK_NULL_HANDLE; + VkDescriptorSetLayout accumulateDescSetLayout = VK_NULL_HANDLE; + VkDescriptorPool accumulateDescPool = VK_NULL_HANDLE; + VkDescriptorSet accumulateDescSets[2] = {}; // Per ping-pong + + // RCAS sharpening pass (display resolution) + VkPipeline sharpenPipeline = VK_NULL_HANDLE; + VkPipelineLayout sharpenPipelineLayout = VK_NULL_HANDLE; + VkDescriptorSetLayout sharpenDescSetLayout = VK_NULL_HANDLE; + VkDescriptorPool sharpenDescPool = VK_NULL_HANDLE; + VkDescriptorSet sharpenDescSets[2] = {}; + + // Previous frame state for motion vector reprojection + glm::mat4 prevViewProjection = glm::mat4(1.0f); + glm::vec2 prevJitter = glm::vec2(0.0f); + uint32_t frameIndex = 0; + bool needsHistoryReset = true; + }; + FSR2State fsr2_; + bool initFSR2Resources(); + void destroyFSR2Resources(); + void dispatchMotionVectors(); + void dispatchTemporalAccumulate(); + void renderFSR2Sharpen(); + static float halton(uint32_t index, uint32_t base); // Footstep event tracking (animation-driven) uint32_t footstepLastAnimationId = 0; @@ -411,6 +511,36 @@ private: void setupWater1xPass(); void renderReflectionPass(); + // ── 
Multithreaded secondary command buffer recording ── + // Indices into secondaryCmds_ arrays + static constexpr uint32_t SEC_SKY = 0; // sky (main thread) + static constexpr uint32_t SEC_TERRAIN = 1; // terrain (worker 0) + static constexpr uint32_t SEC_WMO = 2; // WMO (worker 1) + static constexpr uint32_t SEC_CHARS = 3; // selection circle + characters (main thread) + static constexpr uint32_t SEC_M2 = 4; // M2 + particles + glow (worker 2) + static constexpr uint32_t SEC_POST = 5; // water + weather + effects (main thread) + static constexpr uint32_t SEC_IMGUI = 6; // ImGui (main thread, non-FSR only) + static constexpr uint32_t NUM_SECONDARIES = 7; + static constexpr uint32_t NUM_WORKERS = 3; // terrain, WMO, M2 + + // Per-worker command pools (thread-safe: one pool per thread) + VkCommandPool workerCmdPools_[NUM_WORKERS] = {}; + // Main-thread command pool for its secondary buffers + VkCommandPool mainSecondaryCmdPool_ = VK_NULL_HANDLE; + // Pre-allocated secondary command buffers [secondaryIndex][frameInFlight] + VkCommandBuffer secondaryCmds_[NUM_SECONDARIES][MAX_FRAMES] = {}; + + bool parallelRecordingEnabled_ = false; // set true after pools/buffers created + bool createSecondaryCommandResources(); + void destroySecondaryCommandResources(); + VkCommandBuffer beginSecondary(uint32_t secondaryIndex); + void setSecondaryViewportScissor(VkCommandBuffer cmd); + + // Cached render pass state for secondary buffer inheritance + VkRenderPass activeRenderPass_ = VK_NULL_HANDLE; + VkFramebuffer activeFramebuffer_ = VK_NULL_HANDLE; + VkExtent2D activeRenderExtent_ = {0, 0}; + // Active character previews for off-screen rendering std::vector activePreviews_; diff --git a/include/rendering/terrain_manager.hpp b/include/rendering/terrain_manager.hpp index 6f732721..2a746d3e 100644 --- a/include/rendering/terrain_manager.hpp +++ b/include/rendering/terrain_manager.hpp @@ -348,6 +348,7 @@ private: int unloadRadius = 7; // Unload tiles beyond this radius float updateInterval 
= 0.033f; // Check streaming every 33ms (~30 fps) float timeSinceLastUpdate = 0.0f; + float proactiveStreamTimer_ = 0.0f; bool taxiStreamingMode_ = false; // Tile size constants (WoW ADT specifications) diff --git a/include/rendering/vk_context.hpp b/include/rendering/vk_context.hpp index 907e21bf..154a4f98 100644 --- a/include/rendering/vk_context.hpp +++ b/include/rendering/vk_context.hpp @@ -84,6 +84,10 @@ public: bool isSwapchainDirty() const { return swapchainDirty; } void markSwapchainDirty() { swapchainDirty = true; } + // VSync (present mode) + bool isVsyncEnabled() const { return vsync_; } + void setVsync(bool enabled) { vsync_ = enabled; } + bool isDeviceLost() const { return deviceLost_; } // MSAA @@ -145,6 +149,7 @@ private: std::vector swapchainFramebuffers; bool swapchainDirty = false; bool deviceLost_ = false; + bool vsync_ = true; // Per-frame resources FrameData frames[MAX_FRAMES_IN_FLIGHT]; diff --git a/include/rendering/wmo_renderer.hpp b/include/rendering/wmo_renderer.hpp index f0d3b36f..4546d41c 100644 --- a/include/rendering/wmo_renderer.hpp +++ b/include/rendering/wmo_renderer.hpp @@ -148,6 +148,8 @@ public: * @param perFrameSet Per-frame descriptor set (set 0) * @param camera Camera for frustum culling */ + /** Pre-update mutable state (frame ID, material UBOs) on main thread before parallel render. 
*/ + void prepareRender(); void render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera); /** @@ -332,6 +334,9 @@ public: // Defer normal/height map generation during streaming to avoid CPU stalls void setDeferNormalMaps(bool defer) { deferNormalMaps_ = defer; } + // Generate normal/height maps for cached textures that were loaded while deferred + void backfillNormalMaps(); + private: // WMO material UBO — matches WMOMaterial in wmo.frag.glsl struct WMOMaterialUBO { @@ -706,9 +711,7 @@ private: static constexpr float SPATIAL_CELL_SIZE = 64.0f; std::unordered_map, GridCellHash> spatialGrid; std::unordered_map instanceIndexById; - mutable std::vector candidateScratch; - mutable std::vector triScratch_; // Scratch for collision grid queries - mutable std::unordered_set candidateIdScratch; + // Collision scratch buffers are thread_local (see wmo_renderer.cpp) for thread-safety. // Parallel visibility culling uint32_t numCullThreads_ = 1; @@ -720,6 +723,8 @@ private: uint32_t distanceCulled = 0; }; std::vector> cullFutures_; + std::vector visibleInstances_; // reused per frame + std::vector drawLists_; // reused per frame // Collision query profiling (per frame). 
mutable double queryTimeMs = 0.0; diff --git a/include/ui/game_screen.hpp b/include/ui/game_screen.hpp index 7e428523..3bb99628 100644 --- a/include/ui/game_screen.hpp +++ b/include/ui/game_screen.hpp @@ -87,7 +87,7 @@ private: bool pendingVsync = false; int pendingResIndex = 0; bool pendingShadows = true; - float pendingShadowDistance = 72.0f; + float pendingShadowDistance = 300.0f; bool pendingWaterRefraction = false; int pendingMasterVolume = 100; int pendingMusicVolume = 30; @@ -116,6 +116,10 @@ private: float pendingNormalMapStrength = 0.8f; // 0.0-2.0 bool pendingPOM = true; // on by default int pendingPOMQuality = 1; // 0=Low(16), 1=Medium(32), 2=High(64) + bool pendingFSR = false; + int pendingFSRQuality = 0; // 0=UltraQuality, 1=Quality, 2=Balanced, 3=Performance + float pendingFSRSharpness = 0.5f; + bool fsrSettingsApplied_ = false; // UI element transparency (0.0 = fully transparent, 1.0 = fully opaque) float uiOpacity_ = 0.65f; diff --git a/src/core/application.cpp b/src/core/application.cpp index 1a239d8a..f9ac557c 100644 --- a/src/core/application.cpp +++ b/src/core/application.cpp @@ -49,9 +49,9 @@ #include // GL/glew.h removed — Vulkan migration Phase 1 #include +#include #include #include -#include #include #include #include @@ -868,7 +868,7 @@ void Application::update(float deltaTime) { } auto stageEnd = std::chrono::steady_clock::now(); float stageMs = std::chrono::duration(stageEnd - stageStart).count(); - if (stageMs > 3.0f) { + if (stageMs > 50.0f) { LOG_WARNING("SLOW update stage '", stageName, "': ", stageMs, "ms"); } }; @@ -913,29 +913,12 @@ void Application::update(float deltaTime) { inGameStep = "spawn/equipment queues"; updateCheckpoint = "in_game: spawn/equipment queues"; runInGameStage("spawn/equipment queues", [&] { - auto t0 = std::chrono::steady_clock::now(); processPlayerSpawnQueue(); - auto t1 = std::chrono::steady_clock::now(); processCreatureSpawnQueue(); - auto t2 = std::chrono::steady_clock::now(); 
processAsyncNpcCompositeResults(); - auto t3 = std::chrono::steady_clock::now(); processDeferredEquipmentQueue(); - auto t4 = std::chrono::steady_clock::now(); - // Process deferred normal maps (2 per frame to spread CPU cost) if (auto* cr = renderer ? renderer->getCharacterRenderer() : nullptr) { - cr->processPendingNormalMaps(2); - } - auto t5 = std::chrono::steady_clock::now(); - float pMs = std::chrono::duration(t1 - t0).count(); - float cMs = std::chrono::duration(t2 - t1).count(); - float nMs = std::chrono::duration(t3 - t2).count(); - float eMs = std::chrono::duration(t4 - t3).count(); - float nmMs = std::chrono::duration(t5 - t4).count(); - float total = pMs + cMs + nMs + eMs + nmMs; - if (total > 4.0f) { - LOG_WARNING("spawn/equip breakdown: player=", pMs, "ms creature=", cMs, - "ms npcComposite=", nMs, "ms equip=", eMs, "ms normalMaps=", nmMs, "ms"); + cr->processPendingNormalMaps(4); } }); // Self-heal missing creature visuals: if a nearby UNIT exists in @@ -1032,14 +1015,33 @@ void Application::update(float deltaTime) { if (renderer && renderer->getCameraController()) renderer->getCameraController()->clearMovementInputs(); } + // Hearth teleport: keep player frozen until terrain loads at destination + if (hearthTeleportPending_ && renderer && renderer->getTerrainManager()) { + hearthTeleportTimer_ -= deltaTime; + auto terrainH = renderer->getTerrainManager()->getHeightAt( + hearthTeleportPos_.x, hearthTeleportPos_.y); + if (terrainH || hearthTeleportTimer_ <= 0.0f) { + // Terrain loaded (or timeout) — snap to floor and release + if (terrainH) { + hearthTeleportPos_.z = *terrainH + 0.5f; + renderer->getCameraController()->teleportTo(hearthTeleportPos_); + } + renderer->getCameraController()->setExternalFollow(false); + worldEntryMovementGraceTimer_ = 1.0f; + hearthTeleportPending_ = false; + LOG_INFO("Unstuck hearth: terrain loaded, player released", + terrainH ? 
"" : " (timeout)"); + } + } if (renderer && renderer->getCameraController()) { const bool externallyDrivenMotion = onTaxi || onWMOTransport || chargeActive_; // Keep physics frozen (externalFollow) during landing clamp when terrain // hasn't loaded yet — prevents gravity from pulling player through void. + bool hearthFreeze = hearthTeleportPending_; bool landingClampActive = !onTaxi && taxiLandingClampTimer_ > 0.0f && worldEntryMovementGraceTimer_ <= 0.0f && !gameHandler->isMounted(); - renderer->getCameraController()->setExternalFollow(externallyDrivenMotion || landingClampActive); + renderer->getCameraController()->setExternalFollow(externallyDrivenMotion || landingClampActive || hearthFreeze); renderer->getCameraController()->setExternalMoving(externallyDrivenMotion); if (externallyDrivenMotion) { // Drop any stale local movement toggles while server drives taxi motion. @@ -1514,7 +1516,7 @@ void Application::update(float deltaTime) { } float ruMs = std::chrono::duration( std::chrono::steady_clock::now() - rendererUpdateStart).count(); - if (ruMs > 5.0f) { + if (ruMs > 50.0f) { LOG_WARNING("SLOW update stage 'renderer->update': ", ruMs, "ms"); } } @@ -1894,9 +1896,43 @@ void Application::setupUICallbacks() { LOG_INFO("Unstuck: high fallback snap"); }); + // /unstuckhearth — teleport to hearthstone bind point (server-synced). + // Freezes player until terrain loads at destination to prevent falling through world. 
+ gameHandler->setUnstuckHearthCallback([this, clearStuckMovement, forceServerTeleportCommand]() { + if (!renderer || !renderer->getCameraController() || !gameHandler) return; + + uint32_t bindMap = 0; + glm::vec3 bindPos(0.0f); + if (!gameHandler->getHomeBind(bindMap, bindPos)) { + LOG_WARNING("Unstuck hearth: no bind point available"); + return; + } + + worldEntryMovementGraceTimer_ = 10.0f; // long grace — terrain load check will clear it + taxiLandingClampTimer_ = 0.0f; + lastTaxiFlight_ = false; + clearStuckMovement(); + + auto* cc = renderer->getCameraController(); + glm::vec3 renderPos = core::coords::canonicalToRender(bindPos); + renderPos.z += 2.0f; + + // Freeze player in place (no gravity/movement) until terrain loads + cc->teleportTo(renderPos); + cc->setExternalFollow(true); + forceServerTeleportCommand(renderPos); + clearStuckMovement(); + + // Set pending state — update loop will unfreeze once terrain is loaded + hearthTeleportPending_ = true; + hearthTeleportPos_ = renderPos; + hearthTeleportTimer_ = 15.0f; // 15s safety timeout + LOG_INFO("Unstuck hearth: teleporting to bind point, waiting for terrain..."); + }); + // Auto-unstuck: falling for > 5 seconds = void fall, teleport to map entry if (renderer->getCameraController()) { - renderer->getCameraController()->setAutoUnstuckCallback([this]() { + renderer->getCameraController()->setAutoUnstuckCallback([this, forceServerTeleportCommand]() { if (!renderer || !renderer->getCameraController()) return; auto* cc = renderer->getCameraController(); @@ -1904,7 +1940,8 @@ void Application::setupUICallbacks() { glm::vec3 spawnPos = cc->getDefaultPosition(); spawnPos.z += 5.0f; cc->teleportTo(spawnPos); - LOG_INFO("Auto-unstuck: teleported to map entry point"); + forceServerTeleportCommand(spawnPos); + LOG_INFO("Auto-unstuck: teleported to map entry point (server synced)"); }); } @@ -4167,11 +4204,17 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float }); } - // Hide 
first-login hitch by draining initial world packets/spawn queues before - // dropping the loading screen. Keep this bounded so we don't stall indefinitely. + // Keep the loading screen visible until all spawn/equipment/gameobject queues + // are fully drained. This ensures the player sees a fully populated world + // (character clothed, NPCs placed, game objects loaded) when the screen drops. { - const float kWarmupMaxSeconds = 2.5f; + const float kMinWarmupSeconds = 2.0f; // minimum time to drain network packets + const float kMaxWarmupSeconds = 15.0f; // hard cap to avoid infinite stall const auto warmupStart = std::chrono::high_resolution_clock::now(); + // Track consecutive idle iterations (all queues empty) to detect convergence + int idleIterations = 0; + const int kIdleThreshold = 5; // require 5 consecutive empty loops (~80ms) + while (true) { SDL_Event event; while (SDL_PollEvent(&event)) { @@ -4185,7 +4228,6 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float int w = event.window.data1; int h = event.window.data2; window->setSize(w, h); - // Vulkan viewport set in command buffer if (renderer && renderer->getCamera()) { renderer->getCamera()->setAspectRatio(static_cast(w) / h); } @@ -4207,60 +4249,18 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float processPlayerSpawnQueue(); // During load screen warmup: lift per-frame budgets so GPU uploads - // happen in bulk while the loading screen is still visible. - // Process ALL async creature model uploads (no 3-per-frame cap). 
- { - for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) { - if (!it->future.valid() || - it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) { - ++it; - continue; - } - auto result = it->future.get(); - it = asyncCreatureLoads_.erase(it); - if (result.permanent_failure) { - nonRenderableCreatureDisplayIds_.insert(result.displayId); - creaturePermanentFailureGuids_.insert(result.guid); - pendingCreatureSpawnGuids_.erase(result.guid); - creatureSpawnRetryCounts_.erase(result.guid); - continue; - } - if (!result.valid || !result.model) { - pendingCreatureSpawnGuids_.erase(result.guid); - creatureSpawnRetryCounts_.erase(result.guid); - continue; - } - auto* charRenderer = renderer ? renderer->getCharacterRenderer() : nullptr; - if (!charRenderer) { pendingCreatureSpawnGuids_.erase(result.guid); continue; } - if (!charRenderer->loadModel(*result.model, result.modelId)) { - nonRenderableCreatureDisplayIds_.insert(result.displayId); - creaturePermanentFailureGuids_.insert(result.guid); - pendingCreatureSpawnGuids_.erase(result.guid); - creatureSpawnRetryCounts_.erase(result.guid); - continue; - } - displayIdModelCache_[result.displayId] = result.modelId; - pendingCreatureSpawnGuids_.erase(result.guid); - creatureSpawnRetryCounts_.erase(result.guid); - if (!creatureInstances_.count(result.guid) && - !creaturePermanentFailureGuids_.count(result.guid)) { - PendingCreatureSpawn s{}; - s.guid = result.guid; s.displayId = result.displayId; - s.x = result.x; s.y = result.y; s.z = result.z; - s.orientation = result.orientation; - pendingCreatureSpawns_.push_back(s); - pendingCreatureSpawnGuids_.insert(result.guid); - } - } - } - processCreatureSpawnQueue(); + // and spawns happen in bulk while the loading screen is still visible. 
+ processCreatureSpawnQueue(true); processAsyncNpcCompositeResults(); - processDeferredEquipmentQueue(); + // Process equipment queue more aggressively during warmup (multiple per iteration) + for (int i = 0; i < 8 && (!deferredEquipmentQueue_.empty() || !asyncEquipmentLoads_.empty()); i++) { + processDeferredEquipmentQueue(); + } if (auto* cr = renderer ? renderer->getCharacterRenderer() : nullptr) { - cr->processPendingNormalMaps(10); // higher budget during load screen + cr->processPendingNormalMaps(INT_MAX); } - // Process ALL pending game object spawns (no 1-per-frame cap during load screen). + // Process ALL pending game object spawns. while (!pendingGameObjectSpawns_.empty()) { auto& s = pendingGameObjectSpawns_.front(); spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation); @@ -4271,14 +4271,42 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float processPendingMount(); updateQuestMarkers(); + // Update renderer (terrain streaming, animations) + if (renderer) { + renderer->update(1.0f / 60.0f); + } + const auto now = std::chrono::high_resolution_clock::now(); const float elapsed = std::chrono::duration(now - warmupStart).count(); - const float t = std::clamp(elapsed / kWarmupMaxSeconds, 0.0f, 1.0f); - showProgress("Finalizing world sync...", 0.97f + t * 0.025f); - if (elapsed >= kWarmupMaxSeconds) { + // Check if all queues are drained + bool queuesEmpty = + pendingCreatureSpawns_.empty() && + asyncCreatureLoads_.empty() && + asyncNpcCompositeLoads_.empty() && + deferredEquipmentQueue_.empty() && + asyncEquipmentLoads_.empty() && + pendingGameObjectSpawns_.empty() && + asyncGameObjectLoads_.empty() && + pendingPlayerSpawns_.empty(); + + if (queuesEmpty) { + idleIterations++; + } else { + idleIterations = 0; + } + + // Exit when: (min time passed AND queues drained for several iterations) OR hard cap + bool readyToExit = (elapsed >= kMinWarmupSeconds && idleIterations >= kIdleThreshold); + if 
(readyToExit || elapsed >= kMaxWarmupSeconds) { + if (elapsed >= kMaxWarmupSeconds) { + LOG_WARNING("Warmup hit hard cap (", kMaxWarmupSeconds, "s), entering world with pending work"); + } break; } + + const float t = std::clamp(elapsed / kMaxWarmupSeconds, 0.0f, 1.0f); + showProgress("Finalizing world sync...", 0.97f + t * 0.025f); SDL_Delay(16); } } @@ -5154,7 +5182,7 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x { auto texEnd = std::chrono::steady_clock::now(); float texMs = std::chrono::duration(texEnd - texStart).count(); - if (texMs > 3.0f) { + if (texMs > 50.0f) { LOG_WARNING("spawnCreature texture setup took ", texMs, "ms displayId=", displayId, " hasPreDec=", hasPreDec, " extra=", dispData.extraDisplayId); } @@ -6804,9 +6832,10 @@ void Application::spawnOnlineGameObject(uint64_t guid, uint32_t entry, uint32_t " displayId=", displayId, " at (", x, ", ", y, ", ", z, ")"); } -void Application::processAsyncCreatureResults() { +void Application::processAsyncCreatureResults(bool unlimited) { // Check completed async model loads and finalize on main thread (GPU upload + instance creation). // Limit GPU model uploads per frame to avoid spikes, but always drain cheap bookkeeping. + // In unlimited mode (load screen), process all pending uploads without cap. static constexpr int kMaxModelUploadsPerFrame = 1; int modelUploads = 0; @@ -6819,9 +6848,7 @@ void Application::processAsyncCreatureResults() { // Peek: if this result needs a NEW model upload (not cached) and we've hit // the upload budget, defer to next frame without consuming the future. - if (modelUploads >= kMaxModelUploadsPerFrame) { - // Check if this displayId already has a cached model (cheap spawn, no GPU upload). - // We can't peek the displayId without getting the future, so just break. 
+ if (!unlimited && modelUploads >= kMaxModelUploadsPerFrame) { break; } @@ -6864,7 +6891,7 @@ void Application::processAsyncCreatureResults() { { auto uploadEnd = std::chrono::steady_clock::now(); float uploadMs = std::chrono::duration(uploadEnd - uploadStart).count(); - if (uploadMs > 3.0f) { + if (uploadMs > 100.0f) { LOG_WARNING("charRenderer->loadModel took ", uploadMs, "ms displayId=", result.displayId, " preDecoded=", result.predecodedTextures.size()); } @@ -6967,17 +6994,18 @@ void Application::processAsyncNpcCompositeResults() { } } -void Application::processCreatureSpawnQueue() { +void Application::processCreatureSpawnQueue(bool unlimited) { auto startTime = std::chrono::steady_clock::now(); // Budget: max 2ms per frame for creature spawning to prevent stutter. + // In unlimited mode (load screen), process everything without budget cap. static constexpr float kSpawnBudgetMs = 2.0f; // First, finalize any async model loads that completed on background threads. - processAsyncCreatureResults(); + processAsyncCreatureResults(unlimited); { auto now = std::chrono::steady_clock::now(); float asyncMs = std::chrono::duration(now - startTime).count(); - if (asyncMs > 3.0f) { + if (asyncMs > 100.0f) { LOG_WARNING("processAsyncCreatureResults took ", asyncMs, "ms"); } } @@ -6992,11 +7020,11 @@ void Application::processCreatureSpawnQueue() { int asyncLaunched = 0; size_t rotationsLeft = pendingCreatureSpawns_.size(); while (!pendingCreatureSpawns_.empty() && - processed < MAX_SPAWNS_PER_FRAME && + (unlimited || processed < MAX_SPAWNS_PER_FRAME) && rotationsLeft > 0) { // Check time budget every iteration (including first — async results may // have already consumed the budget via GPU model uploads). 
- { + if (!unlimited) { auto now = std::chrono::steady_clock::now(); float elapsedMs = std::chrono::duration(now - startTime).count(); if (elapsedMs >= kSpawnBudgetMs) break; @@ -7017,7 +7045,8 @@ void Application::processCreatureSpawnQueue() { // For new models: launch async load on background thread instead of blocking. if (needsNewModel) { - if (static_cast(asyncCreatureLoads_.size()) + asyncLaunched >= MAX_ASYNC_CREATURE_LOADS) { + const int maxAsync = unlimited ? (MAX_ASYNC_CREATURE_LOADS * 4) : MAX_ASYNC_CREATURE_LOADS; + if (static_cast(asyncCreatureLoads_.size()) + asyncLaunched >= maxAsync) { // Too many in-flight — defer to next frame pendingCreatureSpawns_.push_back(s); rotationsLeft--; @@ -7273,7 +7302,7 @@ void Application::processCreatureSpawnQueue() { spawnOnlineCreature(s.guid, s.displayId, s.x, s.y, s.z, s.orientation); auto spawnEnd = std::chrono::steady_clock::now(); float spawnMs = std::chrono::duration(spawnEnd - spawnStart).count(); - if (spawnMs > 3.0f) { + if (spawnMs > 100.0f) { LOG_WARNING("spawnOnlineCreature took ", spawnMs, "ms displayId=", s.displayId); } } diff --git a/src/core/window.cpp b/src/core/window.cpp index eed83c97..9f74a81c 100644 --- a/src/core/window.cpp +++ b/src/core/window.cpp @@ -84,6 +84,7 @@ bool Window::initialize() { // Initialize Vulkan context vkContext = std::make_unique(); + vkContext->setVsync(vsync); if (!vkContext->initialize(window)) { LOG_ERROR("Failed to initialize Vulkan context"); return false; @@ -158,11 +159,13 @@ void Window::setFullscreen(bool enable) { } } -void Window::setVsync([[maybe_unused]] bool enable) { - // VSync in Vulkan is controlled by present mode (set at swapchain creation) - // For now, store the preference — applied on next swapchain recreation +void Window::setVsync(bool enable) { vsync = enable; - LOG_INFO("VSync preference set to ", enable ? 
"on" : "off", " (applied on swapchain recreation)"); + if (vkContext) { + vkContext->setVsync(enable); + vkContext->markSwapchainDirty(); + } + LOG_INFO("VSync ", enable ? "enabled" : "disabled"); } void Window::applyResolution(int w, int h) { diff --git a/src/game/game_handler.cpp b/src/game/game_handler.cpp index 9a7aed97..3cd05d3c 100644 --- a/src/game/game_handler.cpp +++ b/src/game/game_handler.cpp @@ -11435,6 +11435,15 @@ void GameHandler::unstuckGy() { } } +void GameHandler::unstuckHearth() { + if (unstuckHearthCallback_) { + unstuckHearthCallback_(); + addSystemChatMessage("Unstuck: teleported to hearthstone location."); + } else { + addSystemChatMessage("No hearthstone bind point set."); + } +} + void GameHandler::handleLootResponse(network::Packet& packet) { if (!LootResponseParser::parse(packet, currentLoot)) return; lootWindowOpen = true; diff --git a/src/rendering/camera.cpp b/src/rendering/camera.cpp index f8b45f3c..bd1ebe0a 100644 --- a/src/rendering/camera.cpp +++ b/src/rendering/camera.cpp @@ -20,6 +20,13 @@ void Camera::updateProjectionMatrix() { projectionMatrix = glm::perspective(glm::radians(fov), aspectRatio, nearPlane, farPlane); // Vulkan clip-space has Y pointing down; flip the projection's Y axis. 
projectionMatrix[1][1] *= -1.0f; + unjitteredProjectionMatrix = projectionMatrix; + + // Re-apply jitter if active + if (jitterOffset.x != 0.0f || jitterOffset.y != 0.0f) { + projectionMatrix[2][0] += jitterOffset.x; + projectionMatrix[2][1] += jitterOffset.y; + } } glm::vec3 Camera::getForward() const { @@ -40,6 +47,21 @@ glm::vec3 Camera::getUp() const { return glm::normalize(glm::cross(getRight(), getForward())); } +void Camera::setJitter(float jx, float jy) { + // Remove old jitter, apply new + projectionMatrix[2][0] -= jitterOffset.x; + projectionMatrix[2][1] -= jitterOffset.y; + jitterOffset = glm::vec2(jx, jy); + projectionMatrix[2][0] += jitterOffset.x; + projectionMatrix[2][1] += jitterOffset.y; +} + +void Camera::clearJitter() { + projectionMatrix[2][0] -= jitterOffset.x; + projectionMatrix[2][1] -= jitterOffset.y; + jitterOffset = glm::vec2(0.0f); +} + Ray Camera::screenToWorldRay(float screenX, float screenY, float screenW, float screenH) const { float ndcX = (2.0f * screenX / screenW) - 1.0f; // Vulkan Y-flip is baked into projectionMatrix, so NDC Y maps directly: diff --git a/src/rendering/camera_controller.cpp b/src/rendering/camera_controller.cpp index 4103cc9f..891d53ba 100644 --- a/src/rendering/camera_controller.cpp +++ b/src/rendering/camera_controller.cpp @@ -1,5 +1,6 @@ #include "rendering/camera_controller.hpp" #include +#include #include #include "rendering/terrain_manager.hpp" #include "rendering/wmo_renderer.hpp" @@ -808,25 +809,53 @@ void CameraController::update(float deltaTime) { if (useCached) { groundH = cachedFloorHeight_; } else { - // Full collision check + // Full collision check — run terrain/WMO/M2 queries in parallel std::optional terrainH; std::optional wmoH; std::optional m2H; - if (terrainManager) { - terrainH = terrainManager->getHeightAt(targetPos.x, targetPos.y); - } // When airborne, anchor probe to last ground level so the // ceiling doesn't rise with the jump and catch roof geometry. float wmoBaseZ = grounded ? 
std::max(targetPos.z, lastGroundZ) : lastGroundZ; float wmoProbeZ = wmoBaseZ + stepUpBudget + 0.5f; float wmoNormalZ = 1.0f; + + // Launch WMO + M2 floor queries asynchronously while terrain runs on this thread. + // Collision scratch buffers are thread_local so concurrent calls are safe. + using FloorResult = std::pair, float>; + std::future wmoFuture; + std::future m2Future; + bool wmoAsync = false, m2Async = false; + float px = targetPos.x, py = targetPos.y; if (wmoRenderer) { - wmoH = wmoRenderer->getFloorHeight(targetPos.x, targetPos.y, wmoProbeZ, &wmoNormalZ); + wmoAsync = true; + wmoFuture = std::async(std::launch::async, + [this, px, py, wmoProbeZ]() -> FloorResult { + float nz = 1.0f; + auto h = wmoRenderer->getFloorHeight(px, py, wmoProbeZ, &nz); + return {h, nz}; + }); } if (m2Renderer && !externalFollow_) { - float m2NormalZ = 1.0f; - m2H = m2Renderer->getFloorHeight(targetPos.x, targetPos.y, wmoProbeZ, &m2NormalZ); - if (m2H && m2NormalZ < MIN_WALKABLE_NORMAL_M2) { + m2Async = true; + m2Future = std::async(std::launch::async, + [this, px, py, wmoProbeZ]() -> FloorResult { + float nz = 1.0f; + auto h = m2Renderer->getFloorHeight(px, py, wmoProbeZ, &nz); + return {h, nz}; + }); + } + if (terrainManager) { + terrainH = terrainManager->getHeightAt(targetPos.x, targetPos.y); + } + if (wmoAsync) { + auto [h, nz] = wmoFuture.get(); + wmoH = h; + wmoNormalZ = nz; + } + if (m2Async) { + auto [h, nz] = m2Future.get(); + m2H = h; + if (m2H && nz < MIN_WALKABLE_NORMAL_M2) { m2H = std::nullopt; } } diff --git a/src/rendering/character_renderer.cpp b/src/rendering/character_renderer.cpp index baaaf3e6..f69ae75c 100644 --- a/src/rendering/character_renderer.cpp +++ b/src/rendering/character_renderer.cpp @@ -332,6 +332,11 @@ void CharacterRenderer::shutdown() { LOG_INFO("CharacterRenderer::shutdown instances=", instances.size(), " models=", models.size(), " override=", (void*)renderPassOverride_); + // Wait for any in-flight background normal map generation threads + 
while (pendingNormalMapCount_.load(std::memory_order_relaxed) > 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + vkDeviceWaitIdle(vkCtx_->getDevice()); VkDevice device = vkCtx_->getDevice(); VmaAllocator alloc = vkCtx_->getAllocator(); @@ -413,6 +418,16 @@ void CharacterRenderer::clear() { LOG_INFO("CharacterRenderer::clear instances=", instances.size(), " models=", models.size()); + // Wait for any in-flight background normal map generation threads + while (pendingNormalMapCount_.load(std::memory_order_relaxed) > 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + // Discard any completed results that haven't been uploaded + { + std::lock_guard lock(normalMapResultsMutex_); + completedNormalMaps_.clear(); + } + vkDeviceWaitIdle(vkCtx_->getDevice()); VkDevice device = vkCtx_->getDevice(); @@ -509,7 +524,32 @@ std::unique_ptr CharacterRenderer::generateNormalHeightMap( const uint8_t* pixels, uint32_t width, uint32_t height, float& outVariance) { if (!vkCtx_ || width == 0 || height == 0) return nullptr; + // Use the CPU-only static method, then upload to GPU + std::vector dummy(width * height * 4); + std::memcpy(dummy.data(), pixels, dummy.size()); + auto result = generateNormalHeightMapCPU("", std::move(dummy), width, height); + outVariance = result.variance; + + auto tex = std::make_unique(); + if (!tex->upload(*vkCtx_, result.pixels.data(), width, height, VK_FORMAT_R8G8B8A8_UNORM, true)) { + return nullptr; + } + tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR, + VK_SAMPLER_ADDRESS_MODE_REPEAT); + return tex; +} + +// Static, thread-safe CPU-only normal map generation (no GPU access) +CharacterRenderer::NormalMapResult CharacterRenderer::generateNormalHeightMapCPU( + std::string cacheKey, std::vector srcPixels, uint32_t width, uint32_t height) { + NormalMapResult result; + result.cacheKey = std::move(cacheKey); + result.width = width; + result.height = height; + result.variance = 0.0f; + const 
uint32_t totalPixels = width * height; + const uint8_t* pixels = srcPixels.data(); // Step 1: Compute height from luminance std::vector heightMap(totalPixels); @@ -524,7 +564,7 @@ std::unique_ptr CharacterRenderer::generateNormalHeightMap( sumH2 += h * h; } double mean = sumH / totalPixels; - outVariance = static_cast(sumH2 / totalPixels - mean * mean); + result.variance = static_cast(sumH2 / totalPixels - mean * mean); // Step 1.5: Box blur the height map to reduce noise from diffuse textures auto wrapSample = [&](const std::vector& map, int x, int y) -> float { @@ -545,11 +585,9 @@ std::unique_ptr CharacterRenderer::generateNormalHeightMap( } } - // Step 2: Sobel 3x3 → normal map (crisp detail from original, blurred for POM alpha) - // Higher strength than WMO (2.0) because character/weapon textures are hand-painted - // with baked-in lighting that produces low-contrast gradients in the Sobel filter. + // Step 2: Sobel 3x3 → normal map const float strength = 5.0f; - std::vector output(totalPixels * 4); + result.pixels.resize(totalPixels * 4); auto sampleH = [&](int x, int y) -> float { x = ((x % (int)width) + (int)width) % (int)width; @@ -573,20 +611,14 @@ std::unique_ptr CharacterRenderer::generateNormalHeightMap( if (len > 0.0f) { nx /= len; ny /= len; nz /= len; } uint32_t idx = (y * width + x) * 4; - output[idx + 0] = static_cast(std::clamp((nx * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f)); - output[idx + 1] = static_cast(std::clamp((ny * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f)); - output[idx + 2] = static_cast(std::clamp((nz * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f)); - output[idx + 3] = static_cast(std::clamp(blurredHeight[y * width + x] * 255.0f, 0.0f, 255.0f)); + result.pixels[idx + 0] = static_cast(std::clamp((nx * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f)); + result.pixels[idx + 1] = static_cast(std::clamp((ny * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f)); + result.pixels[idx + 2] = static_cast(std::clamp((nz * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f)); + result.pixels[idx + 3] = 
static_cast(std::clamp(blurredHeight[y * width + x] * 255.0f, 0.0f, 255.0f)); } } - auto tex = std::make_unique(); - if (!tex->upload(*vkCtx_, output.data(), width, height, VK_FORMAT_R8G8B8A8_UNORM, true)) { - return nullptr; - } - tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR, - VK_SAMPLER_ADDRESS_MODE_REPEAT); - return tex; + return result; } VkTexture* CharacterRenderer::loadTexture(const std::string& path) { @@ -687,15 +719,22 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) { e.hasAlpha = hasAlpha; e.colorKeyBlack = colorKeyBlackHint; - // Defer normal/height map generation to avoid stalling loadModel. - // Normal maps are generated in processPendingNormalMaps() at a per-frame budget. + // Launch normal map generation on background thread — CPU work is pure compute, + // only the GPU upload (in processPendingNormalMaps) needs the main thread (~1-2ms). if (blpImage.width >= 32 && blpImage.height >= 32) { - PendingNormalMap pending; - pending.cacheKey = key; - pending.pixels.assign(blpImage.data.begin(), blpImage.data.end()); - pending.width = blpImage.width; - pending.height = blpImage.height; - pendingNormalMaps_.push_back(std::move(pending)); + uint32_t w = blpImage.width, h = blpImage.height; + std::string ck = key; + std::vector px(blpImage.data.begin(), blpImage.data.end()); + pendingNormalMapCount_.fetch_add(1, std::memory_order_relaxed); + auto* self = this; + std::thread([self, ck = std::move(ck), px = std::move(px), w, h]() mutable { + auto result = generateNormalHeightMapCPU(std::move(ck), std::move(px), w, h); + { + std::lock_guard lock(self->normalMapResultsMutex_); + self->completedNormalMaps_.push_back(std::move(result)); + } + self->pendingNormalMapCount_.fetch_sub(1, std::memory_order_relaxed); + }).detach(); e.normalMapPending = true; } @@ -709,30 +748,39 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) { } void CharacterRenderer::processPendingNormalMaps(int budget) { - if 
(pendingNormalMaps_.empty() || !vkCtx_) return; + if (!vkCtx_) return; - int processed = 0; - while (!pendingNormalMaps_.empty() && processed < budget) { - auto pending = std::move(pendingNormalMaps_.front()); - pendingNormalMaps_.pop_front(); + // Collect completed results from background threads + std::deque ready; + { + std::lock_guard lock(normalMapResultsMutex_); + if (completedNormalMaps_.empty()) return; + int count = std::min(budget, static_cast(completedNormalMaps_.size())); + for (int i = 0; i < count; i++) { + ready.push_back(std::move(completedNormalMaps_.front())); + completedNormalMaps_.pop_front(); + } + } - auto it = textureCache.find(pending.cacheKey); + // GPU upload only (~1-2ms each) — CPU work already done on background thread + for (auto& result : ready) { + auto it = textureCache.find(result.cacheKey); if (it == textureCache.end()) continue; // texture was evicted - float nhVariance = 0.0f; vkCtx_->beginUploadBatch(); - auto nhMap = generateNormalHeightMap(pending.pixels.data(), - pending.width, pending.height, nhVariance); - vkCtx_->endUploadBatch(); - - if (nhMap) { - it->second.heightMapVariance = nhVariance; - it->second.approxBytes += approxTextureBytesWithMips(pending.width, pending.height); - textureCacheBytes_ += approxTextureBytesWithMips(pending.width, pending.height); - it->second.normalHeightMap = std::move(nhMap); + auto tex = std::make_unique(); + bool ok = tex->upload(*vkCtx_, result.pixels.data(), result.width, result.height, + VK_FORMAT_R8G8B8A8_UNORM, true); + if (ok) { + tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR, + VK_SAMPLER_ADDRESS_MODE_REPEAT); + it->second.heightMapVariance = result.variance; + it->second.approxBytes += approxTextureBytesWithMips(result.width, result.height); + textureCacheBytes_ += approxTextureBytesWithMips(result.width, result.height); + it->second.normalHeightMap = std::move(tex); } + vkCtx_->endUploadBatch(); it->second.normalMapPending = false; - processed++; } } 
@@ -1876,6 +1924,75 @@ glm::mat4 CharacterRenderer::getBoneTransform(const pipeline::M2Bone& bone, floa
 
 // --- Rendering ---
 
+// Creates, for every instance that has bone matrices, the bone storage buffer
+// (MAX_BONES matrices) and its descriptor set for frame slot `frameIndex`.
+// An instance whose buffer or descriptor set cannot be created is left with no
+// resources for this slot, so the next call tries again.
+void CharacterRenderer::prepareRender(uint32_t frameIndex) {
+    if (instances.empty() || !opaquePipeline_) return;
+
+    // Pre-allocate bone SSBOs + descriptor sets on main thread (pool ops not thread-safe)
+    for (auto& [id, instance] : instances) {
+        int numBones = std::min(static_cast<int>(instance.boneMatrices.size()), MAX_BONES);
+        if (numBones <= 0) continue;
+
+        if (!instance.boneBuffer[frameIndex]) {
+            VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
+            bci.size = MAX_BONES * sizeof(glm::mat4);
+            bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+            VmaAllocationCreateInfo aci{};
+            aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+            aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+            VmaAllocationInfo allocInfo{};
+            VkResult bufRes = vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci,
+                &instance.boneBuffer[frameIndex], &instance.boneAlloc[frameIndex], &allocInfo);
+            if (bufRes != VK_SUCCESS) {
+                // Without a buffer there is nothing valid to bind: skip the descriptor
+                // allocation below so no set is written with a null buffer or leaked.
+                LOG_ERROR("CharacterRenderer::prepareRender: bone buffer alloc failed (instance=",
+                    id, ", frame=", frameIndex, ", vk=", static_cast<int>(bufRes), ")");
+                instance.boneBuffer[frameIndex] = VK_NULL_HANDLE;
+                instance.boneAlloc[frameIndex] = VK_NULL_HANDLE;
+                instance.boneMapped[frameIndex] = nullptr;
+                continue;
+            }
+            instance.boneMapped[frameIndex] = allocInfo.pMappedData;
+
+            VkDescriptorSetAllocateInfo ai{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
+            ai.descriptorPool = boneDescPool_;
+            ai.descriptorSetCount = 1;
+            ai.pSetLayouts = &boneSetLayout_;
+            VkResult dsRes = vkAllocateDescriptorSets(vkCtx_->getDevice(), &ai, &instance.boneSet[frameIndex]);
+            if (dsRes != VK_SUCCESS) {
+                LOG_ERROR("CharacterRenderer::prepareRender: bone descriptor alloc failed (instance=",
+                    id, ", frame=", frameIndex, ", vk=", static_cast<int>(dsRes), ")");
+                if (instance.boneBuffer[frameIndex]) {
+                    vmaDestroyBuffer(vkCtx_->getAllocator(),
+                        instance.boneBuffer[frameIndex], instance.boneAlloc[frameIndex]);
+                    instance.boneBuffer[frameIndex] = VK_NULL_HANDLE;
+                    instance.boneAlloc[frameIndex] = VK_NULL_HANDLE;
+                    instance.boneMapped[frameIndex] = nullptr;
+                }
+                continue;
+            }
+
+            if (instance.boneSet[frameIndex]) {
+                VkDescriptorBufferInfo bufInfo{};
+                bufInfo.buffer = instance.boneBuffer[frameIndex];
+                bufInfo.offset = 0;
+                bufInfo.range = bci.size;
+                VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
+                write.dstSet = instance.boneSet[frameIndex];
+                write.dstBinding = 0;
+                write.descriptorCount = 1;
+                write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+                write.pBufferInfo = &bufInfo;
+                vkUpdateDescriptorSets(vkCtx_->getDevice(), 1, &write, 0, nullptr);
+            }
+        }
+    }
+}
+
 void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, [[maybe_unused]] const Camera& camera) {
     if (instances.empty() || !opaquePipeline_) {
         return;
diff --git a/src/rendering/loading_screen.cpp b/src/rendering/loading_screen.cpp
index 34ad1aa6..a2e83a2b 100644
--- a/src/rendering/loading_screen.cpp
+++ b/src/rendering/loading_screen.cpp
@@ -240,6 +240,66 @@ bool LoadingScreen::loadImage(const std::string& path) {
     return true;
 }
 
+void LoadingScreen::renderOverlay() {
+    // Draw loading screen content as ImGui overlay within an existing ImGui frame.
+    // Caller is responsible for ImGui NewFrame/Render and Vulkan frame management.
+    ImGuiIO& io = ImGui::GetIO();
+    float screenW = io.DisplaySize.x;
+    float screenH = io.DisplaySize.y;
+
+    ImGui::SetNextWindowPos(ImVec2(0, 0));
+    ImGui::SetNextWindowSize(ImVec2(screenW, screenH));
+    ImGui::Begin("##LoadingScreenOverlay", nullptr,
+        ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoResize |
+        ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoScrollbar |
+        ImGuiWindowFlags_NoInputs | ImGuiWindowFlags_NoBackground |
+        ImGuiWindowFlags_NoBringToFrontOnFocus);
+
+    if (bgDescriptorSet) {
+        ImGui::GetWindowDrawList()->AddImage(
+            reinterpret_cast<ImTextureID>(bgDescriptorSet),
+            ImVec2(0, 0), ImVec2(screenW, screenH));
+    }
+
+    // Progress bar
+    {
+        const float barWidthFrac = 0.6f;
+        const float barHeight = 6.0f;
+        const float barY = screenH * 0.06f;
+        float barX = screenW * (0.5f - barWidthFrac * 0.5f);
+        float barW = screenW * barWidthFrac;
+        ImDrawList* drawList = ImGui::GetWindowDrawList();
+        drawList->AddRectFilled(ImVec2(barX, barY), ImVec2(barX + barW, barY + barHeight),
+            IM_COL32(25, 25, 25, 200), 2.0f);
+        if (loadProgress > 0.001f) {
+            drawList->AddRectFilled(ImVec2(barX, barY), ImVec2(barX + barW * loadProgress, barY + barHeight),
+                IM_COL32(199, 156, 33, 255), 2.0f);
+        }
+        drawList->AddRect(ImVec2(barX - 1, barY - 1), ImVec2(barX + barW + 1, barY + barHeight + 1),
+            IM_COL32(140, 110, 25, 255), 2.0f);
+    }
+
+    // Percentage text
+    {
+        char pctBuf[32];
+        snprintf(pctBuf, sizeof(pctBuf), "%d%%", static_cast<int>(loadProgress * 100.0f));
+        float textY = screenH * 0.06f - 20.0f;
+        ImVec2 pctSize = ImGui::CalcTextSize(pctBuf);
+        ImGui::SetCursorPos(ImVec2((screenW - pctSize.x) * 0.5f, textY));
+        ImGui::TextColored(ImVec4(0.0f, 0.0f, 0.0f, 1.0f), "%s", pctBuf);
+    }
+
+    // Status text
+    {
+        float statusY = screenH * 0.06f + 14.0f;
+        ImVec2 statusSize = ImGui::CalcTextSize(statusText.c_str());
+        ImGui::SetCursorPos(ImVec2((screenW - statusSize.x) * 0.5f, statusY));
+        ImGui::TextColored(ImVec4(0.0f, 0.0f, 0.0f, 1.0f), "%s", statusText.c_str());
+    }
+
+    ImGui::End();
+}
+
void LoadingScreen::render() { // If a frame is already in progress (e.g. called from a UI callback), // end it before starting our own diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp index d455e494..eed9a025 100644 --- a/src/rendering/m2_renderer.cpp +++ b/src/rendering/m2_renderer.cpp @@ -282,6 +282,14 @@ glm::vec3 closestPointOnTriangle(const glm::vec3& p, } // namespace +// Thread-local scratch buffers for collision queries (allows concurrent getFloorHeight calls) +static thread_local std::vector tl_m2_candidateScratch; +static thread_local std::unordered_set tl_m2_candidateIdScratch; +static thread_local std::vector tl_m2_collisionTriScratch; + +// Forward declaration (defined after animation helpers) +static void computeBoneMatrices(const M2ModelGPU& model, M2Instance& instance); + void M2Instance::updateModelMatrix() { modelMatrix = glm::mat4(1.0f); modelMatrix = glm::translate(modelMatrix, position); @@ -1028,10 +1036,9 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) { (lowerName.find("trunk") != std::string::npos) || (lowerName.find("stump") != std::string::npos) || (lowerName.find("log") != std::string::npos); - // Only large trees (canopy > 20 model units wide) get trunk collision. - // Small/mid trees are walkthrough to avoid getting stuck between them. - // Only large trees get trunk collision; all smaller trees are walkthrough. - bool treeWithTrunk = treeLike && !hardTreePart && !foliageName && horiz > 40.0f; + // Trees with visible trunks get collision. Threshold: canopy wider than 6 + // model units AND taller than 4 units (filters out small bushes/saplings). 
+ bool treeWithTrunk = treeLike && !hardTreePart && !foliageName && horiz > 6.0f && vert > 4.0f; bool softTree = treeLike && !hardTreePart && !treeWithTrunk; bool forceSolidCurb = gpuModel.collisionSteppedLowPlatform || knownStormwindPlanter || likelyCurbName || gpuModel.collisionPlanter; bool narrowVerticalName = @@ -1602,6 +1609,12 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) { } } + // Pre-compute available LOD levels to avoid per-instance batch iteration + gpuModel.availableLODs = 0; + for (const auto& b : gpuModel.batches) { + if (b.submeshLevel < 8) gpuModel.availableLODs |= (1u << b.submeshLevel); + } + models[modelId] = std::move(gpuModel); LOG_DEBUG("Loaded M2 model: ", model.name, " (", models[modelId].vertexCount, " vertices, ", @@ -1667,6 +1680,21 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position, instance.animDuration = static_cast(mdl.sequences[0].duration); instance.animTime = static_cast(rand() % std::max(1u, mdl.sequences[0].duration)); instance.variationTimer = 3000.0f + static_cast(rand() % 8000); + + // Seed bone matrices from an existing instance of the same model so the + // new instance renders immediately instead of being invisible until the + // next update() computes bones (prevents pop-in flash). 
+        for (const auto& existing : instances) {
+            if (existing.modelId == modelId && !existing.boneMatrices.empty()) {
+                instance.boneMatrices = existing.boneMatrices;
+                instance.bonesDirty[0] = instance.bonesDirty[1] = true;
+                break;
+            }
+        }
+        // If no sibling exists yet, compute bones immediately
+        if (instance.boneMatrices.empty()) {
+            computeBoneMatrices(mdlRef, instance);
+        }
     }
 
     // Register in dedup map before pushing (uses original position, not ground-adjusted)
@@ -1758,6 +1786,18 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
         instance.animDuration = static_cast(mdl2.sequences[0].duration);
         instance.animTime = static_cast(rand() % std::max(1u, mdl2.sequences[0].duration));
         instance.variationTimer = 3000.0f + static_cast(rand() % 8000);
+
+        // Seed bone matrices from an existing sibling so the instance renders immediately
+        for (const auto& existing : instances) {
+            if (existing.modelId == modelId && !existing.boneMatrices.empty()) {
+                instance.boneMatrices = existing.boneMatrices;
+                instance.bonesDirty[0] = instance.bonesDirty[1] = true;
+                break;
+            }
+        }
+        if (instance.boneMatrices.empty()) {
+            computeBoneMatrices(mdl2, instance);
+        }
     } else {
         instance.animTime = static_cast(rand()) / RAND_MAX * 10000.0f;
     }
@@ -1911,6 +1951,7 @@ static void computeBoneMatrices(const M2ModelGPU& model, M2Instance& instance) {
             instance.boneMatrices[i] = local;
         }
     }
+    instance.bonesDirty[0] = instance.bonesDirty[1] = true;
 }
 
 void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::mat4& viewProjection) {
@@ -2172,6 +2213,73 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
 }
 
 
+// Creates bone storage buffers and descriptor sets for animated instances for
+// frame slot `frameIndex`. `camera` is currently unused.
+void M2Renderer::prepareRender(uint32_t frameIndex, const Camera& camera) {
+    if (!initialized_ || instances.empty()) return;
+    (void)camera; // reserved for future frustum-based culling
+
+    // Pre-allocate bone SSBOs + descriptor sets on main thread (pool ops not thread-safe).
+    // Only iterate animated instances — static doodads don't need bone buffers.
+    constexpr VkDeviceSize kBoneBufferBytes = 128 * sizeof(glm::mat4);
+    for (size_t idx : animatedInstanceIndices_) {
+        if (idx >= instances.size()) continue;
+        auto& instance = instances[idx];
+
+        if (instance.boneMatrices.empty()) continue;
+
+        bool createdBuffer = false;
+        if (!instance.boneBuffer[frameIndex]) {
+            VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
+            bci.size = kBoneBufferBytes;
+            bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+            VmaAllocationCreateInfo aci{};
+            aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+            aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+            VmaAllocationInfo allocInfo{};
+            VkResult bufRes = vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci,
+                &instance.boneBuffer[frameIndex], &instance.boneAlloc[frameIndex], &allocInfo);
+            if (bufRes != VK_SUCCESS) {
+                // No buffer: leave the slot empty (retried on the next call) and do
+                // not allocate or write a descriptor set for it.
+                LOG_ERROR("M2Renderer::prepareRender: bone buffer alloc failed (frame=",
+                    frameIndex, ", vk=", static_cast<int>(bufRes), ")");
+                instance.boneBuffer[frameIndex] = VK_NULL_HANDLE;
+                instance.boneAlloc[frameIndex] = VK_NULL_HANDLE;
+                instance.boneMapped[frameIndex] = nullptr;
+                continue;
+            }
+            instance.boneMapped[frameIndex] = allocInfo.pMappedData;
+            createdBuffer = true;
+
+            // Force dirty so current boneMatrices get copied into this
+            // newly-allocated buffer during render (prevents garbage/zero
+            // bones when the other frame index already cleared bonesDirty).
+            instance.bonesDirty[frameIndex] = true;
+        }
+
+        // A set is allocated for every newly created buffer. If an earlier
+        // allocation failed (buffer present, set missing) it is retried here,
+        // because render() skips instances whose bone set is null.
+        if (createdBuffer || !instance.boneSet[frameIndex]) {
+            instance.boneSet[frameIndex] = allocateBoneSet();
+            if (instance.boneSet[frameIndex]) {
+                VkDescriptorBufferInfo bufInfo{};
+                bufInfo.buffer = instance.boneBuffer[frameIndex];
+                bufInfo.offset = 0;
+                bufInfo.range = kBoneBufferBytes;
+                VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
+                write.dstSet = instance.boneSet[frameIndex];
+                write.dstBinding = 0;
+                write.descriptorCount = 1;
+                write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+                write.pBufferInfo = &bufInfo;
+                vkUpdateDescriptorSets(vkCtx_->getDevice(), 1, &write, 0, nullptr);
+            }
+        }
+    }
+}
+
 void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera) {
     if (instances.empty() || !opaquePipeline_) {
         return;
@@ -2254,8 +2342,8 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
     }
 
     // Sort by modelId to minimize vertex/index buffer rebinds
-    std::stable_sort(sortedVisible_.begin(), sortedVisible_.end(),
-
[](const VisibleEntry& a, const VisibleEntry& b) { return a.modelId < b.modelId; }); + std::sort(sortedVisible_.begin(), sortedVisible_.end(), + [](const VisibleEntry& a, const VisibleEntry& b) { return a.modelId < b.modelId; }); uint32_t currentModelId = UINT32_MAX; const M2ModelGPU* currentModel = nullptr; @@ -2330,44 +2418,26 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const } } - // Upload bone matrices to SSBO if model has skeletal animation - bool useBones = model.hasAnimation && !model.disableAnimation && !instance.boneMatrices.empty(); + // Upload bone matrices to SSBO if model has skeletal animation. + // Skip animated instances entirely until bones are computed + buffers allocated + // to prevent bind-pose/T-pose flash on first appearance. + bool modelNeedsAnimation = model.hasAnimation && !model.disableAnimation; + if (modelNeedsAnimation && instance.boneMatrices.empty()) { + continue; // Bones not yet computed — skip to avoid bind-pose flash + } + bool needsBones = modelNeedsAnimation && !instance.boneMatrices.empty(); + if (needsBones && (!instance.boneBuffer[frameIndex] || !instance.boneSet[frameIndex])) { + continue; // Bone buffers not yet allocated — skip to avoid bind-pose flash + } + bool useBones = needsBones; if (useBones) { - // Lazy-allocate bone SSBO on first use - if (!instance.boneBuffer[frameIndex]) { - VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO}; - bci.size = 128 * sizeof(glm::mat4); // max 128 bones - bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - VmaAllocationCreateInfo aci{}; - aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; - aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; - VmaAllocationInfo allocInfo{}; - vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci, - &instance.boneBuffer[frameIndex], &instance.boneAlloc[frameIndex], &allocInfo); - instance.boneMapped[frameIndex] = allocInfo.pMappedData; - - // Allocate descriptor set for bone SSBO - instance.boneSet[frameIndex] = 
allocateBoneSet(); - if (instance.boneSet[frameIndex]) { - VkDescriptorBufferInfo bufInfo{}; - bufInfo.buffer = instance.boneBuffer[frameIndex]; - bufInfo.offset = 0; - bufInfo.range = bci.size; - VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET}; - write.dstSet = instance.boneSet[frameIndex]; - write.dstBinding = 0; - write.descriptorCount = 1; - write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - write.pBufferInfo = &bufInfo; - vkUpdateDescriptorSets(vkCtx_->getDevice(), 1, &write, 0, nullptr); - } - } - - // Upload bone matrices - if (instance.boneMapped[frameIndex]) { + // Upload bone matrices only when recomputed (per-frame-index tracking + // ensures both double-buffered SSBOs get the latest bone data) + if (instance.bonesDirty[frameIndex] && instance.boneMapped[frameIndex]) { int numBones = std::min(static_cast(instance.boneMatrices.size()), 128); memcpy(instance.boneMapped[frameIndex], instance.boneMatrices.data(), numBones * sizeof(glm::mat4)); + instance.bonesDirty[frameIndex] = false; } // Bind bone descriptor set (set 2) @@ -2384,12 +2454,8 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const else if (entry.distSq > 40.0f * 40.0f) desiredLOD = 1; uint16_t targetLOD = desiredLOD; - if (desiredLOD > 0) { - bool hasDesiredLOD = false; - for (const auto& b : model.batches) { - if (b.submeshLevel == desiredLOD) { hasDesiredLOD = true; break; } - } - if (!hasDesiredLOD) targetLOD = 0; + if (desiredLOD > 0 && !(model.availableLODs & (1u << desiredLOD))) { + targetLOD = 0; } const bool foliageLikeModel = model.isFoliageLike; @@ -3597,7 +3663,7 @@ void M2Renderer::rebuildSpatialIndex() { void M2Renderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& queryMax, std::vector& outIndices) const { outIndices.clear(); - candidateIdScratch.clear(); + tl_m2_candidateIdScratch.clear(); GridCell minCell = toCell(queryMin); GridCell maxCell = toCell(queryMax); @@ -3607,7 +3673,7 @@ void 
M2Renderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& qu auto it = spatialGrid.find(GridCell{x, y, z}); if (it == spatialGrid.end()) continue; for (uint32_t id : it->second) { - if (!candidateIdScratch.insert(id).second) continue; + if (!tl_m2_candidateIdScratch.insert(id).second) continue; auto idxIt = instanceIndexById.find(id); if (idxIt != instanceIndexById.end()) { outIndices.push_back(idxIt->second); @@ -3780,9 +3846,9 @@ std::optional M2Renderer::getFloorHeight(float glX, float glY, float glZ, glm::vec3 queryMin(glX - 2.0f, glY - 2.0f, glZ - 6.0f); glm::vec3 queryMax(glX + 2.0f, glY + 2.0f, glZ + 8.0f); - gatherCandidates(queryMin, queryMax, candidateScratch); + gatherCandidates(queryMin, queryMax, tl_m2_candidateScratch); - for (size_t idx : candidateScratch) { + for (size_t idx : tl_m2_candidateScratch) { const auto& instance = instances[idx]; if (collisionFocusEnabled && pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) { @@ -3804,14 +3870,14 @@ std::optional M2Renderer::getFloorHeight(float glX, float glY, float glZ, model.collision.getFloorTrisInRange( localPos.x - 1.0f, localPos.y - 1.0f, localPos.x + 1.0f, localPos.y + 1.0f, - collisionTriScratch_); + tl_m2_collisionTriScratch); glm::vec3 rayOrigin(localPos.x, localPos.y, localPos.z + 5.0f); glm::vec3 rayDir(0.0f, 0.0f, -1.0f); float bestHitZ = -std::numeric_limits::max(); bool hitAny = false; - for (uint32_t ti : collisionTriScratch_) { + for (uint32_t ti : tl_m2_collisionTriScratch) { if (ti >= model.collision.triCount) continue; if (model.collision.triBounds[ti].maxZ < localPos.z - 10.0f || model.collision.triBounds[ti].minZ > localPos.z + 5.0f) continue; @@ -3926,10 +3992,10 @@ bool M2Renderer::checkCollision(const glm::vec3& from, const glm::vec3& to, glm::vec3 queryMin = glm::min(from, to) - glm::vec3(7.0f, 7.0f, 5.0f); glm::vec3 queryMax = glm::max(from, to) + glm::vec3(7.0f, 7.0f, 5.0f); - 
gatherCandidates(queryMin, queryMax, candidateScratch); + gatherCandidates(queryMin, queryMax, tl_m2_candidateScratch); // Check against all M2 instances in local space (rotation-aware). - for (size_t idx : candidateScratch) { + for (size_t idx : tl_m2_candidateScratch) { const auto& instance = instances[idx]; if (collisionFocusEnabled && pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) { @@ -3962,14 +4028,14 @@ bool M2Renderer::checkCollision(const glm::vec3& from, const glm::vec3& to, std::min(localFrom.y, localPos.y) - localRadius - 1.0f, std::max(localFrom.x, localPos.x) + localRadius + 1.0f, std::max(localFrom.y, localPos.y) + localRadius + 1.0f, - collisionTriScratch_); + tl_m2_collisionTriScratch); constexpr float PLAYER_HEIGHT = 2.0f; constexpr float MAX_TOTAL_PUSH = 0.02f; // Cap total push per instance bool pushed = false; float totalPushX = 0.0f, totalPushY = 0.0f; - for (uint32_t ti : collisionTriScratch_) { + for (uint32_t ti : tl_m2_collisionTriScratch) { if (ti >= model.collision.triCount) continue; if (localPos.z + PLAYER_HEIGHT < model.collision.triBounds[ti].minZ || localPos.z > model.collision.triBounds[ti].maxZ) continue; @@ -4167,9 +4233,9 @@ float M2Renderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3& glm::vec3 rayEnd = origin + direction * maxDistance; glm::vec3 queryMin = glm::min(origin, rayEnd) - glm::vec3(1.0f); glm::vec3 queryMax = glm::max(origin, rayEnd) + glm::vec3(1.0f); - gatherCandidates(queryMin, queryMax, candidateScratch); + gatherCandidates(queryMin, queryMax, tl_m2_candidateScratch); - for (size_t idx : candidateScratch) { + for (size_t idx : tl_m2_candidateScratch) { const auto& instance = instances[idx]; if (collisionFocusEnabled && pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) { diff --git a/src/rendering/performance_hud.cpp b/src/rendering/performance_hud.cpp index 
d939f4f9..86dc2f21 100644 --- a/src/rendering/performance_hud.cpp +++ b/src/rendering/performance_hud.cpp @@ -1,5 +1,6 @@ #include "rendering/performance_hud.hpp" #include "rendering/renderer.hpp" +#include "rendering/vk_context.hpp" #include "rendering/terrain_renderer.hpp" #include "rendering/terrain_manager.hpp" #include "rendering/water_renderer.hpp" @@ -187,6 +188,19 @@ void PerformanceHUD::render(const Renderer* renderer, const Camera* camera) { 0, nullptr, 0.0f, 33.33f, ImVec2(200, 40)); } + // FSR info + if (renderer->isFSREnabled()) { + ImGui::TextColored(ImVec4(0.4f, 1.0f, 0.4f, 1.0f), "FSR 1.0: ON"); + auto* ctx = renderer->getVkContext(); + if (ctx) { + auto ext = ctx->getSwapchainExtent(); + float sf = renderer->getFSRScaleFactor(); + uint32_t iw = static_cast(ext.width * sf) & ~1u; + uint32_t ih = static_cast(ext.height * sf) & ~1u; + ImGui::Text(" %ux%u -> %ux%u (%.0f%%)", iw, ih, ext.width, ext.height, sf * 100.0f); + } + } + ImGui::Spacing(); } diff --git a/src/rendering/renderer.cpp b/src/rendering/renderer.cpp index 55ba1370..063bae9a 100644 --- a/src/rendering/renderer.cpp +++ b/src/rendering/renderer.cpp @@ -70,6 +70,7 @@ #include #include #include +#include namespace wowee { namespace rendering { @@ -721,11 +722,18 @@ bool Renderer::initialize(core::Window* win) { // TODO Phase 6: Vulkan underwater overlay, post-process, and shadow map // GL versions stubbed during migration + // Create secondary command buffer resources for multithreaded rendering + if (!createSecondaryCommandResources()) { + LOG_WARNING("Failed to create secondary command buffers — falling back to single-threaded rendering"); + } + LOG_INFO("Renderer initialized"); return true; } void Renderer::shutdown() { + destroySecondaryCommandResources(); + LOG_WARNING("Renderer::shutdown - terrainManager stopWorkers..."); if (terrainManager) { terrainManager->stopWorkers(); @@ -828,6 +836,8 @@ void Renderer::shutdown() { if (overlayPipelineLayout) { vkDestroyPipelineLayout(device, 
overlayPipelineLayout, nullptr); overlayPipelineLayout = VK_NULL_HANDLE; } } + destroyFSRResources(); + destroyFSR2Resources(); destroyPerFrameResources(); zoneManager.reset(); @@ -866,6 +876,9 @@ bool Renderer::isWaterRefractionEnabled() const { void Renderer::setMsaaSamples(VkSampleCountFlagBits samples) { if (!vkCtx) return; + // FSR2 requires non-MSAA render pass — block MSAA changes while FSR2 is active + if (fsr2_.enabled && samples > VK_SAMPLE_COUNT_1_BIT) return; + // Clamp to device maximum VkSampleCountFlagBits maxSamples = vkCtx->getMaxUsableSampleCount(); if (samples > maxSamples) samples = maxSamples; @@ -901,12 +914,7 @@ void Renderer::applyMsaaChange() { if (terrainRenderer) terrainRenderer->recreatePipelines(); if (waterRenderer) { waterRenderer->recreatePipelines(); - if (vkCtx->getMsaaSamples() != VK_SAMPLE_COUNT_1_BIT) { - waterRenderer->destroyWater1xResources(); - setupWater1xPass(); - } else { - waterRenderer->destroyWater1xResources(); - } + waterRenderer->destroyWater1xResources(); // no longer used } if (wmoRenderer) wmoRenderer->recreatePipelines(); if (m2Renderer) m2Renderer->recreatePipelines(); @@ -928,10 +936,12 @@ void Renderer::applyMsaaChange() { if (minimap) minimap->recreatePipelines(); - // Selection circle + overlay use lazy init, just destroy them + // Selection circle + overlay + FSR use lazy init, just destroy them VkDevice device = vkCtx->getDevice(); if (selCirclePipeline) { vkDestroyPipeline(device, selCirclePipeline, nullptr); selCirclePipeline = VK_NULL_HANDLE; } if (overlayPipeline) { vkDestroyPipeline(device, overlayPipeline, nullptr); overlayPipeline = VK_NULL_HANDLE; } + if (fsr_.sceneFramebuffer) destroyFSRResources(); // Will be lazily recreated in beginFrame() + if (fsr2_.sceneFramebuffer) destroyFSR2Resources(); // Reinitialize ImGui Vulkan backend with new MSAA sample count ImGui_ImplVulkan_Shutdown(); @@ -961,17 +971,47 @@ void Renderer::beginFrame() { applyMsaaChange(); } + // FSR resource management (safe: 
between frames, no command buffer in flight) + if (fsr_.needsRecreate && fsr_.sceneFramebuffer) { + destroyFSRResources(); + fsr_.needsRecreate = false; + if (!fsr_.enabled) LOG_INFO("FSR: disabled"); + } + if (fsr_.enabled && !fsr2_.enabled && !fsr_.sceneFramebuffer) { + if (!initFSRResources()) { + LOG_ERROR("FSR: initialization failed, disabling"); + fsr_.enabled = false; + } + } + + // FSR 2.2 resource management + if (fsr2_.needsRecreate && fsr2_.sceneFramebuffer) { + destroyFSR2Resources(); + fsr2_.needsRecreate = false; + if (!fsr2_.enabled) LOG_INFO("FSR2: disabled"); + } + if (fsr2_.enabled && !fsr2_.sceneFramebuffer) { + if (!initFSR2Resources()) { + LOG_ERROR("FSR2: initialization failed, disabling"); + fsr2_.enabled = false; + } + } + // Handle swapchain recreation if needed if (vkCtx->isSwapchainDirty()) { vkCtx->recreateSwapchain(window->getWidth(), window->getHeight()); // Rebuild water resources that reference swapchain extent/views if (waterRenderer) { waterRenderer->recreatePipelines(); - if (waterRenderer->hasWater1xPass() - && vkCtx->getMsaaSamples() != VK_SAMPLE_COUNT_1_BIT) { - waterRenderer->destroyWater1xResources(); - setupWater1xPass(); - } + } + // Recreate FSR resources for new swapchain dimensions + if (fsr_.enabled && !fsr2_.enabled) { + destroyFSRResources(); + initFSRResources(); + } + if (fsr2_.enabled) { + destroyFSR2Resources(); + initFSR2Resources(); } } @@ -982,6 +1022,14 @@ void Renderer::beginFrame() { return; } + // Apply FSR2 jitter to camera projection before UBO upload + if (fsr2_.enabled && fsr2_.sceneFramebuffer && camera) { + // Halton(2,3) sequence for sub-pixel jitter, scaled to internal resolution + float jx = (halton(fsr2_.frameIndex + 1, 2) - 0.5f) * 2.0f / static_cast(fsr2_.internalWidth); + float jy = (halton(fsr2_.frameIndex + 1, 3) - 0.5f) * 2.0f / static_cast(fsr2_.internalHeight); + camera->setJitter(jx, jy); + } + // Update per-frame UBO with current camera/lighting state updatePerFrameUBO(); @@ -1018,47 
+1066,187 @@ void Renderer::beginFrame() { renderReflectionPass(); } // !skipPrePasses - // --- Begin main render pass (clear color + depth) --- + // --- Begin render pass --- + // If FSR is enabled, render scene to off-screen target at reduced resolution. + // Otherwise, render directly to swapchain. VkRenderPassBeginInfo rpInfo{}; rpInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; rpInfo.renderPass = vkCtx->getImGuiRenderPass(); - rpInfo.framebuffer = vkCtx->getSwapchainFramebuffers()[currentImageIndex]; - rpInfo.renderArea.offset = {0, 0}; - rpInfo.renderArea.extent = vkCtx->getSwapchainExtent(); - // MSAA render pass has 3 attachments (color, depth, resolve), non-MSAA has 2 - VkClearValue clearValues[3]{}; + VkExtent2D renderExtent; + if (fsr2_.enabled && fsr2_.sceneFramebuffer) { + rpInfo.framebuffer = fsr2_.sceneFramebuffer; + renderExtent = { fsr2_.internalWidth, fsr2_.internalHeight }; + } else if (fsr_.enabled && fsr_.sceneFramebuffer) { + rpInfo.framebuffer = fsr_.sceneFramebuffer; + renderExtent = { fsr_.internalWidth, fsr_.internalHeight }; + } else { + rpInfo.framebuffer = vkCtx->getSwapchainFramebuffers()[currentImageIndex]; + renderExtent = vkCtx->getSwapchainExtent(); + } + + rpInfo.renderArea.offset = {0, 0}; + rpInfo.renderArea.extent = renderExtent; + + // Clear values must match attachment count: 2 (no MSAA), 3 (MSAA), or 4 (MSAA+depth resolve) + VkClearValue clearValues[4]{}; clearValues[0].color = {{0.0f, 0.0f, 0.0f, 1.0f}}; clearValues[1].depthStencil = {1.0f, 0}; - clearValues[2].color = {{0.0f, 0.0f, 0.0f, 1.0f}}; // resolve (DONT_CARE, but count must match) + clearValues[2].color = {{0.0f, 0.0f, 0.0f, 1.0f}}; + clearValues[3].depthStencil = {1.0f, 0}; bool msaaOn = (vkCtx->getMsaaSamples() > VK_SAMPLE_COUNT_1_BIT); - rpInfo.clearValueCount = msaaOn ? 3 : 2; + if (msaaOn) { + bool depthRes = (vkCtx->getDepthResolveImageView() != VK_NULL_HANDLE); + rpInfo.clearValueCount = depthRes ? 
4 : 3; + } else { + rpInfo.clearValueCount = 2; + } rpInfo.pClearValues = clearValues; - vkCmdBeginRenderPass(currentCmd, &rpInfo, VK_SUBPASS_CONTENTS_INLINE); + // Cache render pass state for secondary command buffer inheritance + activeRenderPass_ = rpInfo.renderPass; + activeFramebuffer_ = rpInfo.framebuffer; + activeRenderExtent_ = renderExtent; - // Set dynamic viewport and scissor - VkExtent2D extent = vkCtx->getSwapchainExtent(); - VkViewport viewport{}; - viewport.x = 0.0f; - viewport.y = 0.0f; - viewport.width = static_cast(extent.width); - viewport.height = static_cast(extent.height); - viewport.minDepth = 0.0f; - viewport.maxDepth = 1.0f; - vkCmdSetViewport(currentCmd, 0, 1, &viewport); + VkSubpassContents subpassMode = parallelRecordingEnabled_ + ? VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS + : VK_SUBPASS_CONTENTS_INLINE; + vkCmdBeginRenderPass(currentCmd, &rpInfo, subpassMode); - VkRect2D scissor{}; - scissor.offset = {0, 0}; - scissor.extent = extent; - vkCmdSetScissor(currentCmd, 0, 1, &scissor); + if (!parallelRecordingEnabled_) { + // Fallback: set dynamic viewport and scissor on primary (inline mode) + VkViewport viewport{}; + viewport.width = static_cast(renderExtent.width); + viewport.height = static_cast(renderExtent.height); + viewport.maxDepth = 1.0f; + vkCmdSetViewport(currentCmd, 0, 1, &viewport); + + VkRect2D scissor{}; + scissor.extent = renderExtent; + vkCmdSetScissor(currentCmd, 0, 1, &scissor); + } } void Renderer::endFrame() { if (!vkCtx || currentCmd == VK_NULL_HANDLE) return; - // ImGui always renders in the main pass (its pipeline matches the main render pass) - ImGui_ImplVulkan_RenderDrawData(ImGui::GetDrawData(), currentCmd); + if (fsr2_.enabled && fsr2_.sceneFramebuffer) { + // End the off-screen scene render pass + vkCmdEndRenderPass(currentCmd); + + // Compute passes: motion vectors → temporal accumulation + dispatchMotionVectors(); + dispatchTemporalAccumulate(); + + // Transition history output: GENERAL → SHADER_READ_ONLY 
for sharpen pass + transitionImageLayout(currentCmd, fsr2_.history[fsr2_.currentHistory].image, + VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT); + + // Begin swapchain render pass at full resolution for sharpening + ImGui + VkRenderPassBeginInfo rpInfo{}; + rpInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; + rpInfo.renderPass = vkCtx->getImGuiRenderPass(); + rpInfo.framebuffer = vkCtx->getSwapchainFramebuffers()[currentImageIndex]; + rpInfo.renderArea.offset = {0, 0}; + rpInfo.renderArea.extent = vkCtx->getSwapchainExtent(); + + bool msaaOn = (vkCtx->getMsaaSamples() > VK_SAMPLE_COUNT_1_BIT); + VkClearValue clearValues[4]{}; + clearValues[0].color = {{0.0f, 0.0f, 0.0f, 1.0f}}; + clearValues[1].depthStencil = {1.0f, 0}; + clearValues[2].color = {{0.0f, 0.0f, 0.0f, 1.0f}}; + clearValues[3].depthStencil = {1.0f, 0}; + rpInfo.clearValueCount = msaaOn ? (vkCtx->getDepthResolveImageView() ? 
4u : 3u) : 2u; + rpInfo.pClearValues = clearValues; + + vkCmdBeginRenderPass(currentCmd, &rpInfo, VK_SUBPASS_CONTENTS_INLINE); + + VkExtent2D ext = vkCtx->getSwapchainExtent(); + VkViewport vp{}; + vp.width = static_cast(ext.width); + vp.height = static_cast(ext.height); + vp.maxDepth = 1.0f; + vkCmdSetViewport(currentCmd, 0, 1, &vp); + VkRect2D sc{}; + sc.extent = ext; + vkCmdSetScissor(currentCmd, 0, 1, &sc); + + // Draw RCAS sharpening from accumulated history buffer + renderFSR2Sharpen(); + + // Store current VP for next frame's motion vectors, advance frame + fsr2_.prevViewProjection = camera->getUnjitteredViewProjectionMatrix(); + fsr2_.prevJitter = camera->getJitter(); + camera->clearJitter(); + fsr2_.currentHistory = 1 - fsr2_.currentHistory; + fsr2_.frameIndex = (fsr2_.frameIndex + 1) % 256; // Wrap to keep Halton values well-distributed + + } else if (fsr_.enabled && fsr_.sceneFramebuffer) { + // End the off-screen scene render pass + vkCmdEndRenderPass(currentCmd); + + // Transition scene color (1x resolve/color target): PRESENT_SRC_KHR → SHADER_READ_ONLY + // The render pass finalLayout puts the resolve/color attachment in PRESENT_SRC_KHR + transitionImageLayout(currentCmd, fsr_.sceneColor.image, + VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT); + + // Begin swapchain render pass at full resolution + VkRenderPassBeginInfo rpInfo{}; + rpInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; + rpInfo.renderPass = vkCtx->getImGuiRenderPass(); + rpInfo.framebuffer = vkCtx->getSwapchainFramebuffers()[currentImageIndex]; + rpInfo.renderArea.offset = {0, 0}; + rpInfo.renderArea.extent = vkCtx->getSwapchainExtent(); + + // Clear values must match the render pass attachment count + bool msaaOn = (vkCtx->getMsaaSamples() > VK_SAMPLE_COUNT_1_BIT); + VkClearValue clearValues[4]{}; + clearValues[0].color = {{0.0f, 0.0f, 0.0f, 1.0f}}; + 
clearValues[1].depthStencil = {1.0f, 0}; + clearValues[2].color = {{0.0f, 0.0f, 0.0f, 1.0f}}; + clearValues[3].depthStencil = {1.0f, 0}; + if (msaaOn) { + bool depthRes = (vkCtx->getDepthResolveImageView() != VK_NULL_HANDLE); + rpInfo.clearValueCount = depthRes ? 4 : 3; + } else { + rpInfo.clearValueCount = 2; + } + rpInfo.pClearValues = clearValues; + + vkCmdBeginRenderPass(currentCmd, &rpInfo, VK_SUBPASS_CONTENTS_INLINE); + + // Set full-resolution viewport and scissor + VkExtent2D ext = vkCtx->getSwapchainExtent(); + VkViewport vp{}; + vp.width = static_cast(ext.width); + vp.height = static_cast(ext.height); + vp.maxDepth = 1.0f; + vkCmdSetViewport(currentCmd, 0, 1, &vp); + VkRect2D sc{}; + sc.extent = ext; + vkCmdSetScissor(currentCmd, 0, 1, &sc); + + // Draw FSR upscale fullscreen quad + renderFSRUpscale(); + } + + // ImGui rendering — must respect subpass contents mode + if (!fsr_.enabled && !fsr2_.enabled && parallelRecordingEnabled_) { + // Scene pass was begun with VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS, + // so ImGui must be recorded into a secondary command buffer. + VkCommandBuffer imguiCmd = beginSecondary(SEC_IMGUI); + setSecondaryViewportScissor(imguiCmd); + ImGui_ImplVulkan_RenderDrawData(ImGui::GetDrawData(), imguiCmd); + vkEndCommandBuffer(imguiCmd); + vkCmdExecuteCommands(currentCmd, 1, &imguiCmd); + } else { + // FSR swapchain pass uses INLINE mode; non-parallel also uses INLINE. 
+ ImGui_ImplVulkan_RenderDrawData(ImGui::GetDrawData(), currentCmd); + } vkCmdEndRenderPass(currentCmd); @@ -1076,16 +1264,7 @@ void Renderer::endFrame() { frame); } - // Render water in separate 1x pass after MSAA resolve + scene capture - bool waterDeferred = waterRenderer && waterRenderer->hasSurfaces() && waterRenderer->hasWater1xPass() - && vkCtx->getMsaaSamples() != VK_SAMPLE_COUNT_1_BIT; - if (waterDeferred && camera) { - VkExtent2D ext = vkCtx->getSwapchainExtent(); - if (waterRenderer->beginWater1xPass(currentCmd, currentImageIndex, ext)) { - waterRenderer->render(currentCmd, perFrameDescSets[frame], *camera, globalTime, true, frame); - waterRenderer->endWater1xPass(currentCmd); - } - } + // Water now renders in the main pass (renderWorld), no separate 1x pass needed. // Submit and present vkCtx->endFrame(currentCmd, currentImageIndex); @@ -2434,7 +2613,7 @@ void Renderer::update(float deltaTime) { cameraController->update(deltaTime); auto cameraEnd = std::chrono::steady_clock::now(); lastCameraUpdateMs = std::chrono::duration(cameraEnd - cameraStart).count(); - if (lastCameraUpdateMs > 3.0) { + if (lastCameraUpdateMs > 50.0) { LOG_WARNING("SLOW cameraController->update: ", lastCameraUpdateMs, "ms"); } @@ -2534,7 +2713,7 @@ void Renderer::update(float deltaTime) { terrainManager->update(*camera, deltaTime); float terrMs = std::chrono::duration( std::chrono::steady_clock::now() - terrStart).count(); - if (terrMs > 5.0f) { + if (terrMs > 50.0f) { LOG_WARNING("SLOW terrainManager->update: ", terrMs, "ms"); } } @@ -2586,16 +2765,23 @@ void Renderer::update(float deltaTime) { } - // Update character animations + // Launch M2 doodad animation on background thread (overlaps with character animation + audio) + std::future m2AnimFuture; + bool m2AnimLaunched = false; + if (m2Renderer && camera) { + float m2DeltaTime = deltaTime; + glm::vec3 m2CamPos = camera->getPosition(); + glm::mat4 m2ViewProj = camera->getProjectionMatrix() * camera->getViewMatrix(); + 
m2AnimFuture = std::async(std::launch::async, + [this, m2DeltaTime, m2CamPos, m2ViewProj]() { + m2Renderer->update(m2DeltaTime, m2CamPos, m2ViewProj); + }); + m2AnimLaunched = true; + } + + // Update character animations (runs in parallel with M2 animation above) if (characterRenderer && camera) { - auto charAnimStart = std::chrono::steady_clock::now(); characterRenderer->update(deltaTime, camera->getPosition()); - float charAnimMs = std::chrono::duration( - std::chrono::steady_clock::now() - charAnimStart).count(); - if (charAnimMs > 5.0f) { - LOG_WARNING("SLOW characterRenderer->update: ", charAnimMs, "ms (", - characterRenderer->getInstanceCount(), " instances)"); - } } // Update AudioEngine (cleanup finished sounds, etc.) @@ -2780,17 +2966,9 @@ void Renderer::update(float deltaTime) { ambientSoundManager->update(deltaTime, camPos, isIndoor, isSwimming, isBlacksmith); } - // Update M2 doodad animations (pass camera for frustum-culling bone computation) - if (m2Renderer && camera) { - auto m2Start = std::chrono::steady_clock::now(); - m2Renderer->update(deltaTime, camera->getPosition(), - camera->getProjectionMatrix() * camera->getViewMatrix()); - float m2Ms = std::chrono::duration( - std::chrono::steady_clock::now() - m2Start).count(); - if (m2Ms > 3.0f) { - LOG_WARNING("SLOW m2Renderer->update: ", m2Ms, "ms (", - m2Renderer->getInstanceCount(), " instances)"); - } + // Wait for M2 doodad animation to finish (was launched earlier in parallel with character anim) + if (m2AnimLaunched) { + m2AnimFuture.get(); } // Helper: play zone music, dispatching local files (file: prefix) vs MPQ paths @@ -3097,10 +3275,11 @@ void Renderer::clearSelectionCircle() { selCircleVisible = false; } -void Renderer::renderSelectionCircle(const glm::mat4& view, const glm::mat4& projection) { +void Renderer::renderSelectionCircle(const glm::mat4& view, const glm::mat4& projection, VkCommandBuffer overrideCmd) { if (!selCircleVisible) return; initSelectionCircle(); - if 
(selCirclePipeline == VK_NULL_HANDLE || currentCmd == VK_NULL_HANDLE) return; + VkCommandBuffer cmd = (overrideCmd != VK_NULL_HANDLE) ? overrideCmd : currentCmd; + if (selCirclePipeline == VK_NULL_HANDLE || cmd == VK_NULL_HANDLE) return; // Keep circle anchored near target foot Z. Accept nearby floor probes only, // so distant upper/lower WMO planes don't yank the ring away from feet. @@ -3132,19 +3311,19 @@ void Renderer::renderSelectionCircle(const glm::mat4& view, const glm::mat4& pro glm::mat4 mvp = projection * view * model; glm::vec4 color4(selCircleColor, 1.0f); - vkCmdBindPipeline(currentCmd, VK_PIPELINE_BIND_POINT_GRAPHICS, selCirclePipeline); + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, selCirclePipeline); VkDeviceSize offset = 0; - vkCmdBindVertexBuffers(currentCmd, 0, 1, &selCircleVertBuf, &offset); - vkCmdBindIndexBuffer(currentCmd, selCircleIdxBuf, 0, VK_INDEX_TYPE_UINT16); + vkCmdBindVertexBuffers(cmd, 0, 1, &selCircleVertBuf, &offset); + vkCmdBindIndexBuffer(cmd, selCircleIdxBuf, 0, VK_INDEX_TYPE_UINT16); // Push mvp (64 bytes) at offset 0 - vkCmdPushConstants(currentCmd, selCirclePipelineLayout, + vkCmdPushConstants(cmd, selCirclePipelineLayout, VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT, 0, 64, &mvp[0][0]); // Push color (16 bytes) at offset 64 - vkCmdPushConstants(currentCmd, selCirclePipelineLayout, + vkCmdPushConstants(cmd, selCirclePipelineLayout, VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT, 64, 16, &color4[0]); - vkCmdDrawIndexed(currentCmd, static_cast(selCircleVertCount), 1, 0, 0, 0); + vkCmdDrawIndexed(cmd, static_cast(selCircleVertCount), 1, 0, 0, 0); } // ────────────────────────────────────────────────────────────── @@ -3194,15 +3373,877 @@ void Renderer::initOverlayPipeline() { if (overlayPipeline) LOG_INFO("Renderer: overlay pipeline initialized"); } -void Renderer::renderOverlay(const glm::vec4& color) { +void Renderer::renderOverlay(const glm::vec4& color, VkCommandBuffer overrideCmd) { if 
(!overlayPipeline) initOverlayPipeline();
-    if (!overlayPipeline || currentCmd == VK_NULL_HANDLE) return;
-    vkCmdBindPipeline(currentCmd, VK_PIPELINE_BIND_POINT_GRAPHICS, overlayPipeline);
-    vkCmdPushConstants(currentCmd, overlayPipelineLayout,
+    VkCommandBuffer cmd = (overrideCmd != VK_NULL_HANDLE) ? overrideCmd : currentCmd;
+    if (!overlayPipeline || cmd == VK_NULL_HANDLE) return;
+    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, overlayPipeline);
+    vkCmdPushConstants(cmd, overlayPipelineLayout,
                        VK_SHADER_STAGE_FRAGMENT_BIT, 0, 16, &color[0]);
-    vkCmdDraw(currentCmd, 3, 1, 0, 0); // fullscreen triangle
+    vkCmdDraw(cmd, 3, 1, 0, 0); // fullscreen triangle
 }
+// ========================= FSR 1.0 Upscaling =========================
+
+// Builds the GPU objects for the FSR 1.0 path: the internal-resolution scene
+// targets and framebuffer, the sampler + descriptor set that expose sceneColor
+// to the upscale shader, the pipeline layout, and the upscale pipeline itself.
+// Returns true on success. On any failure after the first image has been
+// created, destroyFSRResources() is called before returning false.
+bool Renderer::initFSRResources() {
+    if (!vkCtx) return false;
+
+    VkDevice device = vkCtx->getDevice();
+    VmaAllocator alloc = vkCtx->getAllocator();
+    VkExtent2D swapExtent = vkCtx->getSwapchainExtent();
+    VkSampleCountFlagBits msaa = vkCtx->getMsaaSamples();
+    bool useMsaa = (msaa > VK_SAMPLE_COUNT_1_BIT);
+    bool useDepthResolve = (vkCtx->getDepthResolveImageView() != VK_NULL_HANDLE);
+
+    // Shared failure path for raw Vulkan calls: log which call failed, release
+    // whatever was built so far, and tell the caller to bail out.
+    auto vkFailed = [this](VkResult res, const char* what) {
+        if (res == VK_SUCCESS) return false;
+        LOG_ERROR("FSR: ", what, " failed");
+        destroyFSRResources();
+        return true;
+    };
+
+    // Internal render size = swapchain size * scaleFactor, rounded up to an even number.
+    fsr_.internalWidth = static_cast<uint32_t>(swapExtent.width * fsr_.scaleFactor);
+    fsr_.internalHeight = static_cast<uint32_t>(swapExtent.height * fsr_.scaleFactor);
+    fsr_.internalWidth = (fsr_.internalWidth + 1) & ~1u;
+    fsr_.internalHeight = (fsr_.internalHeight + 1) & ~1u;
+
+    LOG_INFO("FSR: initializing at ", fsr_.internalWidth, "x", fsr_.internalHeight,
+             " -> ", swapExtent.width, "x", swapExtent.height,
+             " (scale=", fsr_.scaleFactor, ", MSAA=", static_cast<int>(msaa), "x)");
+
+    VkFormat colorFmt = vkCtx->getSwapchainFormat();
+    VkFormat depthFmt = vkCtx->getDepthFormat();
+
+    // sceneColor: always 1x, always sampled — this is what FSR reads
+    // Non-MSAA: direct render target. MSAA: resolve target.
+    fsr_.sceneColor = createImage(device, alloc, fsr_.internalWidth, fsr_.internalHeight,
+        colorFmt, VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT);
+    if (!fsr_.sceneColor.image) {
+        LOG_ERROR("FSR: failed to create scene color image");
+        return false;
+    }
+
+    // sceneDepth: matches current MSAA sample count
+    fsr_.sceneDepth = createImage(device, alloc, fsr_.internalWidth, fsr_.internalHeight,
+        depthFmt, VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, msaa);
+    if (!fsr_.sceneDepth.image) {
+        LOG_ERROR("FSR: failed to create scene depth image");
+        destroyFSRResources();
+        return false;
+    }
+
+    if (useMsaa) {
+        // sceneMsaaColor: multisampled color target
+        fsr_.sceneMsaaColor = createImage(device, alloc, fsr_.internalWidth, fsr_.internalHeight,
+            colorFmt, VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, msaa);
+        if (!fsr_.sceneMsaaColor.image) {
+            LOG_ERROR("FSR: failed to create MSAA color image");
+            destroyFSRResources();
+            return false;
+        }
+
+        if (useDepthResolve) {
+            fsr_.sceneDepthResolve = createImage(device, alloc, fsr_.internalWidth, fsr_.internalHeight,
+                depthFmt, VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT);
+            if (!fsr_.sceneDepthResolve.image) {
+                LOG_ERROR("FSR: failed to create depth resolve image");
+                destroyFSRResources();
+                return false;
+            }
+        }
+    }
+
+    // Build framebuffer matching the main render pass attachment layout:
+    //   Non-MSAA:            [color, depth]
+    //   MSAA (no depth res): [msaaColor, depth, resolve]
+    //   MSAA (depth res):    [msaaColor, depth, resolve, depthResolve]
+    VkImageView fbAttachments[4]{};
+    uint32_t fbCount;
+    if (useMsaa) {
+        fbAttachments[0] = fsr_.sceneMsaaColor.imageView;
+        fbAttachments[1] = fsr_.sceneDepth.imageView;
+        fbAttachments[2] = fsr_.sceneColor.imageView; // resolve target
+        fbCount = 3;
+        if (useDepthResolve) {
+            fbAttachments[3] = fsr_.sceneDepthResolve.imageView;
+            fbCount = 4;
+        }
+    } else {
+        fbAttachments[0] = fsr_.sceneColor.imageView;
+        fbAttachments[1] = fsr_.sceneDepth.imageView;
+        fbCount = 2;
+    }
+
+    VkFramebufferCreateInfo fbInfo{};
+    fbInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO;
+    fbInfo.renderPass = vkCtx->getImGuiRenderPass();
+    fbInfo.attachmentCount = fbCount;
+    fbInfo.pAttachments = fbAttachments;
+    fbInfo.width = fsr_.internalWidth;
+    fbInfo.height = fsr_.internalHeight;
+    fbInfo.layers = 1;
+
+    if (vkCreateFramebuffer(device, &fbInfo, nullptr, &fsr_.sceneFramebuffer) != VK_SUCCESS) {
+        LOG_ERROR("FSR: failed to create scene framebuffer");
+        destroyFSRResources();
+        return false;
+    }
+
+    // Sampler for the resolved scene color
+    VkSamplerCreateInfo samplerInfo{};
+    samplerInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
+    samplerInfo.minFilter = VK_FILTER_LINEAR;
+    samplerInfo.magFilter = VK_FILTER_LINEAR;
+    samplerInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+    samplerInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+    samplerInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+    samplerInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR;
+    if (vkCreateSampler(device, &samplerInfo, nullptr, &fsr_.sceneSampler) != VK_SUCCESS) {
+        LOG_ERROR("FSR: failed to create sampler");
+        destroyFSRResources();
+        return false;
+    }
+
+    // Descriptor set layout: binding 0 = combined image sampler
+    VkDescriptorSetLayoutBinding binding{};
+    binding.binding = 0;
+    binding.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+    binding.descriptorCount = 1;
+    binding.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
+
+    VkDescriptorSetLayoutCreateInfo layoutInfo{};
+    layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
+    layoutInfo.bindingCount = 1;
+    layoutInfo.pBindings = &binding;
+    if (vkFailed(vkCreateDescriptorSetLayout(device, &layoutInfo, nullptr, &fsr_.descSetLayout),
+                 "vkCreateDescriptorSetLayout")) return false;
+
+    VkDescriptorPoolSize poolSize{};
+    poolSize.type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+    poolSize.descriptorCount = 1;
+    VkDescriptorPoolCreateInfo poolInfo{};
+    poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
+    poolInfo.maxSets = 1;
+    poolInfo.poolSizeCount = 1;
+    poolInfo.pPoolSizes = &poolSize;
+    if (vkFailed(vkCreateDescriptorPool(device, &poolInfo, nullptr, &fsr_.descPool),
+                 "vkCreateDescriptorPool")) return false;
+
+    VkDescriptorSetAllocateInfo dsAllocInfo{};
+    dsAllocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
+    dsAllocInfo.descriptorPool = fsr_.descPool;
+    dsAllocInfo.descriptorSetCount = 1;
+    dsAllocInfo.pSetLayouts = &fsr_.descSetLayout;
+    // Must succeed before the descriptor write below: dstSet may not be a null handle.
+    if (vkFailed(vkAllocateDescriptorSets(device, &dsAllocInfo, &fsr_.descSet),
+                 "vkAllocateDescriptorSets")) return false;
+
+    // Always bind the 1x sceneColor (FSR reads the resolved image)
+    VkDescriptorImageInfo imgInfo{};
+    imgInfo.sampler = fsr_.sceneSampler;
+    imgInfo.imageView = fsr_.sceneColor.imageView;
+    imgInfo.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+
+    VkWriteDescriptorSet write{};
+    write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    write.dstSet = fsr_.descSet;
+    write.dstBinding = 0;
+    write.descriptorCount = 1;
+    write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+    write.pImageInfo = &imgInfo;
+    vkUpdateDescriptorSets(device, 1, &write, 0, nullptr);
+
+    // Pipeline layout: one descriptor set + 64 bytes of fragment-stage push constants
+    VkPushConstantRange pc{};
+    pc.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
+    pc.offset = 0;
+    pc.size = 64;
+    VkPipelineLayoutCreateInfo plCI{};
+    plCI.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+    plCI.setLayoutCount = 1;
+    plCI.pSetLayouts = &fsr_.descSetLayout;
+    plCI.pushConstantRangeCount = 1;
+    plCI.pPushConstantRanges = &pc;
+    if (vkFailed(vkCreatePipelineLayout(device, &plCI, nullptr, &fsr_.pipelineLayout),
+                 "vkCreatePipelineLayout")) return false;
+
+    // Load shaders. They are loaded one at a time so that a fragment-shader
+    // failure can release the vertex module that was already loaded.
+    VkShaderModule vertMod, fragMod;
+    if (!vertMod.loadFromFile(device, "assets/shaders/postprocess.vert.spv")) {
+        LOG_ERROR("FSR: failed to load shaders");
+        destroyFSRResources();
+        return false;
+    }
+    if (!fragMod.loadFromFile(device, "assets/shaders/fsr_easu.frag.spv")) {
+        LOG_ERROR("FSR: failed to load shaders");
+        vertMod.destroy();
+        destroyFSRResources();
+        return false;
+    }
+
+    // FSR upscale pipeline renders into the swapchain pass at full resolution
+    // Must match swapchain pass MSAA setting
+    fsr_.pipeline = PipelineBuilder()
+        .setShaders(vertMod.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
+
fragMod.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
+        .setVertexInput({}, {})
+        .setTopology(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST)
+        .setRasterization(VK_POLYGON_MODE_FILL, VK_CULL_MODE_NONE)
+        .setNoDepthTest()
+        .setColorBlendAttachment(PipelineBuilder::blendDisabled())
+        .setMultisample(msaa)
+        .setLayout(fsr_.pipelineLayout)
+        .setRenderPass(vkCtx->getImGuiRenderPass())
+        .setDynamicStates({VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR})
+        .build(device);
+
+    vertMod.destroy();
+    fragMod.destroy();
+
+    if (!fsr_.pipeline) {
+        LOG_ERROR("FSR: failed to create upscale pipeline");
+        destroyFSRResources();
+        return false;
+    }
+
+    LOG_INFO("FSR: initialized successfully");
+    return true;
+}
+
+// Releases every FSR 1.0 object held in fsr_ and resets the handles and the
+// cached internal size. Waits for the device to go idle first. Handles that
+// are already null are skipped, so it can be called after a partial init.
+void Renderer::destroyFSRResources() {
+    if (!vkCtx) return;
+
+    VkDevice dev = vkCtx->getDevice();
+    VmaAllocator allocator = vkCtx->getAllocator();
+
+    vkDeviceWaitIdle(dev);
+
+    if (fsr_.pipeline) {
+        vkDestroyPipeline(dev, fsr_.pipeline, nullptr);
+        fsr_.pipeline = VK_NULL_HANDLE;
+    }
+    if (fsr_.pipelineLayout) {
+        vkDestroyPipelineLayout(dev, fsr_.pipelineLayout, nullptr);
+        fsr_.pipelineLayout = VK_NULL_HANDLE;
+    }
+    if (fsr_.descPool) {
+        // The descriptor set handle is cleared together with the pool it came from.
+        vkDestroyDescriptorPool(dev, fsr_.descPool, nullptr);
+        fsr_.descPool = VK_NULL_HANDLE;
+        fsr_.descSet = VK_NULL_HANDLE;
+    }
+    if (fsr_.descSetLayout) {
+        vkDestroyDescriptorSetLayout(dev, fsr_.descSetLayout, nullptr);
+        fsr_.descSetLayout = VK_NULL_HANDLE;
+    }
+    if (fsr_.sceneFramebuffer) {
+        vkDestroyFramebuffer(dev, fsr_.sceneFramebuffer, nullptr);
+        fsr_.sceneFramebuffer = VK_NULL_HANDLE;
+    }
+    if (fsr_.sceneSampler) {
+        vkDestroySampler(dev, fsr_.sceneSampler, nullptr);
+        fsr_.sceneSampler = VK_NULL_HANDLE;
+    }
+
+    destroyImage(dev, allocator, fsr_.sceneDepthResolve);
+    destroyImage(dev, allocator, fsr_.sceneMsaaColor);
+    destroyImage(dev, allocator, fsr_.sceneDepth);
+    destroyImage(dev, allocator, fsr_.sceneColor);
+
+    fsr_.internalWidth = 0;
+    fsr_.internalHeight = 0;
+}
+
+void Renderer::renderFSRUpscale() {
+    if (!fsr_.pipeline || currentCmd ==
VK_NULL_HANDLE) return;
+
+    VkExtent2D outExtent = vkCtx->getSwapchainExtent();
+    float inW = static_cast(fsr_.internalWidth);
+    float inH = static_cast(fsr_.internalHeight);
+    float outW = static_cast(outExtent.width);
+    float outH = static_cast(outExtent.height);
+
+    // FSR push constants
+    struct {
+        glm::vec4 con0; // inputSize.xy, 1/inputSize.xy
+        glm::vec4 con1; // inputSize.xy / outputSize.xy, 0.5 * inputSize.xy / outputSize.xy
+        glm::vec4 con2; // outputSize.xy, 1/outputSize.xy
+        glm::vec4 con3; // sharpness, 0, 0, 0
+    } fsrConst;
+
+    fsrConst.con0 = glm::vec4(inW, inH, 1.0f / inW, 1.0f / inH);
+    fsrConst.con1 = glm::vec4(inW / outW, inH / outH, 0.5f * inW / outW, 0.5f * inH / outH);
+    fsrConst.con2 = glm::vec4(outW, outH, 1.0f / outW, 1.0f / outH);
+    fsrConst.con3 = glm::vec4(fsr_.sharpness, 0.0f, 0.0f, 0.0f);
+
+    vkCmdBindPipeline(currentCmd, VK_PIPELINE_BIND_POINT_GRAPHICS, fsr_.pipeline);
+    vkCmdBindDescriptorSets(currentCmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
+        fsr_.pipelineLayout, 0, 1, &fsr_.descSet, 0, nullptr);
+    vkCmdPushConstants(currentCmd, fsr_.pipelineLayout,
+        VK_SHADER_STAGE_FRAGMENT_BIT, 0, 64, &fsrConst);
+    vkCmdDraw(currentCmd, 3, 1, 0, 0);
+}
+
+// Turns FSR 1.0 on or off. A call that does not change the state is a no-op.
+// Nothing is created or destroyed here; switching off only raises
+// needsRecreate so the teardown happens in a later beginFrame() rather than
+// in the middle of a frame.
+void Renderer::setFSREnabled(bool enabled) {
+    const bool unchanged = (fsr_.enabled == enabled);
+    if (unchanged) return;
+
+    fsr_.enabled = enabled;
+    if (!fsr_.enabled) {
+        fsr_.needsRecreate = true;
+    }
+}
+
+// Stores the render-scale factor (clamped to [0.5, 1.0]) for both the FSR 1.0
+// and FSR2 paths. A path that is enabled and already owns a scene framebuffer
+// is flagged for recreation; FSR2 additionally gets its history reset.
+// Recreation is deferred through the flags instead of being done here.
+void Renderer::setFSRQuality(float scaleFactor) {
+    const float clampedScale = glm::clamp(scaleFactor, 0.5f, 1.0f);
+    fsr_.scaleFactor = clampedScale;
+    fsr2_.scaleFactor = clampedScale;
+
+    const bool fsrLive = fsr_.enabled && fsr_.sceneFramebuffer;
+    if (fsrLive) {
+        fsr_.needsRecreate = true;
+    }
+
+    const bool fsr2Live = fsr2_.enabled && fsr2_.sceneFramebuffer;
+    if (fsr2Live) {
+        fsr2_.needsRecreate = true;
+        fsr2_.needsHistoryReset = true;
+    }
+}
+
+void Renderer::setFSRSharpness(float sharpness) {
+
fsr_.sharpness = glm::clamp(sharpness, 0.0f, 2.0f);
+    fsr2_.sharpness = glm::clamp(sharpness, 0.0f, 2.0f);
+}
+
+// ========================= End FSR 1.0 =========================
+
+// ========================= FSR 2.2 Temporal Upscaling =========================
+
+// Radical-inverse (Halton) value of `index` in the given `base`: the base-`base`
+// digits of `index` are mirrored around the radix point, giving a value in [0, 1).
+// index == 0 yields 0. A base below 2 has no valid digit expansion (base 0 would
+// divide by zero, base 1 would never terminate), so 0 is returned for it.
+float Renderer::halton(uint32_t index, uint32_t base) {
+    if (base < 2) return 0.0f;
+
+    float f = 1.0f;   // weight of the current digit: 1/base, 1/base^2, ...
+    float r = 0.0f;   // accumulated result
+    uint32_t current = index;
+    while (current > 0) {
+        f /= static_cast<float>(base);
+        r += f * static_cast<float>(current % base);
+        current /= base;
+    }
+    return r;
+}
+
+bool Renderer::initFSR2Resources() {
+    if (!vkCtx) return false;
+
+    VkDevice device = vkCtx->getDevice();
+    VmaAllocator alloc = vkCtx->getAllocator();
+    VkExtent2D swapExtent = vkCtx->getSwapchainExtent();
+
+    fsr2_.internalWidth = static_cast(swapExtent.width * fsr2_.scaleFactor);
+    fsr2_.internalHeight = static_cast(swapExtent.height * fsr2_.scaleFactor);
+    fsr2_.internalWidth = (fsr2_.internalWidth + 1) & ~1u;
+    fsr2_.internalHeight = (fsr2_.internalHeight + 1) & ~1u;
+
+    LOG_INFO("FSR2: initializing at ", fsr2_.internalWidth, "x", fsr2_.internalHeight,
+             " -> ", swapExtent.width, "x", swapExtent.height,
+             " (scale=", fsr2_.scaleFactor, ")");
+
+    VkFormat colorFmt = vkCtx->getSwapchainFormat();
+    VkFormat depthFmt = vkCtx->getDepthFormat();
+
+    // Scene color (internal resolution, 1x — FSR2 replaces MSAA)
+    fsr2_.sceneColor = createImage(device, alloc, fsr2_.internalWidth, fsr2_.internalHeight,
+        colorFmt, VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT);
+    if (!fsr2_.sceneColor.image) { LOG_ERROR("FSR2: failed to create scene color"); return false; }
+
+    // Scene depth (internal resolution, 1x, sampled for motion vectors)
+    fsr2_.sceneDepth = createImage(device, alloc, fsr2_.internalWidth, fsr2_.internalHeight,
+        depthFmt, VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT);
+    if (!fsr2_.sceneDepth.image) { LOG_ERROR("FSR2: failed to create scene depth"); destroyFSR2Resources(); return false; }
+
+    // Motion
vector buffer (internal resolution)
+    fsr2_.motionVectors = createImage(device, alloc, fsr2_.internalWidth, fsr2_.internalHeight,
+        VK_FORMAT_R16G16_SFLOAT, VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT);
+    if (!fsr2_.motionVectors.image) { LOG_ERROR("FSR2: failed to create motion vectors"); destroyFSR2Resources(); return false; }
+
+    // History buffers (display resolution, ping-pong)
+    for (int i = 0; i < 2; i++) {
+        fsr2_.history[i] = createImage(device, alloc, swapExtent.width, swapExtent.height,
+            VK_FORMAT_R16G16B16A16_SFLOAT,
+            VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT);
+        if (!fsr2_.history[i].image) { LOG_ERROR("FSR2: failed to create history buffer ", i); destroyFSR2Resources(); return false; }
+    }
+
+    // Scene framebuffer (non-MSAA: [color, depth])
+    // Must use the same render pass as the swapchain — which must be non-MSAA when FSR2 is active
+    VkImageView fbAttachments[2] = { fsr2_.sceneColor.imageView, fsr2_.sceneDepth.imageView };
+    VkFramebufferCreateInfo fbInfo{};
+    fbInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO;
+    fbInfo.renderPass = vkCtx->getImGuiRenderPass();
+    fbInfo.attachmentCount = 2;
+    fbInfo.pAttachments = fbAttachments;
+    fbInfo.width = fsr2_.internalWidth;
+    fbInfo.height = fsr2_.internalHeight;
+    fbInfo.layers = 1;
+    if (vkCreateFramebuffer(device, &fbInfo, nullptr, &fsr2_.sceneFramebuffer) != VK_SUCCESS) {
+        LOG_ERROR("FSR2: failed to create scene framebuffer");
+        destroyFSR2Resources();
+        return false;
+    }
+
+    // Shared failure path for the raw Vulkan calls below: log which call failed,
+    // tear down everything built so far, and tell the caller to bail out.
+    // Without these checks a failed create/allocate would hand a null handle to
+    // the descriptor writes and pipeline creation that follow.
+    auto vkFailed = [this](VkResult res, const char* what) {
+        if (res == VK_SUCCESS) return false;
+        LOG_ERROR("FSR2: ", what, " failed");
+        destroyFSR2Resources();
+        return true;
+    };
+
+    // Samplers
+    VkSamplerCreateInfo samplerInfo{};
+    samplerInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
+    samplerInfo.minFilter = VK_FILTER_LINEAR;
+    samplerInfo.magFilter = VK_FILTER_LINEAR;
+    samplerInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+    samplerInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+    samplerInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+    if (vkFailed(vkCreateSampler(device, &samplerInfo, nullptr, &fsr2_.linearSampler),
+                 "vkCreateSampler (linear)")) return false;
+
+    samplerInfo.minFilter = VK_FILTER_NEAREST;
+    samplerInfo.magFilter = VK_FILTER_NEAREST;
+    if (vkFailed(vkCreateSampler(device, &samplerInfo, nullptr, &fsr2_.nearestSampler),
+                 "vkCreateSampler (nearest)")) return false;
+
+    // --- Motion Vector Compute Pipeline ---
+    {
+        // Descriptor set layout: binding 0 = depth (sampler), binding 1 = motion vectors (storage image)
+        VkDescriptorSetLayoutBinding bindings[2] = {};
+        bindings[0].binding = 0;
+        bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+        bindings[0].descriptorCount = 1;
+        bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+        bindings[1].binding = 1;
+        bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
+        bindings[1].descriptorCount = 1;
+        bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+        VkDescriptorSetLayoutCreateInfo layoutInfo{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO};
+        layoutInfo.bindingCount = 2;
+        layoutInfo.pBindings = bindings;
+        if (vkFailed(vkCreateDescriptorSetLayout(device, &layoutInfo, nullptr, &fsr2_.motionVecDescSetLayout),
+                     "vkCreateDescriptorSetLayout (motion)")) return false;
+
+        VkPushConstantRange pc{};
+        pc.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+        pc.offset = 0;
+        pc.size = sizeof(glm::mat4) + sizeof(glm::vec4); // 80 bytes
+
+        VkPipelineLayoutCreateInfo plCI{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO};
+        plCI.setLayoutCount = 1;
+        plCI.pSetLayouts = &fsr2_.motionVecDescSetLayout;
+        plCI.pushConstantRangeCount = 1;
+        plCI.pPushConstantRanges = &pc;
+        if (vkFailed(vkCreatePipelineLayout(device, &plCI, nullptr, &fsr2_.motionVecPipelineLayout),
+                     "vkCreatePipelineLayout (motion)")) return false;
+
+        VkShaderModule compMod;
+        if (!compMod.loadFromFile(device, "assets/shaders/fsr2_motion.comp.spv")) {
+            LOG_ERROR("FSR2: failed to load motion vector compute shader");
+            destroyFSR2Resources();
+            return false;
+        }
+
+        VkComputePipelineCreateInfo cpCI{VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO};
+        cpCI.stage = compMod.stageInfo(VK_SHADER_STAGE_COMPUTE_BIT);
+        cpCI.layout = fsr2_.motionVecPipelineLayout;
+        if (vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, &cpCI, nullptr, &fsr2_.motionVecPipeline) != VK_SUCCESS) {
+            LOG_ERROR("FSR2: failed to create motion vector pipeline");
+            compMod.destroy();
+            destroyFSR2Resources();
+            return false;
+        }
+        compMod.destroy();
+
+        // Descriptor pool + set
+        VkDescriptorPoolSize poolSizes[2] = {};
+        poolSizes[0] = {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1};
+        poolSizes[1] = {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1};
+        VkDescriptorPoolCreateInfo poolInfo{VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO};
+        poolInfo.maxSets = 1;
+        poolInfo.poolSizeCount = 2;
+        poolInfo.pPoolSizes = poolSizes;
+        if (vkFailed(vkCreateDescriptorPool(device, &poolInfo, nullptr, &fsr2_.motionVecDescPool),
+                     "vkCreateDescriptorPool (motion)")) return false;
+
+        VkDescriptorSetAllocateInfo dsAI{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
+        dsAI.descriptorPool = fsr2_.motionVecDescPool;
+        dsAI.descriptorSetCount = 1;
+        dsAI.pSetLayouts = &fsr2_.motionVecDescSetLayout;
+        if (vkFailed(vkAllocateDescriptorSets(device, &dsAI, &fsr2_.motionVecDescSet),
+                     "vkAllocateDescriptorSets (motion)")) return false;
+
+        // Write descriptors
+        VkDescriptorImageInfo depthImgInfo{};
+        depthImgInfo.sampler = fsr2_.nearestSampler;
+        depthImgInfo.imageView = fsr2_.sceneDepth.imageView;
+        depthImgInfo.imageLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL;
+
+        VkDescriptorImageInfo mvImgInfo{};
+        mvImgInfo.imageView = fsr2_.motionVectors.imageView;
+        mvImgInfo.imageLayout = VK_IMAGE_LAYOUT_GENERAL;
+
+        VkWriteDescriptorSet writes[2] = {};
+        writes[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+        writes[0].dstSet = fsr2_.motionVecDescSet;
+        writes[0].dstBinding = 0;
+        writes[0].descriptorCount = 1;
+        writes[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+        writes[0].pImageInfo = &depthImgInfo;
+
+        writes[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+        writes[1].dstSet = fsr2_.motionVecDescSet;
+        writes[1].dstBinding = 1;
+        writes[1].descriptorCount = 1;
+        writes[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
+        writes[1].pImageInfo = &mvImgInfo;
+
+        vkUpdateDescriptorSets(device, 2, writes, 0, nullptr);
+    }
+
+    // --- Temporal Accumulation Compute Pipeline ---
+    {
+        // bindings: 0=sceneColor, 1=depth, 2=motionVectors, 3=historyInput, 4=historyOutput
+        VkDescriptorSetLayoutBinding bindings[5] = {};
+        for (int i = 0; i < 4; i++) {
+            bindings[i].binding = i;
+            bindings[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+            bindings[i].descriptorCount = 1;
+            bindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+        }
+        bindings[4].binding = 4;
+        bindings[4].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
+        bindings[4].descriptorCount = 1;
+        bindings[4].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+        VkDescriptorSetLayoutCreateInfo layoutInfo{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO};
+        layoutInfo.bindingCount = 5;
+        layoutInfo.pBindings = bindings;
+        if (vkFailed(vkCreateDescriptorSetLayout(device, &layoutInfo, nullptr, &fsr2_.accumulateDescSetLayout),
+                     "vkCreateDescriptorSetLayout (accumulate)")) return false;
+
+        VkPushConstantRange pc{};
+        pc.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+        pc.offset = 0;
+        pc.size = 4 * sizeof(glm::vec4); // 64 bytes
+
+        VkPipelineLayoutCreateInfo plCI{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO};
+        plCI.setLayoutCount = 1;
+        plCI.pSetLayouts = &fsr2_.accumulateDescSetLayout;
+        plCI.pushConstantRangeCount = 1;
+        plCI.pPushConstantRanges = &pc;
+        if (vkFailed(vkCreatePipelineLayout(device, &plCI, nullptr, &fsr2_.accumulatePipelineLayout),
+                     "vkCreatePipelineLayout (accumulate)")) return false;
+
+        VkShaderModule compMod;
+        if (!compMod.loadFromFile(device, "assets/shaders/fsr2_accumulate.comp.spv")) {
+            LOG_ERROR("FSR2: failed to load accumulation compute shader");
+            destroyFSR2Resources();
+            return false;
+        }
+
+        VkComputePipelineCreateInfo cpCI{VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO};
+        cpCI.stage = compMod.stageInfo(VK_SHADER_STAGE_COMPUTE_BIT);
+        cpCI.layout = fsr2_.accumulatePipelineLayout;
+        if (vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, &cpCI, nullptr, &fsr2_.accumulatePipeline) != VK_SUCCESS) {
+            LOG_ERROR("FSR2: failed to create accumulation pipeline");
+            compMod.destroy();
+            destroyFSR2Resources();
+            return false;
+        }
+        compMod.destroy();
+
+        // Descriptor pool: 2 sets (ping-pong), each with 4 samplers + 1 storage image
+        VkDescriptorPoolSize poolSizes[2] = {};
+        poolSizes[0] = {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 8};
+        poolSizes[1] = {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 2};
+        VkDescriptorPoolCreateInfo poolInfo{VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO};
+        poolInfo.maxSets = 2;
+        poolInfo.poolSizeCount = 2;
+        poolInfo.pPoolSizes = poolSizes;
+        if (vkFailed(vkCreateDescriptorPool(device, &poolInfo, nullptr, &fsr2_.accumulateDescPool),
+                     "vkCreateDescriptorPool (accumulate)")) return false;
+
+        // Allocate 2 descriptor sets (one per ping-pong direction)
+        VkDescriptorSetLayout layouts[2] = { fsr2_.accumulateDescSetLayout, fsr2_.accumulateDescSetLayout };
+        VkDescriptorSetAllocateInfo dsAI{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
+        dsAI.descriptorPool = fsr2_.accumulateDescPool;
+        dsAI.descriptorSetCount = 2;
+        dsAI.pSetLayouts = layouts;
+        if (vkFailed(vkAllocateDescriptorSets(device, &dsAI, fsr2_.accumulateDescSets),
+                     "vkAllocateDescriptorSets (accumulate)")) return false;
+
+        // Write descriptors for both ping-pong sets
+        for (int pp = 0; pp < 2; pp++) {
+            int inputHistory = 1 - pp;  // Read from the other
+            int outputHistory = pp;     // Write to this one
+
+            VkDescriptorImageInfo colorInfo{fsr2_.linearSampler, fsr2_.sceneColor.imageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL};
+            VkDescriptorImageInfo depthInfo{fsr2_.nearestSampler, fsr2_.sceneDepth.imageView, VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL};
+            VkDescriptorImageInfo mvInfo{fsr2_.nearestSampler, fsr2_.motionVectors.imageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL};
+            VkDescriptorImageInfo histInInfo{fsr2_.linearSampler, fsr2_.history[inputHistory].imageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL};
+            VkDescriptorImageInfo histOutInfo{VK_NULL_HANDLE, fsr2_.history[outputHistory].imageView, VK_IMAGE_LAYOUT_GENERAL};
+
+            VkWriteDescriptorSet writes[5] = {};
+            for (int w = 0; w < 5; w++) {
+                writes[w].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+                writes[w].dstSet = fsr2_.accumulateDescSets[pp];
+                writes[w].dstBinding = w;
+                writes[w].descriptorCount = 1;
+            }
+            writes[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; writes[0].pImageInfo = &colorInfo;
+            writes[1].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; writes[1].pImageInfo = &depthInfo;
+            writes[2].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; writes[2].pImageInfo = &mvInfo;
+            writes[3].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; writes[3].pImageInfo = &histInInfo;
+            writes[4].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; writes[4].pImageInfo = &histOutInfo;
+
+            vkUpdateDescriptorSets(device, 5, writes, 0, nullptr);
+        }
+    }
+
+    // --- RCAS Sharpening Pipeline (fragment shader, fullscreen pass) ---
+    {
+        VkDescriptorSetLayoutBinding binding{};
+        binding.binding = 0;
+        binding.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+        binding.descriptorCount = 1;
+        binding.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
+
+        VkDescriptorSetLayoutCreateInfo layoutInfo{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO};
+        layoutInfo.bindingCount = 1;
+        layoutInfo.pBindings = &binding;
+        if (vkFailed(vkCreateDescriptorSetLayout(device, &layoutInfo, nullptr, &fsr2_.sharpenDescSetLayout),
+                     "vkCreateDescriptorSetLayout (sharpen)")) return false;
+
+        VkPushConstantRange pc{};
+        pc.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
+        pc.offset = 0;
+        pc.size = sizeof(glm::vec4);
+
+        VkPipelineLayoutCreateInfo plCI{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO};
+        plCI.setLayoutCount = 1;
+        plCI.pSetLayouts = &fsr2_.sharpenDescSetLayout;
+        plCI.pushConstantRangeCount = 1;
+        plCI.pPushConstantRanges = &pc;
+        if (vkFailed(vkCreatePipelineLayout(device, &plCI, nullptr, &fsr2_.sharpenPipelineLayout),
+                     "vkCreatePipelineLayout (sharpen)")) return false;
+
+        // Loaded one at a time so that a fragment-shader failure can release
+        // the vertex module that was already loaded.
+        VkShaderModule vertMod, fragMod;
+        if (!vertMod.loadFromFile(device, "assets/shaders/postprocess.vert.spv")) {
+            LOG_ERROR("FSR2: failed to load sharpen shaders");
+            destroyFSR2Resources();
+            return false;
+        }
+        if (!fragMod.loadFromFile(device, "assets/shaders/fsr2_sharpen.frag.spv")) {
+            LOG_ERROR("FSR2: failed to load sharpen shaders");
+            vertMod.destroy();
+            destroyFSR2Resources();
+            return false;
+        }
+
+        fsr2_.sharpenPipeline = PipelineBuilder()
+            .setShaders(vertMod.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
+                        fragMod.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
+            .setVertexInput({}, {})
+            .setTopology(VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST)
+            .setRasterization(VK_POLYGON_MODE_FILL, VK_CULL_MODE_NONE)
+            .setNoDepthTest()
+            .setColorBlendAttachment(PipelineBuilder::blendDisabled())
+            .setMultisample(VK_SAMPLE_COUNT_1_BIT)
+            .setLayout(fsr2_.sharpenPipelineLayout)
+            .setRenderPass(vkCtx->getImGuiRenderPass())
+            .setDynamicStates({VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR})
+            .build(device);
+
+        vertMod.destroy();
+        fragMod.destroy();
+
+        if (!fsr2_.sharpenPipeline) {
+            LOG_ERROR("FSR2: failed to create sharpen pipeline");
+            destroyFSR2Resources();
+            return false;
+        }
+
+        // Descriptor pool + sets for sharpen pass (double-buffered to avoid race condition)
+        VkDescriptorPoolSize poolSize{VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 2};
+        VkDescriptorPoolCreateInfo poolInfo{VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO};
+        poolInfo.maxSets = 2;
+        poolInfo.poolSizeCount = 1;
+        poolInfo.pPoolSizes = &poolSize;
+        if (vkFailed(vkCreateDescriptorPool(device, &poolInfo, nullptr, &fsr2_.sharpenDescPool),
+                     "vkCreateDescriptorPool (sharpen)")) return false;
+
+        VkDescriptorSetLayout layouts[2] = {fsr2_.sharpenDescSetLayout, fsr2_.sharpenDescSetLayout};
+        VkDescriptorSetAllocateInfo dsAI{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
+        dsAI.descriptorPool = fsr2_.sharpenDescPool;
+        dsAI.descriptorSetCount = 2;
+        dsAI.pSetLayouts = layouts;
+        if (vkFailed(vkAllocateDescriptorSets(device, &dsAI, fsr2_.sharpenDescSets),
+                     "vkAllocateDescriptorSets (sharpen)")) return false;
+        // Descriptors updated dynamically each frame to point at the correct history buffer
+    }
+
+    // Fresh resources hold no valid history: request a reset and restart the frame counter.
+    fsr2_.needsHistoryReset = true;
+    fsr2_.frameIndex = 0;
+    LOG_INFO("FSR2: initialized successfully");
+    return true;
+}
+
+void Renderer::destroyFSR2Resources() {
+    if (!vkCtx) return;
+    VkDevice device = vkCtx->getDevice();
+    VmaAllocator alloc = vkCtx->getAllocator();
+
+    vkDeviceWaitIdle(device);
+
+    if (fsr2_.sharpenPipeline) { vkDestroyPipeline(device, fsr2_.sharpenPipeline, nullptr); fsr2_.sharpenPipeline = VK_NULL_HANDLE; }
+    if (fsr2_.sharpenPipelineLayout) { vkDestroyPipelineLayout(device, fsr2_.sharpenPipelineLayout,
nullptr); fsr2_.sharpenPipelineLayout = VK_NULL_HANDLE; } + if (fsr2_.sharpenDescPool) { vkDestroyDescriptorPool(device, fsr2_.sharpenDescPool, nullptr); fsr2_.sharpenDescPool = VK_NULL_HANDLE; fsr2_.sharpenDescSets[0] = fsr2_.sharpenDescSets[1] = VK_NULL_HANDLE; } + if (fsr2_.sharpenDescSetLayout) { vkDestroyDescriptorSetLayout(device, fsr2_.sharpenDescSetLayout, nullptr); fsr2_.sharpenDescSetLayout = VK_NULL_HANDLE; } + + if (fsr2_.accumulatePipeline) { vkDestroyPipeline(device, fsr2_.accumulatePipeline, nullptr); fsr2_.accumulatePipeline = VK_NULL_HANDLE; } + if (fsr2_.accumulatePipelineLayout) { vkDestroyPipelineLayout(device, fsr2_.accumulatePipelineLayout, nullptr); fsr2_.accumulatePipelineLayout = VK_NULL_HANDLE; } + if (fsr2_.accumulateDescPool) { vkDestroyDescriptorPool(device, fsr2_.accumulateDescPool, nullptr); fsr2_.accumulateDescPool = VK_NULL_HANDLE; fsr2_.accumulateDescSets[0] = fsr2_.accumulateDescSets[1] = VK_NULL_HANDLE; } + if (fsr2_.accumulateDescSetLayout) { vkDestroyDescriptorSetLayout(device, fsr2_.accumulateDescSetLayout, nullptr); fsr2_.accumulateDescSetLayout = VK_NULL_HANDLE; } + + if (fsr2_.motionVecPipeline) { vkDestroyPipeline(device, fsr2_.motionVecPipeline, nullptr); fsr2_.motionVecPipeline = VK_NULL_HANDLE; } + if (fsr2_.motionVecPipelineLayout) { vkDestroyPipelineLayout(device, fsr2_.motionVecPipelineLayout, nullptr); fsr2_.motionVecPipelineLayout = VK_NULL_HANDLE; } + if (fsr2_.motionVecDescPool) { vkDestroyDescriptorPool(device, fsr2_.motionVecDescPool, nullptr); fsr2_.motionVecDescPool = VK_NULL_HANDLE; fsr2_.motionVecDescSet = VK_NULL_HANDLE; } + if (fsr2_.motionVecDescSetLayout) { vkDestroyDescriptorSetLayout(device, fsr2_.motionVecDescSetLayout, nullptr); fsr2_.motionVecDescSetLayout = VK_NULL_HANDLE; } + + if (fsr2_.sceneFramebuffer) { vkDestroyFramebuffer(device, fsr2_.sceneFramebuffer, nullptr); fsr2_.sceneFramebuffer = VK_NULL_HANDLE; } + if (fsr2_.linearSampler) { vkDestroySampler(device, fsr2_.linearSampler, nullptr); 
fsr2_.linearSampler = VK_NULL_HANDLE; } + if (fsr2_.nearestSampler) { vkDestroySampler(device, fsr2_.nearestSampler, nullptr); fsr2_.nearestSampler = VK_NULL_HANDLE; } + + destroyImage(device, alloc, fsr2_.motionVectors); + for (int i = 0; i < 2; i++) destroyImage(device, alloc, fsr2_.history[i]); + destroyImage(device, alloc, fsr2_.sceneDepth); + destroyImage(device, alloc, fsr2_.sceneColor); + + fsr2_.internalWidth = 0; + fsr2_.internalHeight = 0; +} + +void Renderer::dispatchMotionVectors() { + if (!fsr2_.motionVecPipeline || currentCmd == VK_NULL_HANDLE) return; + + // Transition depth: DEPTH_STENCIL_ATTACHMENT → DEPTH_STENCIL_READ_ONLY + transitionImageLayout(currentCmd, fsr2_.sceneDepth.image, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL, + VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); + + // Transition motion vectors: UNDEFINED → GENERAL + transitionImageLayout(currentCmd, fsr2_.motionVectors.image, + VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); + + vkCmdBindPipeline(currentCmd, VK_PIPELINE_BIND_POINT_COMPUTE, fsr2_.motionVecPipeline); + vkCmdBindDescriptorSets(currentCmd, VK_PIPELINE_BIND_POINT_COMPUTE, + fsr2_.motionVecPipelineLayout, 0, 1, &fsr2_.motionVecDescSet, 0, nullptr); + + // Single reprojection matrix: prevUnjitteredVP * inv(currentUnjitteredVP) + // Both matrices are unjittered — jitter only affects sub-pixel sampling, + // not motion vector computation. This avoids numerical instability from + // jitter amplification through large world coordinates. 
+ struct { + glm::mat4 reprojMatrix; // prevUnjitteredVP * inv(currentUnjitteredVP) + glm::vec4 resolution; + } pc; + + glm::mat4 currentUnjitteredVP = camera->getUnjitteredViewProjectionMatrix(); + pc.reprojMatrix = fsr2_.prevViewProjection * glm::inverse(currentUnjitteredVP); + pc.resolution = glm::vec4( + static_cast<float>(fsr2_.internalWidth), + static_cast<float>(fsr2_.internalHeight), + 1.0f / fsr2_.internalWidth, + 1.0f / fsr2_.internalHeight); + + vkCmdPushConstants(currentCmd, fsr2_.motionVecPipelineLayout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); + + uint32_t gx = (fsr2_.internalWidth + 7) / 8; + uint32_t gy = (fsr2_.internalHeight + 7) / 8; + vkCmdDispatch(currentCmd, gx, gy, 1); + + // Transition motion vectors: GENERAL → SHADER_READ_ONLY for accumulation + transitionImageLayout(currentCmd, fsr2_.motionVectors.image, + VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); +} + +void Renderer::dispatchTemporalAccumulate() { + if (!fsr2_.accumulatePipeline || currentCmd == VK_NULL_HANDLE) return; + + VkExtent2D swapExtent = vkCtx->getSwapchainExtent(); + uint32_t outputIdx = fsr2_.currentHistory; + uint32_t inputIdx = 1 - outputIdx; + + // Transition scene color: PRESENT_SRC_KHR → SHADER_READ_ONLY + transitionImageLayout(currentCmd, fsr2_.sceneColor.image, + VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); + + // History layout lifecycle: + // First frame: both in UNDEFINED + // Subsequent frames: both in SHADER_READ_ONLY (output was transitioned for sharpen, + // input was left in SHADER_READ_ONLY from its sharpen read) + VkImageLayout historyOldLayout = fsr2_.needsHistoryReset + ?
VK_IMAGE_LAYOUT_UNDEFINED + : VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + + // Transition history input: SHADER_READ_ONLY → SHADER_READ_ONLY (barrier for sync) + transitionImageLayout(currentCmd, fsr2_.history[inputIdx].image, + historyOldLayout, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // sharpen read in previous frame + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); + + // Transition history output: SHADER_READ_ONLY → GENERAL (for compute write) + transitionImageLayout(currentCmd, fsr2_.history[outputIdx].image, + historyOldLayout, VK_IMAGE_LAYOUT_GENERAL, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); + + vkCmdBindPipeline(currentCmd, VK_PIPELINE_BIND_POINT_COMPUTE, fsr2_.accumulatePipeline); + vkCmdBindDescriptorSets(currentCmd, VK_PIPELINE_BIND_POINT_COMPUTE, + fsr2_.accumulatePipelineLayout, 0, 1, &fsr2_.accumulateDescSets[outputIdx], 0, nullptr); + + // Push constants + struct { + glm::vec4 internalSize; + glm::vec4 displaySize; + glm::vec4 jitterOffset; + glm::vec4 params; + } pc; + + pc.internalSize = glm::vec4( + static_cast<float>(fsr2_.internalWidth), static_cast<float>(fsr2_.internalHeight), + 1.0f / fsr2_.internalWidth, 1.0f / fsr2_.internalHeight); + pc.displaySize = glm::vec4( + static_cast<float>(swapExtent.width), static_cast<float>(swapExtent.height), + 1.0f / swapExtent.width, 1.0f / swapExtent.height); + glm::vec2 jitter = camera->getJitter(); + pc.jitterOffset = glm::vec4(jitter.x, jitter.y, 0.0f, 0.0f); + pc.params = glm::vec4(fsr2_.needsHistoryReset ?
1.0f : 0.0f, fsr2_.sharpness, 0.0f, 0.0f); + + vkCmdPushConstants(currentCmd, fsr2_.accumulatePipelineLayout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); + + uint32_t gx = (swapExtent.width + 7) / 8; + uint32_t gy = (swapExtent.height + 7) / 8; + vkCmdDispatch(currentCmd, gx, gy, 1); + + fsr2_.needsHistoryReset = false; +} + +void Renderer::renderFSR2Sharpen() { + if (!fsr2_.sharpenPipeline || currentCmd == VK_NULL_HANDLE) return; + + VkExtent2D ext = vkCtx->getSwapchainExtent(); + uint32_t outputIdx = fsr2_.currentHistory; + + // Use per-frame descriptor set to avoid race with in-flight command buffers + uint32_t frameIdx = vkCtx->getCurrentFrame(); + VkDescriptorSet descSet = fsr2_.sharpenDescSets[frameIdx]; + + // Update sharpen descriptor to point at current history output + VkDescriptorImageInfo imgInfo{}; + imgInfo.sampler = fsr2_.linearSampler; + imgInfo.imageView = fsr2_.history[outputIdx].imageView; + imgInfo.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + + VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET}; + write.dstSet = descSet; + write.dstBinding = 0; + write.descriptorCount = 1; + write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + write.pImageInfo = &imgInfo; + vkUpdateDescriptorSets(vkCtx->getDevice(), 1, &write, 0, nullptr); + + vkCmdBindPipeline(currentCmd, VK_PIPELINE_BIND_POINT_GRAPHICS, fsr2_.sharpenPipeline); + vkCmdBindDescriptorSets(currentCmd, VK_PIPELINE_BIND_POINT_GRAPHICS, + fsr2_.sharpenPipelineLayout, 0, 1, &descSet, 0, nullptr); + + glm::vec4 params(1.0f / ext.width, 1.0f / ext.height, fsr2_.sharpness, 0.0f); + vkCmdPushConstants(currentCmd, fsr2_.sharpenPipelineLayout, + VK_SHADER_STAGE_FRAGMENT_BIT, 0, sizeof(glm::vec4), &params); + + vkCmdDraw(currentCmd, 3, 1, 0, 0); +} + +void Renderer::setFSR2Enabled(bool enabled) { + if (fsr2_.enabled == enabled) return; + fsr2_.enabled = enabled; + + if (enabled) { + // FSR2 replaces both FSR1 and MSAA + if (fsr_.enabled) { + fsr_.enabled =
false; + fsr_.needsRecreate = true; + } + // FSR2 requires non-MSAA render pass (its framebuffer has 2 attachments) + if (vkCtx && vkCtx->getMsaaSamples() > VK_SAMPLE_COUNT_1_BIT) { + pendingMsaaSamples_ = VK_SAMPLE_COUNT_1_BIT; + msaaChangePending_ = true; + } + // Use FSR1's scale factor and sharpness as defaults + fsr2_.scaleFactor = fsr_.scaleFactor; + fsr2_.sharpness = fsr_.sharpness; + fsr2_.needsHistoryReset = true; + } else { + fsr2_.needsRecreate = true; + if (camera) camera->clearJitter(); + } +} + +// ========================= End FSR 2.2 ========================= + void Renderer::renderWorld(game::World* world, game::GameHandler* gameHandler) { (void)world; @@ -3233,153 +4274,283 @@ void Renderer::renderWorld(game::World* world, game::GameHandler* gameHandler) { // Get time of day for sky-related rendering float timeOfDay = (skySystem && skySystem->getSkybox()) ? skySystem->getSkybox()->getTimeOfDay() : 12.0f; - // Render sky system (unified coordinator for skybox, stars, celestial, clouds, lens flare) - if (skySystem && camera && !skipSky) { - rendering::SkyParams skyParams; - skyParams.timeOfDay = timeOfDay; - skyParams.gameTime = gameHandler ? 
gameHandler->getGameTime() : -1.0f; - - if (lightingManager) { - const auto& lighting = lightingManager->getLightingParams(); - skyParams.directionalDir = lighting.directionalDir; - skyParams.sunColor = lighting.diffuseColor; - skyParams.skyTopColor = lighting.skyTopColor; - skyParams.skyMiddleColor = lighting.skyMiddleColor; - skyParams.skyBand1Color = lighting.skyBand1Color; - skyParams.skyBand2Color = lighting.skyBand2Color; - skyParams.cloudDensity = lighting.cloudDensity; - skyParams.fogDensity = lighting.fogDensity; - skyParams.horizonGlow = lighting.horizonGlow; - } - - // Weather attenuation for lens flare - if (gameHandler) { - skyParams.weatherIntensity = gameHandler->getWeatherIntensity(); - } - - skyParams.skyboxModelId = 0; - skyParams.skyboxHasStars = false; - - skySystem->render(currentCmd, perFrameSet, *camera, skyParams); - } - - // Terrain (opaque pass) - if (terrainRenderer && camera && terrainEnabled && !skipTerrain) { - auto terrainStart = std::chrono::steady_clock::now(); - terrainRenderer->render(currentCmd, perFrameSet, *camera); - lastTerrainRenderMs = std::chrono::duration<double, std::milli>( - std::chrono::steady_clock::now() - terrainStart).count(); - } - - // WMO buildings (opaque, drawn before characters so selection circle sits on top) - if (wmoRenderer && camera && !skipWMO) { - auto wmoStart = std::chrono::steady_clock::now(); - wmoRenderer->render(currentCmd, perFrameSet, *camera); - lastWMORenderMs = std::chrono::duration<double, std::milli>( - std::chrono::steady_clock::now() - wmoStart).count(); - } - - // Selection circle (drawn after WMO, before characters) - renderSelectionCircle(view, projection); - - // Characters (after selection circle so units draw over the ring) - if (characterRenderer && camera && !skipChars) { - characterRenderer->render(currentCmd, perFrameSet, *camera); - } - - // M2 doodads, creatures, glow sprites, particles - if (m2Renderer && camera && !skipM2) { - if (cameraController) { + // ── Multithreaded secondary command buffer recording ── +
// Terrain, WMO, and M2 record on worker threads while main thread handles + // sky, characters, water, and effects. prepareRender() on main thread first + // to handle thread-unsafe GPU allocations (descriptor pools, bone SSBOs). + if (parallelRecordingEnabled_) { + // --- Pre-compute state + GPU allocations on main thread (not thread-safe) --- + if (m2Renderer && cameraController) { m2Renderer->setInsideInterior(cameraController->isInsideWMO()); m2Renderer->setOnTaxi(cameraController->isOnTaxi()); } - auto m2Start = std::chrono::steady_clock::now(); - m2Renderer->render(currentCmd, perFrameSet, *camera); - m2Renderer->renderSmokeParticles(currentCmd, perFrameSet); - m2Renderer->renderM2Particles(currentCmd, perFrameSet); - lastM2RenderMs = std::chrono::duration<double, std::milli>( - std::chrono::steady_clock::now() - m2Start).count(); - } + if (wmoRenderer) wmoRenderer->prepareRender(); + if (m2Renderer && camera) m2Renderer->prepareRender(frameIdx, *camera); + if (characterRenderer) characterRenderer->prepareRender(frameIdx); - // Water (transparent, after all opaques) - // When MSAA is on and 1x pass is available, water renders after main pass ends - bool waterDeferred = waterRenderer && waterRenderer->hasWater1xPass() - && vkCtx->getMsaaSamples() != VK_SAMPLE_COUNT_1_BIT; - if (waterRenderer && camera && !waterDeferred) { - waterRenderer->render(currentCmd, perFrameSet, *camera, globalTime, false, vkCtx->getCurrentFrame()); - } + // --- Dispatch worker threads (terrain + WMO + M2) --- + std::future<double> terrainFuture, wmoFuture, m2Future; - // Weather particles - if (weather && camera) { - weather->render(currentCmd, perFrameSet); - } - - // Swim effects (ripples, bubbles) - if (swimEffects && camera) { - swimEffects->render(currentCmd, perFrameSet); - } - - // Mount dust - if (mountDust && camera) { - mountDust->render(currentCmd, perFrameSet); - } - - // Charge effect - if (chargeEffect && camera) { - chargeEffect->render(currentCmd, perFrameSet); - } - - // Quest markers
(billboards above NPCs) - if (questMarkerRenderer && camera) { - questMarkerRenderer->render(currentCmd, perFrameSet, *camera); - } - - // Underwater blue fog overlay — only for terrain water, not WMO water. - if (overlayPipeline && waterRenderer && camera) { - glm::vec3 camPos = camera->getPosition(); - auto waterH = waterRenderer->getNearestWaterHeightAt(camPos.x, camPos.y, camPos.z); - constexpr float MIN_SUBMERSION_OVERLAY = 1.5f; - if (waterH && camPos.z < (*waterH - MIN_SUBMERSION_OVERLAY) - && !waterRenderer->isWmoWaterAt(camPos.x, camPos.y)) { - float depth = *waterH - camPos.z - MIN_SUBMERSION_OVERLAY; - - // Check for canal (liquid type 5, 13, 17) — denser/darker fog - bool canal = false; - if (auto lt = waterRenderer->getWaterTypeAt(camPos.x, camPos.y)) - canal = (*lt == 5 || *lt == 13 || *lt == 17); - - // Fog opacity increases with depth: thin at surface, thick deep down - float fogStrength = 1.0f - std::exp(-depth * (canal ? 0.25f : 0.12f)); - fogStrength = glm::clamp(fogStrength, 0.0f, 0.75f); - - glm::vec4 tint = canal - ? 
glm::vec4(0.01f, 0.04f, 0.10f, fogStrength) - : glm::vec4(0.03f, 0.09f, 0.18f, fogStrength); - renderOverlay(tint); + if (terrainRenderer && camera && terrainEnabled && !skipTerrain) { + terrainFuture = std::async(std::launch::async, [&]() -> double { + auto t0 = std::chrono::steady_clock::now(); + VkCommandBuffer cmd = beginSecondary(SEC_TERRAIN); + setSecondaryViewportScissor(cmd); + terrainRenderer->render(cmd, perFrameSet, *camera); + vkEndCommandBuffer(cmd); + return std::chrono::duration<double, std::milli>( + std::chrono::steady_clock::now() - t0).count(); + }); } + + if (wmoRenderer && camera && !skipWMO) { + wmoFuture = std::async(std::launch::async, [&]() -> double { + auto t0 = std::chrono::steady_clock::now(); + VkCommandBuffer cmd = beginSecondary(SEC_WMO); + setSecondaryViewportScissor(cmd); + wmoRenderer->render(cmd, perFrameSet, *camera); + vkEndCommandBuffer(cmd); + return std::chrono::duration<double, std::milli>( + std::chrono::steady_clock::now() - t0).count(); + }); + } + + if (m2Renderer && camera && !skipM2) { + m2Future = std::async(std::launch::async, [&]() -> double { + auto t0 = std::chrono::steady_clock::now(); + VkCommandBuffer cmd = beginSecondary(SEC_M2); + setSecondaryViewportScissor(cmd); + m2Renderer->render(cmd, perFrameSet, *camera); + m2Renderer->renderSmokeParticles(cmd, perFrameSet); + m2Renderer->renderM2Particles(cmd, perFrameSet); + vkEndCommandBuffer(cmd); + return std::chrono::duration<double, std::milli>( + std::chrono::steady_clock::now() - t0).count(); + }); + } + + // --- Main thread: record sky (SEC_SKY) --- + { + VkCommandBuffer cmd = beginSecondary(SEC_SKY); + setSecondaryViewportScissor(cmd); + if (skySystem && camera && !skipSky) { + rendering::SkyParams skyParams; + skyParams.timeOfDay = timeOfDay; + skyParams.gameTime = gameHandler ?
gameHandler->getGameTime() : -1.0f; + if (lightingManager) { + const auto& lighting = lightingManager->getLightingParams(); + skyParams.directionalDir = lighting.directionalDir; + skyParams.sunColor = lighting.diffuseColor; + skyParams.skyTopColor = lighting.skyTopColor; + skyParams.skyMiddleColor = lighting.skyMiddleColor; + skyParams.skyBand1Color = lighting.skyBand1Color; + skyParams.skyBand2Color = lighting.skyBand2Color; + skyParams.cloudDensity = lighting.cloudDensity; + skyParams.fogDensity = lighting.fogDensity; + skyParams.horizonGlow = lighting.horizonGlow; + } + if (gameHandler) skyParams.weatherIntensity = gameHandler->getWeatherIntensity(); + skyParams.skyboxModelId = 0; + skyParams.skyboxHasStars = false; + skySystem->render(cmd, perFrameSet, *camera, skyParams); + } + vkEndCommandBuffer(cmd); + } + + // --- Main thread: record characters + selection circle (SEC_CHARS) --- + { + VkCommandBuffer cmd = beginSecondary(SEC_CHARS); + setSecondaryViewportScissor(cmd); + renderSelectionCircle(view, projection, cmd); + if (characterRenderer && camera && !skipChars) { + characterRenderer->render(cmd, perFrameSet, *camera); + } + vkEndCommandBuffer(cmd); + } + + // --- Wait for workers --- + if (terrainFuture.valid()) lastTerrainRenderMs = terrainFuture.get(); + if (wmoFuture.valid()) lastWMORenderMs = wmoFuture.get(); + if (m2Future.valid()) lastM2RenderMs = m2Future.get(); + + // --- Main thread: record post-opaque (SEC_POST) --- + { + VkCommandBuffer cmd = beginSecondary(SEC_POST); + setSecondaryViewportScissor(cmd); + if (waterRenderer && camera) + waterRenderer->render(cmd, perFrameSet, *camera, globalTime, false, frameIdx); + if (weather && camera) weather->render(cmd, perFrameSet); + if (swimEffects && camera) swimEffects->render(cmd, perFrameSet); + if (mountDust && camera) mountDust->render(cmd, perFrameSet); + if (chargeEffect && camera) chargeEffect->render(cmd, perFrameSet); + if (questMarkerRenderer && camera) questMarkerRenderer->render(cmd, 
perFrameSet, *camera); + + // Underwater overlay + minimap + if (overlayPipeline && waterRenderer && camera) { + glm::vec3 camPos = camera->getPosition(); + auto waterH = waterRenderer->getNearestWaterHeightAt(camPos.x, camPos.y, camPos.z); + constexpr float MIN_SUBMERSION_OVERLAY = 1.5f; + if (waterH && camPos.z < (*waterH - MIN_SUBMERSION_OVERLAY) + && !waterRenderer->isWmoWaterAt(camPos.x, camPos.y)) { + float depth = *waterH - camPos.z - MIN_SUBMERSION_OVERLAY; + bool canal = false; + if (auto lt = waterRenderer->getWaterTypeAt(camPos.x, camPos.y)) + canal = (*lt == 5 || *lt == 13 || *lt == 17); + float fogStrength = 1.0f - std::exp(-depth * (canal ? 0.25f : 0.12f)); + fogStrength = glm::clamp(fogStrength, 0.0f, 0.75f); + glm::vec4 tint = canal + ? glm::vec4(0.01f, 0.04f, 0.10f, fogStrength) + : glm::vec4(0.03f, 0.09f, 0.18f, fogStrength); + renderOverlay(tint, cmd); + } + } + if (minimap && minimap->isEnabled() && camera && window) { + glm::vec3 minimapCenter = camera->getPosition(); + if (cameraController && cameraController->isThirdPerson()) + minimapCenter = characterPosition; + float minimapPlayerOrientation = 0.0f; + bool hasMinimapPlayerOrientation = false; + if (cameraController) { + float facingRad = glm::radians(characterYaw); + glm::vec3 facingFwd(std::cos(facingRad), std::sin(facingRad), 0.0f); + minimapPlayerOrientation = std::atan2(-facingFwd.x, facingFwd.y); + hasMinimapPlayerOrientation = true; + } else if (gameHandler) { + minimapPlayerOrientation = gameHandler->getMovementInfo().orientation; + hasMinimapPlayerOrientation = true; + } + minimap->render(cmd, *camera, minimapCenter, + window->getWidth(), window->getHeight(), + minimapPlayerOrientation, hasMinimapPlayerOrientation); + } + vkEndCommandBuffer(cmd); + } + + // --- Execute all secondary buffers in correct draw order --- + VkCommandBuffer validCmds[6]; + uint32_t numCmds = 0; + validCmds[numCmds++] = secondaryCmds_[SEC_SKY][frameIdx]; + if (terrainRenderer && camera && terrainEnabled && 
!skipTerrain) + validCmds[numCmds++] = secondaryCmds_[SEC_TERRAIN][frameIdx]; + if (wmoRenderer && camera && !skipWMO) + validCmds[numCmds++] = secondaryCmds_[SEC_WMO][frameIdx]; + validCmds[numCmds++] = secondaryCmds_[SEC_CHARS][frameIdx]; + if (m2Renderer && camera && !skipM2) + validCmds[numCmds++] = secondaryCmds_[SEC_M2][frameIdx]; + validCmds[numCmds++] = secondaryCmds_[SEC_POST][frameIdx]; + + vkCmdExecuteCommands(currentCmd, numCmds, validCmds); + + } else { + // ── Fallback: single-threaded inline recording (original path) ── + + if (skySystem && camera && !skipSky) { + rendering::SkyParams skyParams; + skyParams.timeOfDay = timeOfDay; + skyParams.gameTime = gameHandler ? gameHandler->getGameTime() : -1.0f; + if (lightingManager) { + const auto& lighting = lightingManager->getLightingParams(); + skyParams.directionalDir = lighting.directionalDir; + skyParams.sunColor = lighting.diffuseColor; + skyParams.skyTopColor = lighting.skyTopColor; + skyParams.skyMiddleColor = lighting.skyMiddleColor; + skyParams.skyBand1Color = lighting.skyBand1Color; + skyParams.skyBand2Color = lighting.skyBand2Color; + skyParams.cloudDensity = lighting.cloudDensity; + skyParams.fogDensity = lighting.fogDensity; + skyParams.horizonGlow = lighting.horizonGlow; + } + if (gameHandler) skyParams.weatherIntensity = gameHandler->getWeatherIntensity(); + skyParams.skyboxModelId = 0; + skyParams.skyboxHasStars = false; + skySystem->render(currentCmd, perFrameSet, *camera, skyParams); + } + + if (terrainRenderer && camera && terrainEnabled && !skipTerrain) { + auto terrainStart = std::chrono::steady_clock::now(); + terrainRenderer->render(currentCmd, perFrameSet, *camera); + lastTerrainRenderMs = std::chrono::duration<double, std::milli>( + std::chrono::steady_clock::now() - terrainStart).count(); + } + + if (wmoRenderer && camera && !skipWMO) { + wmoRenderer->prepareRender(); + auto wmoStart = std::chrono::steady_clock::now(); + wmoRenderer->render(currentCmd, perFrameSet, *camera); + lastWMORenderMs =
std::chrono::duration<double, std::milli>( + std::chrono::steady_clock::now() - wmoStart).count(); + } + + renderSelectionCircle(view, projection); + + if (characterRenderer && camera && !skipChars) { + characterRenderer->prepareRender(frameIdx); + characterRenderer->render(currentCmd, perFrameSet, *camera); + } + + if (m2Renderer && camera && !skipM2) { + if (cameraController) { + m2Renderer->setInsideInterior(cameraController->isInsideWMO()); + m2Renderer->setOnTaxi(cameraController->isOnTaxi()); + } + m2Renderer->prepareRender(frameIdx, *camera); + auto m2Start = std::chrono::steady_clock::now(); + m2Renderer->render(currentCmd, perFrameSet, *camera); + m2Renderer->renderSmokeParticles(currentCmd, perFrameSet); + m2Renderer->renderM2Particles(currentCmd, perFrameSet); + lastM2RenderMs = std::chrono::duration<double, std::milli>( + std::chrono::steady_clock::now() - m2Start).count(); + } + + if (waterRenderer && camera) + waterRenderer->render(currentCmd, perFrameSet, *camera, globalTime, false, frameIdx); + if (weather && camera) weather->render(currentCmd, perFrameSet); + if (swimEffects && camera) swimEffects->render(currentCmd, perFrameSet); + if (mountDust && camera) mountDust->render(currentCmd, perFrameSet); + if (chargeEffect && camera) chargeEffect->render(currentCmd, perFrameSet); + if (questMarkerRenderer && camera) questMarkerRenderer->render(currentCmd, perFrameSet, *camera); } - // Minimap overlay - if (minimap && minimap->isEnabled() && camera && window) { - glm::vec3 minimapCenter = camera->getPosition(); - if (cameraController && cameraController->isThirdPerson()) - minimapCenter = characterPosition; - float minimapPlayerOrientation = 0.0f; - bool hasMinimapPlayerOrientation = false; - if (cameraController) { - // Use the same yaw that drives character model rendering so minimap - // orientation cannot drift by a different axis/sign convention.
- float facingRad = glm::radians(characterYaw); - glm::vec3 facingFwd(std::cos(facingRad), std::sin(facingRad), 0.0f); - minimapPlayerOrientation = std::atan2(-facingFwd.x, facingFwd.y); - hasMinimapPlayerOrientation = true; - } else if (gameHandler) { - minimapPlayerOrientation = gameHandler->getMovementInfo().orientation; - hasMinimapPlayerOrientation = true; + // Underwater overlay and minimap — in the fallback path these run inline; + // in the parallel path they were already recorded into SEC_POST above. + if (!parallelRecordingEnabled_) { + if (overlayPipeline && waterRenderer && camera) { + glm::vec3 camPos = camera->getPosition(); + auto waterH = waterRenderer->getNearestWaterHeightAt(camPos.x, camPos.y, camPos.z); + constexpr float MIN_SUBMERSION_OVERLAY = 1.5f; + if (waterH && camPos.z < (*waterH - MIN_SUBMERSION_OVERLAY) + && !waterRenderer->isWmoWaterAt(camPos.x, camPos.y)) { + float depth = *waterH - camPos.z - MIN_SUBMERSION_OVERLAY; + bool canal = false; + if (auto lt = waterRenderer->getWaterTypeAt(camPos.x, camPos.y)) + canal = (*lt == 5 || *lt == 13 || *lt == 17); + float fogStrength = 1.0f - std::exp(-depth * (canal ? 0.25f : 0.12f)); + fogStrength = glm::clamp(fogStrength, 0.0f, 0.75f); + glm::vec4 tint = canal + ? 
glm::vec4(0.01f, 0.04f, 0.10f, fogStrength) + : glm::vec4(0.03f, 0.09f, 0.18f, fogStrength); + renderOverlay(tint); + } + } + if (minimap && minimap->isEnabled() && camera && window) { + glm::vec3 minimapCenter = camera->getPosition(); + if (cameraController && cameraController->isThirdPerson()) + minimapCenter = characterPosition; + float minimapPlayerOrientation = 0.0f; + bool hasMinimapPlayerOrientation = false; + if (cameraController) { + float facingRad = glm::radians(characterYaw); + glm::vec3 facingFwd(std::cos(facingRad), std::sin(facingRad), 0.0f); + minimapPlayerOrientation = std::atan2(-facingFwd.x, facingFwd.y); + hasMinimapPlayerOrientation = true; + } else if (gameHandler) { + minimapPlayerOrientation = gameHandler->getMovementInfo().orientation; + hasMinimapPlayerOrientation = true; + } + minimap->render(currentCmd, *camera, minimapCenter, + window->getWidth(), window->getHeight(), + minimapPlayerOrientation, hasMinimapPlayerOrientation); } - minimap->render(currentCmd, *camera, minimapCenter, - window->getWidth(), window->getHeight(), - minimapPlayerOrientation, hasMinimapPlayerOrientation); } auto renderEnd = std::chrono::steady_clock::now(); @@ -3413,8 +4584,6 @@ bool Renderer::initializeRenderers(pipeline::AssetManager* assetManager, const s if (!waterRenderer->initialize(vkCtx, perFrameSetLayout)) { LOG_ERROR("Failed to initialize water renderer"); waterRenderer.reset(); - } else if (vkCtx->getMsaaSamples() != VK_SAMPLE_COUNT_1_BIT) { - setupWater1xPass(); } } @@ -3827,27 +4996,32 @@ glm::mat4 Renderer::computeLightSpaceMatrix() { shadowCenter = desiredCenter; glm::vec3 center = shadowCenter; - // Snap to shadow texel grid to keep projection stable while moving. + // Snap shadow frustum to texel grid so the projection is perfectly stable + // while moving. We compute the light's right/up axes from the sun direction + // (these are constant per frame regardless of center) and snap center along + // them before building the view matrix. 
float halfExtent = kShadowHalfExtent; float texelWorld = (2.0f * halfExtent) / static_cast(SHADOW_MAP_SIZE); - // Build light view to get stable axes + // Stable light-space axes (independent of center position) glm::vec3 up(0.0f, 0.0f, 1.0f); - // If sunDir is nearly parallel to up, pick a different up vector if (std::abs(glm::dot(sunDir, up)) > 0.99f) { up = glm::vec3(0.0f, 1.0f, 0.0f); } - glm::mat4 lightView = glm::lookAt(center - sunDir * kShadowLightDistance, center, up); + glm::vec3 lightRight = glm::normalize(glm::cross(sunDir, up)); + glm::vec3 lightUp = glm::normalize(glm::cross(lightRight, sunDir)); - // Stable texel snapping in light space removes movement shimmer. - glm::vec4 centerLS = lightView * glm::vec4(center, 1.0f); - centerLS.x = std::round(centerLS.x / texelWorld) * texelWorld; - centerLS.y = std::round(centerLS.y / texelWorld) * texelWorld; - glm::vec4 snappedCenter = glm::inverse(lightView) * centerLS; - center = glm::vec3(snappedCenter); + // Snap center along light's right and up axes to align with texel grid. + // This eliminates sub-texel shifts that cause shadow shimmer. 
+ float dotR = glm::dot(center, lightRight); + float dotU = glm::dot(center, lightUp); + dotR = std::floor(dotR / texelWorld) * texelWorld; + dotU = std::floor(dotU / texelWorld) * texelWorld; + float dotD = glm::dot(center, sunDir); // depth axis unchanged + center = lightRight * dotR + lightUp * dotU + sunDir * dotD; shadowCenter = center; - lightView = glm::lookAt(center - sunDir * kShadowLightDistance, center, up); + glm::mat4 lightView = glm::lookAt(center - sunDir * kShadowLightDistance, center, up); glm::mat4 lightProj = glm::ortho(-halfExtent, halfExtent, -halfExtent, halfExtent, kShadowNearPlane, kShadowFarPlane); lightProj[1][1] *= -1.0f; // Vulkan Y-flip for shadow pass @@ -3868,6 +5042,128 @@ void Renderer::setupWater1xPass() { vkCtx->getSwapchainImageViews(), depthView, vkCtx->getSwapchainExtent()); } +// ========================= Multithreaded Secondary Command Buffers ========================= + +bool Renderer::createSecondaryCommandResources() { + if (!vkCtx) return false; + VkDevice device = vkCtx->getDevice(); + uint32_t queueFamily = vkCtx->getGraphicsQueueFamily(); + + VkCommandPoolCreateInfo poolCI{}; + poolCI.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + poolCI.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + poolCI.queueFamilyIndex = queueFamily; + + // Create worker command pools (one per worker thread) + for (uint32_t w = 0; w < NUM_WORKERS; ++w) { + if (vkCreateCommandPool(device, &poolCI, nullptr, &workerCmdPools_[w]) != VK_SUCCESS) { + LOG_ERROR("Failed to create worker command pool ", w); + return false; + } + } + + // Create main-thread secondary command pool + if (vkCreateCommandPool(device, &poolCI, nullptr, &mainSecondaryCmdPool_) != VK_SUCCESS) { + LOG_ERROR("Failed to create main secondary command pool"); + return false; + } + + // Allocate secondary command buffers + VkCommandBufferAllocateInfo allocInfo{}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.level = 
VK_COMMAND_BUFFER_LEVEL_SECONDARY; + allocInfo.commandBufferCount = 1; + + // Worker secondaries: SEC_TERRAIN=1, SEC_WMO=2, SEC_M2=4 → worker pools 0,1,2 + const uint32_t workerSecondaries[] = { SEC_TERRAIN, SEC_WMO, SEC_M2 }; + for (uint32_t w = 0; w < NUM_WORKERS; ++w) { + allocInfo.commandPool = workerCmdPools_[w]; + for (uint32_t f = 0; f < MAX_FRAMES; ++f) { + if (vkAllocateCommandBuffers(device, &allocInfo, &secondaryCmds_[workerSecondaries[w]][f]) != VK_SUCCESS) { + LOG_ERROR("Failed to allocate worker secondary buffer w=", w, " f=", f); + return false; + } + } + } + + // Main-thread secondaries: SEC_SKY=0, SEC_CHARS=3, SEC_POST=5, SEC_IMGUI=6 + const uint32_t mainSecondaries[] = { SEC_SKY, SEC_CHARS, SEC_POST, SEC_IMGUI }; + for (uint32_t idx : mainSecondaries) { + allocInfo.commandPool = mainSecondaryCmdPool_; + for (uint32_t f = 0; f < MAX_FRAMES; ++f) { + if (vkAllocateCommandBuffers(device, &allocInfo, &secondaryCmds_[idx][f]) != VK_SUCCESS) { + LOG_ERROR("Failed to allocate main secondary buffer idx=", idx, " f=", f); + return false; + } + } + } + + parallelRecordingEnabled_ = true; + LOG_INFO("Multithreaded rendering: ", NUM_WORKERS, " worker threads, ", + NUM_SECONDARIES, " secondary buffers [ENABLED]"); + return true; +} + +void Renderer::destroySecondaryCommandResources() { + if (!vkCtx) return; + VkDevice device = vkCtx->getDevice(); + vkDeviceWaitIdle(device); + + // Secondary buffers are freed when their pool is destroyed + for (uint32_t w = 0; w < NUM_WORKERS; ++w) { + if (workerCmdPools_[w]) { + vkDestroyCommandPool(device, workerCmdPools_[w], nullptr); + workerCmdPools_[w] = VK_NULL_HANDLE; + } + } + if (mainSecondaryCmdPool_) { + vkDestroyCommandPool(device, mainSecondaryCmdPool_, nullptr); + mainSecondaryCmdPool_ = VK_NULL_HANDLE; + } + + for (auto& arr : secondaryCmds_) + for (auto& cmd : arr) + cmd = VK_NULL_HANDLE; + + parallelRecordingEnabled_ = false; +} + +VkCommandBuffer Renderer::beginSecondary(uint32_t secondaryIndex) { + uint32_t 
frame = vkCtx->getCurrentFrame(); + VkCommandBuffer cmd = secondaryCmds_[secondaryIndex][frame]; + + VkCommandBufferInheritanceInfo inheritInfo{}; + inheritInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO; + inheritInfo.renderPass = activeRenderPass_; + inheritInfo.subpass = 0; + inheritInfo.framebuffer = activeFramebuffer_; + + VkCommandBufferBeginInfo beginInfo{}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT + | VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT; + beginInfo.pInheritanceInfo = &inheritInfo; + + VkResult result = vkBeginCommandBuffer(cmd, &beginInfo); + if (result != VK_SUCCESS) { + LOG_ERROR("vkBeginCommandBuffer failed for secondary ", secondaryIndex, + " frame ", frame, " result=", static_cast(result)); + } + return cmd; +} + +void Renderer::setSecondaryViewportScissor(VkCommandBuffer cmd) { + VkViewport vp{}; + vp.width = static_cast(activeRenderExtent_.width); + vp.height = static_cast(activeRenderExtent_.height); + vp.maxDepth = 1.0f; + vkCmdSetViewport(cmd, 0, 1, &vp); + + VkRect2D sc{}; + sc.extent = activeRenderExtent_; + vkCmdSetScissor(cmd, 0, 1, &sc); +} + void Renderer::renderReflectionPass() { if (!waterRenderer || !camera || !waterRenderer->hasReflectionPass() || !waterRenderer->hasSurfaces()) return; if (currentCmd == VK_NULL_HANDLE || !reflPerFrameUBOMapped) return; diff --git a/src/rendering/terrain_manager.cpp b/src/rendering/terrain_manager.cpp index 97527c8c..e186ed96 100644 --- a/src/rendering/terrain_manager.cpp +++ b/src/rendering/terrain_manager.cpp @@ -199,13 +199,29 @@ void TerrainManager::update(const Camera& camera, float deltaTime) { currentTile = newTile; } - // Stream tiles if we've moved significantly or initial load + // Stream tiles when player crosses a tile boundary if (newTile.x != lastStreamTile.x || newTile.y != lastStreamTile.y) { LOG_DEBUG("Streaming: cam=(", camPos.x, ",", camPos.y, ",", camPos.z, ") tile=[", 
newTile.x, ",", newTile.y, "] loaded=", loadedTiles.size()); streamTiles(); lastStreamTile = newTile; + } else { + // Proactive loading: when workers are idle, periodically re-check for + // unloaded tiles within range. Throttled to avoid hitching right after + // world load when many tiles finalize simultaneously. + proactiveStreamTimer_ += deltaTime; + if (proactiveStreamTimer_ >= 2.0f) { + proactiveStreamTimer_ = 0.0f; + bool workersIdle; + { + std::lock_guard lock(queueMutex); + workersIdle = loadQueue.empty(); + } + if (workersIdle) { + streamTiles(); + } + } } } @@ -800,7 +816,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) { } bool allDone = terrainRenderer->loadTerrainIncremental( pending->mesh, pending->terrain.textures, x, y, - ft.terrainChunkNext, 32); + ft.terrainChunkNext, 16); if (!allDone) { return false; // More chunks remain — yield to time budget } @@ -830,11 +846,19 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) { } case FinalizationPhase::M2_MODELS: { - // Upload multiple M2 models per call (batched GPU uploads) + // Upload multiple M2 models per call (batched GPU uploads). + // When no more tiles are queued for background parsing, increase the + // per-frame budget so idle workers don't waste time waiting for the + // main thread to trickle-upload models. if (m2Renderer && ft.m2ModelIndex < pending->m2Models.size()) { // Set pre-decoded BLP cache so loadTexture() skips main-thread BLP decode m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures); - constexpr size_t kModelsPerStep = 4; + bool workersIdle; + { + std::lock_guard lk(queueMutex); + workersIdle = loadQueue.empty() && readyQueue.empty(); + } + const size_t kModelsPerStep = workersIdle ? 
6 : 4; size_t uploaded = 0; while (ft.m2ModelIndex < pending->m2Models.size() && uploaded < kModelsPerStep) { auto& m2Ready = pending->m2Models[ft.m2ModelIndex]; @@ -896,7 +920,12 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) { wmoRenderer->setPredecodedBLPCache(&pending->preloadedWMOTextures); wmoRenderer->setDeferNormalMaps(true); - constexpr size_t kWmosPerStep = 1; + bool wmoWorkersIdle; + { + std::lock_guard lk(queueMutex); + wmoWorkersIdle = loadQueue.empty() && readyQueue.empty(); + } + const size_t kWmosPerStep = wmoWorkersIdle ? 2 : 1; size_t uploaded = 0; while (ft.wmoModelIndex < pending->wmoModels.size() && uploaded < kWmosPerStep) { auto& wmoReady = pending->wmoModels[ft.wmoModelIndex]; @@ -911,6 +940,8 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) { wmoRenderer->setDeferNormalMaps(false); wmoRenderer->setPredecodedBLPCache(nullptr); if (ft.wmoModelIndex < pending->wmoModels.size()) return false; + // All WMO models loaded — backfill normal/height maps that were skipped during streaming + wmoRenderer->backfillNormalMaps(); } ft.phase = FinalizationPhase::WMO_INSTANCES; return false; @@ -1176,7 +1207,7 @@ void TerrainManager::processReadyTiles() { // Async upload batch: record GPU copies into a command buffer, submit with // a fence, but DON'T wait. The fence is polled on subsequent frames. // This eliminates the main-thread stall from vkWaitForFences entirely. - const int maxSteps = taxiStreamingMode_ ? 8 : 2; + const int maxSteps = taxiStreamingMode_ ? 
4 : 1; int steps = 0; if (vkCtx) vkCtx->beginUploadBatch(); diff --git a/src/rendering/vk_context.cpp b/src/rendering/vk_context.cpp index 79e7eac3..dc4144fa 100644 --- a/src/rendering/vk_context.cpp +++ b/src/rendering/vk_context.cpp @@ -252,14 +252,22 @@ bool VkContext::createAllocator() { bool VkContext::createSwapchain(int width, int height) { vkb::SwapchainBuilder swapchainBuilder{physicalDevice, device, surface}; - auto swapRet = swapchainBuilder + auto& builder = swapchainBuilder .set_desired_format({VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR}) - .set_desired_present_mode(VK_PRESENT_MODE_FIFO_KHR) // VSync .set_desired_extent(static_cast(width), static_cast(height)) .set_image_usage_flags(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT) .set_desired_min_image_count(2) - .set_old_swapchain(swapchain) // For recreation - .build(); + .set_old_swapchain(swapchain); + + if (vsync_) { + builder.set_desired_present_mode(VK_PRESENT_MODE_FIFO_KHR); + } else { + builder.set_desired_present_mode(VK_PRESENT_MODE_IMMEDIATE_KHR); + builder.add_fallback_present_mode(VK_PRESENT_MODE_MAILBOX_KHR); + builder.add_fallback_present_mode(VK_PRESENT_MODE_FIFO_RELAXED_KHR); + } + + auto swapRet = builder.build(); if (!swapRet) { LOG_ERROR("Failed to create Vulkan swapchain: ", swapRet.error().message()); @@ -1026,14 +1034,22 @@ bool VkContext::recreateSwapchain(int width, int height) { VkSwapchainKHR oldSwapchain = swapchain; vkb::SwapchainBuilder swapchainBuilder{physicalDevice, device, surface}; - auto swapRet = swapchainBuilder + auto& builder = swapchainBuilder .set_desired_format({VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR}) - .set_desired_present_mode(VK_PRESENT_MODE_FIFO_KHR) .set_desired_extent(static_cast(width), static_cast(height)) .set_image_usage_flags(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT) .set_desired_min_image_count(2) - .set_old_swapchain(oldSwapchain) - .build(); + 
.set_old_swapchain(oldSwapchain); + + if (vsync_) { + builder.set_desired_present_mode(VK_PRESENT_MODE_FIFO_KHR); + } else { + builder.set_desired_present_mode(VK_PRESENT_MODE_IMMEDIATE_KHR); + builder.add_fallback_present_mode(VK_PRESENT_MODE_MAILBOX_KHR); + builder.add_fallback_present_mode(VK_PRESENT_MODE_FIFO_RELAXED_KHR); + } + + auto swapRet = builder.build(); if (oldSwapchain) { vkDestroySwapchainKHR(device, oldSwapchain, nullptr); diff --git a/src/rendering/wmo_renderer.cpp b/src/rendering/wmo_renderer.cpp index 5dec0e3e..51d8c2a2 100644 --- a/src/rendering/wmo_renderer.cpp +++ b/src/rendering/wmo_renderer.cpp @@ -48,6 +48,11 @@ size_t envSizeOrDefault(const char* name, size_t defValue) { } } // namespace +// Thread-local scratch buffers for collision queries (allows concurrent getFloorHeight/checkWallCollision calls) +static thread_local std::vector tl_candidateScratch; +static thread_local std::vector tl_triScratch; +static thread_local std::unordered_set tl_candidateIdScratch; + static void transformAABB(const glm::mat4& modelMatrix, const glm::vec3& localMin, const glm::vec3& localMax, @@ -787,8 +792,8 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) { } // Build doodad's local transform (WoW coordinates) - // WMO doodads use quaternion rotation (X/Y swapped for correct orientation) - glm::quat fixedRotation(doodad.rotation.w, doodad.rotation.y, doodad.rotation.x, doodad.rotation.z); + // WMO doodads use quaternion rotation + glm::quat fixedRotation(doodad.rotation.w, doodad.rotation.x, doodad.rotation.y, doodad.rotation.z); glm::mat4 localTransform(1.0f); localTransform = glm::translate(localTransform, doodad.position); @@ -1288,7 +1293,7 @@ void WMORenderer::rebuildSpatialIndex() { void WMORenderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& queryMax, std::vector& outIndices) const { outIndices.clear(); - candidateIdScratch.clear(); + tl_candidateIdScratch.clear(); GridCell minCell = toCell(queryMin); 
GridCell maxCell = toCell(queryMax); @@ -1298,7 +1303,7 @@ void WMORenderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& q auto it = spatialGrid.find(GridCell{x, y, z}); if (it == spatialGrid.end()) continue; for (uint32_t id : it->second) { - if (!candidateIdScratch.insert(id).second) continue; + if (!tl_candidateIdScratch.insert(id).second) continue; auto idxIt = instanceIndexById.find(id); if (idxIt != instanceIndexById.end()) { outIndices.push_back(idxIt->second); @@ -1318,15 +1323,10 @@ void WMORenderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& q } } -void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera) { +void WMORenderer::prepareRender() { ++currentFrameId; - if (!opaquePipeline_ || instances.empty()) { - lastDrawCalls = 0; - return; - } - - // Update material UBOs if settings changed + // Update material UBOs if settings changed (mapped memory writes — main thread only) if (materialSettingsDirty_) { materialSettingsDirty_ = false; static const int pomSampleTable[] = { 16, 32, 64 }; @@ -1335,7 +1335,6 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const for (auto& group : model.groups) { for (auto& mb : group.mergedBatches) { if (!mb.materialUBO) continue; - // Read existing UBO data, update normal/POM fields VmaAllocationInfo allocInfo{}; vmaGetAllocationInfo(vkCtx_->getAllocator(), mb.materialUBOAlloc, &allocInfo); if (allocInfo.pMappedData) { @@ -1351,6 +1350,13 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const } } } +} + +void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera) { + if (!opaquePipeline_ || instances.empty()) { + lastDrawCalls = 0; + return; + } lastDrawCalls = 0; @@ -1362,43 +1368,45 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const lastPortalCulledGroups = 0; lastDistanceCulledGroups = 0; - // ── Phase 1: Parallel 
visibility culling ────────────────────────── - std::vector visibleInstances; - visibleInstances.reserve(instances.size()); + // ── Phase 1: Visibility culling ────────────────────────── + visibleInstances_.clear(); for (size_t i = 0; i < instances.size(); ++i) { - const auto& instance = instances[i]; - if (loadedModels.find(instance.modelId) == loadedModels.end()) - continue; - visibleInstances.push_back(i); + if (loadedModels.count(instances[i].modelId)) + visibleInstances_.push_back(i); } glm::vec3 camPos = camera.getPosition(); bool doPortalCull = portalCulling; - bool doFrustumCull = false; // Temporarily disabled: can over-cull world WMOs bool doDistanceCull = distanceCulling; - auto cullInstance = [&](size_t instIdx) -> InstanceDrawList { - if (instIdx >= instances.size()) return InstanceDrawList{}; + auto cullInstance = [&](size_t instIdx, InstanceDrawList& result) { + if (instIdx >= instances.size()) return; const auto& instance = instances[instIdx]; auto mdlIt = loadedModels.find(instance.modelId); - if (mdlIt == loadedModels.end()) return InstanceDrawList{}; + if (mdlIt == loadedModels.end()) return; const ModelData& model = mdlIt->second; - InstanceDrawList result; result.instanceIndex = instIdx; + result.visibleGroups.clear(); + result.portalCulled = 0; + result.distanceCulled = 0; - // Portal-based visibility - std::unordered_set portalVisibleGroups; + // Portal-based visibility — use a flat sorted vector instead of unordered_set + std::vector portalVisibleGroups; bool usePortalCulling = doPortalCull && !model.portals.empty() && !model.portalRefs.empty(); if (usePortalCulling) { + std::unordered_set pvgSet; glm::vec4 localCamPos = instance.invModelMatrix * glm::vec4(camPos, 1.0f); getVisibleGroupsViaPortals(model, glm::vec3(localCamPos), frustum, - instance.modelMatrix, portalVisibleGroups); + instance.modelMatrix, pvgSet); + portalVisibleGroups.assign(pvgSet.begin(), pvgSet.end()); + std::sort(portalVisibleGroups.begin(), portalVisibleGroups.end()); 
} for (size_t gi = 0; gi < model.groups.size(); ++gi) { if (usePortalCulling && - portalVisibleGroups.find(static_cast(gi)) == portalVisibleGroups.end()) { + !std::binary_search(portalVisibleGroups.begin(), portalVisibleGroups.end(), + static_cast(gi))) { result.portalCulled++; continue; } @@ -1414,62 +1422,18 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const continue; } } - - if (doFrustumCull && !frustum.intersectsAABB(gMin, gMax)) - continue; } result.visibleGroups.push_back(static_cast(gi)); } - return result; }; - // Dispatch culling — parallel when enough instances, sequential otherwise. - std::vector drawLists; - drawLists.reserve(visibleInstances.size()); + // Resize drawLists to match (reuses previous capacity) + drawLists_.resize(visibleInstances_.size()); - static const size_t minParallelCullInstances = std::max( - 4, envSizeOrDefault("WOWEE_WMO_CULL_MT_MIN", 128)); - if (visibleInstances.size() >= minParallelCullInstances && numCullThreads_ > 1) { - static const size_t minCullWorkPerThread = std::max( - 16, envSizeOrDefault("WOWEE_WMO_CULL_WORK_PER_THREAD", 64)); - const size_t maxUsefulThreads = std::max( - 1, (visibleInstances.size() + minCullWorkPerThread - 1) / minCullWorkPerThread); - const size_t numThreads = std::min(static_cast(numCullThreads_), maxUsefulThreads); - if (numThreads <= 1) { - for (size_t idx : visibleInstances) { - drawLists.push_back(cullInstance(idx)); - } - } else { - const size_t chunkSize = visibleInstances.size() / numThreads; - const size_t remainder = visibleInstances.size() % numThreads; - - drawLists.resize(visibleInstances.size()); - - cullFutures_.clear(); - if (cullFutures_.capacity() < numThreads) { - cullFutures_.reserve(numThreads); - } - - size_t start = 0; - for (size_t t = 0; t < numThreads; ++t) { - const size_t end = start + chunkSize + (t < remainder ? 
1 : 0); - cullFutures_.push_back(std::async(std::launch::async, - [&, start, end]() { - for (size_t j = start; j < end; ++j) { - drawLists[j] = cullInstance(visibleInstances[j]); - } - })); - start = end; - } - - for (auto& f : cullFutures_) { - f.get(); - } - } - } else { - for (size_t idx : visibleInstances) - drawLists.push_back(cullInstance(idx)); + // Sequential culling (parallel dispatch overhead > savings for typical instance counts) + for (size_t j = 0; j < visibleInstances_.size(); ++j) { + cullInstance(visibleInstances_[j], drawLists_[j]); } // ── Phase 2: Vulkan draw ──────────────────────────────── @@ -1484,7 +1448,7 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const // Track which pipeline is currently bound: 0=opaque, 1=transparent, 2=glass int currentPipelineKind = 0; - for (const auto& dl : drawLists) { + for (const auto& dl : drawLists_) { if (dl.instanceIndex >= instances.size()) continue; const auto& instance = instances[dl.instanceIndex]; auto modelIt = loadedModels.find(instance.modelId); @@ -2412,6 +2376,69 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) { return rawPtr; } +void WMORenderer::backfillNormalMaps() { + if (!normalMappingEnabled_ && !pomEnabled_) return; + + if (!assetManager) return; + + int generated = 0; + for (auto& [key, entry] : textureCache) { + if (entry.normalHeightMap) continue; // already has one + if (!entry.texture) continue; + + // Re-load the BLP from MPQ to get pixel data for normal map generation + pipeline::BLPImage blp = assetManager->loadTexture(key); + if (!blp.isValid() || blp.width == 0 || blp.height == 0) continue; + + float variance = 0.0f; + auto nhMap = generateNormalHeightMap(blp.data.data(), blp.width, blp.height, variance); + if (nhMap) { + entry.normalHeightMap = std::move(nhMap); + entry.heightMapVariance = variance; + generated++; + } + } + + if (generated > 0) { + VkDevice device = vkCtx_->getDevice(); + int rebound = 0; + // Update merged batches: 
assign normal map pointer and rebind descriptor set + for (auto& [modelId, model] : loadedModels) { + for (auto& group : model.groups) { + for (auto& mb : group.mergedBatches) { + if (mb.normalHeightMap) continue; // already set + if (!mb.texture) continue; + // Find this texture in the cache + for (const auto& [cacheKey, cacheEntry] : textureCache) { + if (cacheEntry.texture.get() == mb.texture) { + if (cacheEntry.normalHeightMap) { + mb.normalHeightMap = cacheEntry.normalHeightMap.get(); + mb.heightMapVariance = cacheEntry.heightMapVariance; + // Rebind descriptor set binding 2 to the real normal/height map + if (mb.materialSet) { + VkDescriptorImageInfo nhImgInfo = mb.normalHeightMap->descriptorInfo(); + VkWriteDescriptorSet write{}; + write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + write.dstSet = mb.materialSet; + write.dstBinding = 2; + write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + write.descriptorCount = 1; + write.pImageInfo = &nhImgInfo; + vkUpdateDescriptorSets(device, 1, &write, 0, nullptr); + rebound++; + } + } + break; + } + } + } + } + } + materialSettingsDirty_ = true; + LOG_INFO("Backfilled ", generated, " normal/height maps (", rebound, " descriptor sets rebound) for deferred WMO textures"); + } +} + // Ray-AABB intersection (slab method) // Returns true if the ray intersects the axis-aligned bounding box static bool rayIntersectsAABB(const glm::vec3& origin, const glm::vec3& dir, @@ -2808,9 +2835,9 @@ std::optional WMORenderer::getFloorHeight(float glX, float glY, float glZ group.getTrianglesInRange( localOrigin.x - 1.0f, localOrigin.y - 1.0f, localOrigin.x + 1.0f, localOrigin.y + 1.0f, - triScratch_); + tl_triScratch); - for (uint32_t triStart : triScratch_) { + for (uint32_t triStart : tl_triScratch) { const glm::vec3& v0 = verts[indices[triStart]]; const glm::vec3& v1 = verts[indices[triStart + 1]]; const glm::vec3& v2 = verts[indices[triStart + 2]]; @@ -2884,9 +2911,9 @@ std::optional 
WMORenderer::getFloorHeight(float glX, float glY, float glZ // early-returned because overlapping WMO instances need full coverage). glm::vec3 queryMin(glX - 2.0f, glY - 2.0f, glZ - 8.0f); glm::vec3 queryMax(glX + 2.0f, glY + 2.0f, glZ + 10.0f); - gatherCandidates(queryMin, queryMax, candidateScratch); + gatherCandidates(queryMin, queryMax, tl_candidateScratch); - for (size_t idx : candidateScratch) { + for (size_t idx : tl_candidateScratch) { const auto& instance = instances[idx]; if (collisionFocusEnabled && pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) { @@ -3059,9 +3086,9 @@ bool WMORenderer::checkWallCollision(const glm::vec3& from, const glm::vec3& to, glm::vec3 queryMin = glm::min(from, to) - glm::vec3(8.0f, 8.0f, 5.0f); glm::vec3 queryMax = glm::max(from, to) + glm::vec3(8.0f, 8.0f, 5.0f); - gatherCandidates(queryMin, queryMax, candidateScratch); + gatherCandidates(queryMin, queryMax, tl_candidateScratch); - for (size_t idx : candidateScratch) { + for (size_t idx : tl_candidateScratch) { const auto& instance = instances[idx]; if (collisionFocusEnabled && pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) { @@ -3127,9 +3154,9 @@ bool WMORenderer::checkWallCollision(const glm::vec3& from, const glm::vec3& to, float rangeMinY = std::min(localFrom.y, localTo.y) - PLAYER_RADIUS - 1.5f; float rangeMaxX = std::max(localFrom.x, localTo.x) + PLAYER_RADIUS + 1.5f; float rangeMaxY = std::max(localFrom.y, localTo.y) + PLAYER_RADIUS + 1.5f; - group.getTrianglesInRange(rangeMinX, rangeMinY, rangeMaxX, rangeMaxY, triScratch_); + group.getTrianglesInRange(rangeMinX, rangeMinY, rangeMaxX, rangeMaxY, tl_triScratch); - for (uint32_t triStart : triScratch_) { + for (uint32_t triStart : tl_triScratch) { // Use pre-computed Z bounds for fast vertical reject const auto& tb = group.triBounds[triStart / 3]; @@ -3145,18 +3172,13 @@ bool 
WMORenderer::checkWallCollision(const glm::vec3& from, const glm::vec3& to, if (triHeight < 1.0f && tb.maxZ <= localFeetZ + 1.2f) continue; // Use MOPY flags to filter wall collision. - // Collidable triangles (flag 0x01) block the player — including - // invisible collision walls (0x01 without 0x20) used in tunnels. - // Skip detail/decorative geometry (0x04) and render-only surfaces. + // Collide with triangles that have the collision flag (0x08) or no flags at all. + // Skip detail/decorative (0x04) and render-only (0x20 without 0x08) surfaces. uint32_t triIdx = triStart / 3; if (!group.triMopyFlags.empty() && triIdx < group.triMopyFlags.size()) { uint8_t mopy = group.triMopyFlags[triIdx]; if (mopy != 0) { - bool collidable = (mopy & 0x01) != 0; - bool detail = (mopy & 0x04) != 0; - if (!collidable || detail) { - continue; - } + if ((mopy & 0x04) || !(mopy & 0x08)) continue; } } @@ -3217,8 +3239,8 @@ bool WMORenderer::checkWallCollision(const glm::vec3& from, const glm::vec3& to, if (absNz >= 0.35f) continue; const float SKIN = 0.005f; // small separation so we don't re-collide immediately - // Stronger push when inside WMO for more responsive indoor collision - const float MAX_PUSH = insideWMO ? 
0.35f : 0.15f; + // Push must cover full penetration to prevent gradual clip-through + const float MAX_PUSH = PLAYER_RADIUS; float penetration = (PLAYER_RADIUS - horizDist); float pushDist = glm::clamp(penetration + SKIN, 0.0f, MAX_PUSH); glm::vec2 pushDir2; @@ -3302,9 +3324,9 @@ void WMORenderer::updateActiveGroup(float glX, float glY, float glZ) { glm::vec3 queryMin(glX - 0.5f, glY - 0.5f, glZ - 0.5f); glm::vec3 queryMax(glX + 0.5f, glY + 0.5f, glZ + 0.5f); - gatherCandidates(queryMin, queryMax, candidateScratch); + gatherCandidates(queryMin, queryMax, tl_candidateScratch); - for (size_t idx : candidateScratch) { + for (size_t idx : tl_candidateScratch) { const auto& instance = instances[idx]; if (glX < instance.worldBoundsMin.x || glX > instance.worldBoundsMax.x || glY < instance.worldBoundsMin.y || glY > instance.worldBoundsMax.y || @@ -3348,9 +3370,9 @@ bool WMORenderer::isInsideWMO(float glX, float glY, float glZ, uint32_t* outMode QueryTimer timer(&queryTimeMs, &queryCallCount); glm::vec3 queryMin(glX - 0.5f, glY - 0.5f, glZ - 0.5f); glm::vec3 queryMax(glX + 0.5f, glY + 0.5f, glZ + 0.5f); - gatherCandidates(queryMin, queryMax, candidateScratch); + gatherCandidates(queryMin, queryMax, tl_candidateScratch); - for (size_t idx : candidateScratch) { + for (size_t idx : tl_candidateScratch) { const auto& instance = instances[idx]; if (collisionFocusEnabled && pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) { @@ -3397,9 +3419,9 @@ bool WMORenderer::isInsideWMO(float glX, float glY, float glZ, uint32_t* outMode bool WMORenderer::isInsideInteriorWMO(float glX, float glY, float glZ) const { glm::vec3 queryMin(glX - 0.5f, glY - 0.5f, glZ - 0.5f); glm::vec3 queryMax(glX + 0.5f, glY + 0.5f, glZ + 0.5f); - gatherCandidates(queryMin, queryMax, candidateScratch); + gatherCandidates(queryMin, queryMax, tl_candidateScratch); - for (size_t idx : candidateScratch) { + for (size_t idx : tl_candidateScratch) { 
const auto& instance = instances[idx]; if (collisionFocusEnabled && pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) { @@ -3453,9 +3475,9 @@ float WMORenderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3 glm::vec3 rayEnd = origin + direction * maxDistance; glm::vec3 queryMin = glm::min(origin, rayEnd) - glm::vec3(1.0f); glm::vec3 queryMax = glm::max(origin, rayEnd) + glm::vec3(1.0f); - gatherCandidates(queryMin, queryMax, candidateScratch); + gatherCandidates(queryMin, queryMax, tl_candidateScratch); - for (size_t idx : candidateScratch) { + for (size_t idx : tl_candidateScratch) { const auto& instance = instances[idx]; if (collisionFocusEnabled && pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) { @@ -3509,9 +3531,9 @@ float WMORenderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3 float rMinY = std::min(localOrigin.y, localEnd.y) - 1.0f; float rMaxX = std::max(localOrigin.x, localEnd.x) + 1.0f; float rMaxY = std::max(localOrigin.y, localEnd.y) + 1.0f; - group.getWallTrianglesInRange(rMinX, rMinY, rMaxX, rMaxY, triScratch_); + group.getWallTrianglesInRange(rMinX, rMinY, rMaxX, rMaxY, tl_triScratch); - for (uint32_t triStart : triScratch_) { + for (uint32_t triStart : tl_triScratch) { const glm::vec3& v0 = verts[indices[triStart]]; const glm::vec3& v1 = verts[indices[triStart + 1]]; const glm::vec3& v2 = verts[indices[triStart + 2]]; diff --git a/src/ui/game_screen.cpp b/src/ui/game_screen.cpp index 3f1c0eb9..eab00305 100644 --- a/src/ui/game_screen.cpp +++ b/src/ui/game_screen.cpp @@ -317,6 +317,20 @@ void GameScreen::render(game::GameHandler& gameHandler) { } } + // Apply saved FSR setting once when renderer is available + if (!fsrSettingsApplied_ && pendingFSR) { + auto* renderer = core::Application::getInstance().getRenderer(); + if (renderer) { + static const float fsrScales[] = { 0.77f, 0.67f, 
0.59f, 0.50f }; + renderer->setFSRQuality(fsrScales[pendingFSRQuality]); + renderer->setFSRSharpness(pendingFSRSharpness); + renderer->setFSREnabled(true); + fsrSettingsApplied_ = true; + } + } else { + fsrSettingsApplied_ = true; + } + // Apply auto-loot setting to GameHandler every frame (cheap bool sync) gameHandler.setAutoLoot(pendingAutoLoot); @@ -2687,6 +2701,12 @@ void GameScreen::sendChatMessage(game::GameHandler& gameHandler) { chatInputBuffer[0] = '\0'; return; } + // /unstuckhearth command — teleport to hearthstone bind point + if (cmdLower == "unstuckhearth") { + gameHandler.unstuckHearth(); + chatInputBuffer[0] = '\0'; + return; + } // /transport board — board test transport if (cmdLower == "transport board") { @@ -6250,7 +6270,7 @@ void GameScreen::renderSettingsWindow() { if (pendingShadows) { ImGui::SameLine(); ImGui::SetNextItemWidth(150.0f); - if (ImGui::SliderFloat("Distance##shadow", &pendingShadowDistance, 40.0f, 200.0f, "%.0f")) { + if (ImGui::SliderFloat("Distance##shadow", &pendingShadowDistance, 40.0f, 500.0f, "%.0f")) { if (renderer) renderer->setShadowDistance(pendingShadowDistance); saveSettings(); } @@ -6261,7 +6281,13 @@ void GameScreen::renderSettingsWindow() { } { const char* aaLabels[] = { "Off", "2x MSAA", "4x MSAA", "8x MSAA" }; - if (ImGui::Combo("Anti-Aliasing", &pendingAntiAliasing, aaLabels, 4)) { + bool fsr2Active = renderer && renderer->isFSR2Enabled(); + if (fsr2Active) { + ImGui::BeginDisabled(); + int disabled = 0; + ImGui::Combo("Anti-Aliasing (FSR2)", &disabled, "Off (FSR2 active)\0", 1); + ImGui::EndDisabled(); + } else if (ImGui::Combo("Anti-Aliasing", &pendingAntiAliasing, aaLabels, 4)) { static const VkSampleCountFlagBits aaSamples[] = { VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_2_BIT, VK_SAMPLE_COUNT_4_BIT, VK_SAMPLE_COUNT_8_BIT @@ -6270,6 +6296,33 @@ void GameScreen::renderSettingsWindow() { saveSettings(); } } + // FSR Upscaling + { + // FSR mode selection: Off, FSR 1.0 (Spatial), FSR 2.2 (Temporal) + const char* 
fsrModeLabels[] = { "Off", "FSR 1.0 (Spatial)", "FSR 2.2 (Temporal)" }; + int fsrMode = pendingFSR ? 1 : 0; + if (renderer && renderer->isFSR2Enabled()) fsrMode = 2; + if (ImGui::Combo("Upscaling", &fsrMode, fsrModeLabels, 3)) { + pendingFSR = (fsrMode == 1); + if (renderer) { + renderer->setFSREnabled(fsrMode == 1); + renderer->setFSR2Enabled(fsrMode == 2); + } + saveSettings(); + } + if (fsrMode > 0) { + const char* fsrQualityLabels[] = { "Ultra Quality (77%)", "Quality (67%)", "Balanced (59%)", "Performance (50%)" }; + static const float fsrScaleFactors[] = { 0.77f, 0.67f, 0.59f, 0.50f }; + if (ImGui::Combo("FSR Quality", &pendingFSRQuality, fsrQualityLabels, 4)) { + if (renderer) renderer->setFSRQuality(fsrScaleFactors[pendingFSRQuality]); + saveSettings(); + } + if (ImGui::SliderFloat("FSR Sharpness", &pendingFSRSharpness, 0.0f, 2.0f, "%.1f")) { + if (renderer) renderer->setFSRSharpness(pendingFSRSharpness); + saveSettings(); + } + } + } if (ImGui::SliderInt("Ground Clutter Density", &pendingGroundClutterDensity, 0, 150, "%d%%")) { if (renderer) { if (auto* tm = renderer->getTerrainManager()) { @@ -6348,7 +6401,7 @@ void GameScreen::renderSettingsWindow() { pendingFullscreen = kDefaultFullscreen; pendingVsync = kDefaultVsync; pendingShadows = kDefaultShadows; - pendingShadowDistance = 72.0f; + pendingShadowDistance = 300.0f; pendingGroundClutterDensity = kDefaultGroundClutterDensity; pendingAntiAliasing = 0; pendingNormalMapping = true; @@ -7384,6 +7437,9 @@ void GameScreen::saveSettings() { out << "normal_map_strength=" << pendingNormalMapStrength << "\n"; out << "pom=" << (pendingPOM ? 1 : 0) << "\n"; out << "pom_quality=" << pendingPOMQuality << "\n"; + out << "fsr=" << (pendingFSR ? 
1 : 0) << "\n"; + out << "fsr_quality=" << pendingFSRQuality << "\n"; + out << "fsr_sharpness=" << pendingFSRSharpness << "\n"; // Controls out << "mouse_sensitivity=" << pendingMouseSensitivity << "\n"; @@ -7463,13 +7519,16 @@ void GameScreen::loadSettings() { else if (key == "auto_loot") pendingAutoLoot = (std::stoi(val) != 0); else if (key == "ground_clutter_density") pendingGroundClutterDensity = std::clamp(std::stoi(val), 0, 150); else if (key == "shadows") pendingShadows = (std::stoi(val) != 0); - else if (key == "shadow_distance") pendingShadowDistance = std::clamp(std::stof(val), 40.0f, 200.0f); + else if (key == "shadow_distance") pendingShadowDistance = std::clamp(std::stof(val), 40.0f, 500.0f); else if (key == "water_refraction") pendingWaterRefraction = (std::stoi(val) != 0); else if (key == "antialiasing") pendingAntiAliasing = std::clamp(std::stoi(val), 0, 3); else if (key == "normal_mapping") pendingNormalMapping = (std::stoi(val) != 0); else if (key == "normal_map_strength") pendingNormalMapStrength = std::clamp(std::stof(val), 0.0f, 2.0f); else if (key == "pom") pendingPOM = (std::stoi(val) != 0); else if (key == "pom_quality") pendingPOMQuality = std::clamp(std::stoi(val), 0, 2); + else if (key == "fsr") pendingFSR = (std::stoi(val) != 0); + else if (key == "fsr_quality") pendingFSRQuality = std::clamp(std::stoi(val), 0, 3); + else if (key == "fsr_sharpness") pendingFSRSharpness = std::clamp(std::stof(val), 0.0f, 2.0f); // Controls else if (key == "mouse_sensitivity") pendingMouseSensitivity = std::clamp(std::stof(val), 0.05f, 1.0f); else if (key == "invert_mouse") pendingInvertMouse = (std::stoi(val) != 0);