FSR2 temporal upscaling fixes: unjittered reprojection, sharpen Y-flip, MSAA guard, descriptor double-buffering

- Motion vectors: use a single unjittered reprojection matrix (80 bytes) instead of two jittered matrices (160 bytes), eliminating the numerical instability caused by jitter being amplified through large world coordinates
- Sharpen pass: fix the Y-flip for correct UV sampling; double-buffer descriptor sets to avoid a race with in-flight command buffers
- MSAA: auto-disable when FSR2 is enabled; grey out the AA setting in the UI
- Accumulation: variance-based neighborhood clamping in YCoCg space; correct history layout transitions
- Frame index: wrap at 256 for a stable Halton sequence
Implement FSR 2.2 temporal upscaling
2026-03-22 23:30:14 +00:00 · 2026-03-08 01:22:15 -08:00 · 2026-03-07 23:13:01 -08:00 · 2026-03-07 23:02:25 -08:00 · 2026-03-07 22:55:02 -08:00 · 2026-03-07 22:51:59 -08:00
37 changed files with 2840 additions and 586 deletions
--- a/assets/shaders/fsr2_accumulate.comp.glsl
+++ b/assets/shaders/fsr2_accumulate.comp.glsl
@ -0,0 +1,85 @@
+#version 450
+
+layout(local_size_x = 8, local_size_y = 8) in;
+
+layout(set = 0, binding = 0) uniform sampler2D sceneColor;
+layout(set = 0, binding = 1) uniform sampler2D depthBuffer;
+layout(set = 0, binding = 2) uniform sampler2D motionVectors;
+layout(set = 0, binding = 3) uniform sampler2D historyInput;
+layout(set = 0, binding = 4, rgba16f) uniform writeonly image2D historyOutput;
+
+layout(push_constant) uniform PushConstants {
+    vec4 internalSize;   // xy = internal resolution, zw = 1/internal
+    vec4 displaySize;    // xy = display resolution, zw = 1/display
+    vec4 jitterOffset;   // xy = current jitter (NDC-space), zw = unused
+    vec4 params;         // x = resetHistory (1=reset), y = sharpness, zw = unused
+} pc;
+
+// Converts an RGB color to YCoCg; the result is packed as (Y, Co, Cg).
+vec3 rgbToYCoCg(vec3 rgb) {
+    return vec3(
+        0.25 * rgb.r + 0.5 * rgb.g + 0.25 * rgb.b,    // Y
+        0.5  * rgb.r                - 0.5  * rgb.b,   // Co
+        -0.25 * rgb.r + 0.5 * rgb.g - 0.25 * rgb.b);  // Cg
+}
+
+// Converts a YCoCg color (x = Y, y = Co, z = Cg) back to RGB.
+vec3 yCoCgToRgb(vec3 ycocg) {
+    float red   = ycocg.x + ycocg.y - ycocg.z;
+    float green = ycocg.x + ycocg.z;
+    float blue  = ycocg.x - ycocg.y - ycocg.z;
+    return vec3(red, green, blue);
+}
+
+// Temporal accumulation pass, one invocation per display-resolution pixel.
+// Reprojects the previous history through the motion vector, clamps it to the
+// current frame's local color statistics, blends it with the current color and
+// writes the result to historyOutput.
+void main() {
+    ivec2 outPixel = ivec2(gl_GlobalInvocationID.xy);
+    ivec2 outSize = ivec2(pc.displaySize.xy);
+    // Invocations outside the output image have nothing to write.
+    if (outPixel.x >= outSize.x || outPixel.y >= outSize.y) return;
+
+    vec2 outUV = (vec2(outPixel) + 0.5) * pc.displaySize.zw;
+    vec3 currentColor = texture(sceneColor, outUV).rgb;
+
+    // History reset requested: seed the history with the current frame only.
+    if (pc.params.x > 0.5) {
+        imageStore(historyOutput, outPixel, vec4(currentColor, 1.0));
+        return;
+    }
+
+    vec2 motion = texture(motionVectors, outUV).rg;
+    vec2 historyUV = outUV + motion;
+
+    // Phrased as "inside [0,1]" tests so that NaN/Inf coordinates (every
+    // comparison with NaN is false) are classified as invalid too.
+    bool historyValid = historyUV.x >= 0.0 && historyUV.x <= 1.0 &&
+                        historyUV.y >= 0.0 && historyUV.y <= 1.0;
+
+    // Without usable history the blend weight would be 1.0 anyway. Writing the
+    // current color directly gives the same result for finite inputs, but it
+    // also keeps a NaN/Inf history sample from leaking through mix() (NaN * 0
+    // is NaN) and poisoning the history buffer for all following frames.
+    if (!historyValid) {
+        imageStore(historyOutput, outPixel, vec4(currentColor, 1.0));
+        return;
+    }
+
+    vec3 historyColor = texture(historyInput, historyUV).rgb;
+
+    // Neighborhood clamping in YCoCg space: 3x3 taps spaced one internal-
+    // resolution texel apart around the current sample.
+    vec2 texelSize = pc.internalSize.zw;
+    vec3 s0 = rgbToYCoCg(currentColor);
+    vec3 s1 = rgbToYCoCg(texture(sceneColor, outUV + vec2(-texelSize.x, 0.0)).rgb);
+    vec3 s2 = rgbToYCoCg(texture(sceneColor, outUV + vec2( texelSize.x, 0.0)).rgb);
+    vec3 s3 = rgbToYCoCg(texture(sceneColor, outUV + vec2(0.0, -texelSize.y)).rgb);
+    vec3 s4 = rgbToYCoCg(texture(sceneColor, outUV + vec2(0.0,  texelSize.y)).rgb);
+    vec3 s5 = rgbToYCoCg(texture(sceneColor, outUV + vec2(-texelSize.x, -texelSize.y)).rgb);
+    vec3 s6 = rgbToYCoCg(texture(sceneColor, outUV + vec2( texelSize.x, -texelSize.y)).rgb);
+    vec3 s7 = rgbToYCoCg(texture(sceneColor, outUV + vec2(-texelSize.x,  texelSize.y)).rgb);
+    vec3 s8 = rgbToYCoCg(texture(sceneColor, outUV + vec2( texelSize.x,  texelSize.y)).rgb);
+
+    // First and second moments give the per-channel mean and standard deviation.
+    vec3 m1 = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8;
+    vec3 m2 = s0*s0 + s1*s1 + s2*s2 + s3*s3 + s4*s4 + s5*s5 + s6*s6 + s7*s7 + s8*s8;
+    vec3 mean = m1 / 9.0;
+    // max() guards against a slightly negative variance from rounding.
+    vec3 variance = max(m2 / 9.0 - mean * mean, vec3(0.0));
+    vec3 stddev = sqrt(variance);
+
+    // Clamp box: mean +/- gamma standard deviations.
+    float gamma = 1.5;
+    vec3 boxMin = mean - gamma * stddev;
+    vec3 boxMax = mean + gamma * stddev;
+
+    vec3 historyYCoCg = rgbToYCoCg(historyColor);
+    vec3 clampedHistory = clamp(historyYCoCg, boxMin, boxMax);
+    historyColor = yCoCgToRgb(clampedHistory);
+
+    // The further the history had to be pulled into the box, the more weight
+    // the current frame gets (0.05 when untouched, up to 0.30).
+    float clampDist = length(historyYCoCg - clampedHistory);
+    float blendFactor = mix(0.05, 0.30, clamp(clampDist * 2.0, 0.0, 1.0));
+
+    vec3 result = mix(historyColor, currentColor, blendFactor);
+    imageStore(historyOutput, outPixel, vec4(result, 1.0));
+}
--- a/assets/shaders/fsr2_accumulate.comp.spv
+++ b/assets/shaders/fsr2_accumulate.comp.spv
--- a/assets/shaders/fsr2_motion.comp.glsl
+++ b/assets/shaders/fsr2_motion.comp.glsl
@ -0,0 +1,35 @@
+#version 450
+
+layout(local_size_x = 8, local_size_y = 8) in;
+
+layout(set = 0, binding = 0) uniform sampler2D depthBuffer;
+layout(set = 0, binding = 1, rg16f) uniform writeonly image2D motionVectors;
+
+layout(push_constant) uniform PushConstants {
+    mat4 reprojMatrix;      // prevUnjitteredVP * inverse(currentUnjitteredVP)
+    vec4 resolution;        // xy = internal size, zw = 1/internal size
+} pc;
+
+// Motion vector pass, one invocation per internal-resolution pixel.
+// Reprojects the pixel's (uv, depth) into the previous frame through
+// pc.reprojMatrix and stores the UV-space offset (previous - current) in the
+// rg channels of motionVectors.
+void main() {
+    ivec2 pixelCoord = ivec2(gl_GlobalInvocationID.xy);
+    ivec2 imgSize = ivec2(pc.resolution.xy);
+    // Invocations outside the image have nothing to write.
+    if (pixelCoord.x >= imgSize.x || pixelCoord.y >= imgSize.y) return;
+
+    // Sample depth (Vulkan: 0 = near, 1 = far)
+    float depth = texelFetch(depthBuffer, pixelCoord, 0).r;
+
+    // Pixel center in UV [0,1] and NDC [-1,1]
+    vec2 uv = (vec2(pixelCoord) + 0.5) * pc.resolution.zw;
+    vec2 ndc = uv * 2.0 - 1.0;
+
+    // Clip-to-clip reprojection: current unjittered clip → previous unjittered clip
+    vec4 clipPos = vec4(ndc, depth, 1.0);
+    vec4 prevClip = pc.reprojMatrix * clipPos;
+
+    // A non-positive (or NaN) w means the point was on or behind the previous
+    // camera plane. The perspective divide would then produce Inf/NaN or a
+    // mirrored position that can look like a valid on-screen UV. Store a motion
+    // large enough that uv + motion always leaves [0,1], so a consumer that
+    // bounds-checks the reprojected UV rejects the history for this pixel.
+    if (!(prevClip.w > 0.0)) {
+        imageStore(motionVectors, pixelCoord, vec4(2.0, 2.0, 0.0, 0.0));
+        return;
+    }
+
+    vec2 prevNdc = prevClip.xy / prevClip.w;
+    vec2 prevUV = prevNdc * 0.5 + 0.5;
+
+    // Motion = previous position - current position (both unjittered, in UV space)
+    vec2 motion = prevUV - uv;
+
+    imageStore(motionVectors, pixelCoord, vec4(motion, 0.0, 0.0));
+}
--- a/assets/shaders/fsr2_motion.comp.spv
+++ b/assets/shaders/fsr2_motion.comp.spv
--- a/assets/shaders/fsr2_sharpen.frag.glsl
+++ b/assets/shaders/fsr2_sharpen.frag.glsl
@ -0,0 +1,50 @@
+#version 450
+
+layout(location = 0) in vec2 TexCoord;
+layout(location = 0) out vec4 FragColor;
+
+layout(set = 0, binding = 0) uniform sampler2D inputImage;
+
+layout(push_constant) uniform PushConstants {
+    vec4 params;  // x = 1/width, y = 1/height, z = sharpness (0-2), w = unused
+} pc;
+
+// Sharpening pass: 5-tap contrast-adaptive unsharp mask over inputImage.
+// pc.params: x = 1/width, y = 1/height, z = sharpness.
+void main() {
+    // Undo the vertex shader Y flip (postprocess.vert flips for Vulkan overlay,
+    // but we need standard UV coords for texture sampling)
+    vec2 tc = vec2(TexCoord.x, 1.0 - TexCoord.y);
+
+    vec2 texelSize = pc.params.xy;
+    float sharpness = pc.params.z;
+
+    // 5-tap cross pattern, one texel apart
+    vec3 center = texture(inputImage, tc).rgb;
+    vec3 north  = texture(inputImage, tc + vec2(0.0, -texelSize.y)).rgb;
+    vec3 south  = texture(inputImage, tc + vec2(0.0,  texelSize.y)).rgb;
+    vec3 west   = texture(inputImage, tc + vec2(-texelSize.x, 0.0)).rgb;
+    vec3 east   = texture(inputImage, tc + vec2( texelSize.x, 0.0)).rgb;
+
+    // Per-channel min/max of the neighborhood (including the center)
+    vec3 minRGB = min(center, min(min(north, south), min(west, east)));
+    vec3 maxRGB = max(center, max(max(north, south), max(west, east)));
+
+    // Adaptive weight: falls off linearly with the largest per-channel range
+    // and reaches zero once that range is 0.5 or more, so high-contrast
+    // edges get less sharpening (prevents ringing).
+    vec3 range = maxRGB - minRGB;
+    float maxRange = max(range.r, max(range.g, range.b));
+    float w = clamp(1.0 - maxRange * 2.0, 0.0, 1.0) * sharpness * 0.25;
+
+    // Apply sharpening via unsharp mask
+    vec3 avg = (north + south + west + east) * 0.25;
+    vec3 sharpened = center + (center - avg) * w;
+
+    // Clamp to the neighborhood range to prevent ringing artifacts
+    sharpened = clamp(sharpened, minRGB, maxRGB);
+
+    FragColor = vec4(sharpened, 1.0);
+}
--- a/assets/shaders/fsr2_sharpen.frag.spv
+++ b/assets/shaders/fsr2_sharpen.frag.spv
--- a/assets/shaders/fsr_easu.frag.glsl
+++ b/assets/shaders/fsr_easu.frag.glsl
@ -0,0 +1,102 @@
+#version 450
+// FSR 1.0 EASU (Edge Adaptive Spatial Upsampling) — Fragment Shader
+// Based on AMD FidelityFX Super Resolution 1.0
+// Implements edge-adaptive bilinear upsampling with directional filtering
+
+layout(set = 0, binding = 0) uniform sampler2D uInput;
+
+layout(push_constant) uniform FSRConstants {
+    vec4 con0; // inputSize.xy, 1/inputSize.xy
+    vec4 con1; // inputSize.xy / outputSize.xy, 0.5 * inputSize.xy / outputSize.xy
+    vec4 con2; // outputSize.xy, 1/outputSize.xy
+    vec4 con3; // sharpness, 0, 0, 0
+} fsr;
+
+layout(location = 0) in vec2 TexCoord;
+layout(location = 0) out vec4 outColor;
+
+// Samples uInput at mip 0 at the center of texel (p + off); both arguments are
+// expressed in input pixels and con0.zw (1/inputSize) converts them to UV.
+vec3 fsrFetch(vec2 p, vec2 off) {
+    vec2 texelCenter = p + off + 0.5;
+    vec2 uv = texelCenter * fsr.con0.zw;
+    return textureLod(uInput, uv, 0.0).rgb;
+}
+
+// Upsampling pass: maps each output pixel into input space, fetches a 12-tap
+// neighborhood, and blends the four nearest texels with bilinear weights that
+// are pulled toward the nearest texel in proportion to local edge strength.
+void main() {
+    // Undo the vertex shader Y flip (postprocess.vert flips for Vulkan overlay,
+    // but we need standard UV coords for texture sampling)
+    vec2 tc = vec2(TexCoord.x, 1.0 - TexCoord.y);
+
+    // Map output pixel to input space
+    vec2 pp = tc * fsr.con2.xy; // output pixel position
+    vec2 ip = pp * fsr.con1.xy - 0.5; // input pixel position (centered)
+    vec2 fp = floor(ip);
+    vec2 ff = ip - fp;
+
+    // 12-tap filter: 4x3 grid around the pixel
+    //  b c
+    // e f g h
+    // i j k l
+    //  n o
+    vec3 b = fsrFetch(fp, vec2( 0, -1));
+    vec3 c = fsrFetch(fp, vec2( 1, -1));
+    vec3 e = fsrFetch(fp, vec2(-1,  0));
+    vec3 f = fsrFetch(fp, vec2( 0,  0));
+    vec3 g = fsrFetch(fp, vec2( 1,  0));
+    vec3 h = fsrFetch(fp, vec2( 2,  0));
+    vec3 i = fsrFetch(fp, vec2(-1,  1));
+    vec3 j = fsrFetch(fp, vec2( 0,  1));
+    vec3 k = fsrFetch(fp, vec2( 1,  1));
+    vec3 l = fsrFetch(fp, vec2( 2,  1));
+    vec3 n = fsrFetch(fp, vec2( 0,  2));
+    vec3 o = fsrFetch(fp, vec2( 1,  2));
+
+    // Luma (use green channel as good perceptual approximation); only the taps
+    // that feed the edge-length sums below are needed.
+    float bL = b.g, cL = c.g, eL = e.g, fL = f.g;
+    float gL = g.g, iL = i.g, jL = j.g, kL = k.g;
+
+    // Accumulated horizontal / vertical luma differences around the 2x2 core
+    float lenH = abs(eL - fL) + abs(fL - gL) + abs(iL - jL) + abs(jL - kL);
+    float lenV = abs(bL - fL) + abs(fL - jL) + abs(cL - gL) + abs(gL - kL);
+
+    // Bilinear weights
+    float w1 = (1.0 - ff.x) * (1.0 - ff.y);
+    float w2 = ff.x * (1.0 - ff.y);
+    float w3 = (1.0 - ff.x) * ff.y;
+    float w4 = ff.x * ff.y;
+
+    // Edge strength in [0,1): how strongly one axis dominates the other.
+    // The epsilon keeps the denominator positive in flat regions.
+    float sharpness = fsr.con3.x;
+    float edgeStr = abs(lenH - lenV) / (lenH + lenV + 1e-7);
+    float sharp = mix(0.0, sharpness, edgeStr);
+
+    // Sharpen bilinear by pulling toward nearest texel
+    float maxW = max(max(w1, w2), max(w3, w4));
+    w1 = mix(w1, float(w1 == maxW), sharp * 0.25);
+    w2 = mix(w2, float(w2 == maxW), sharp * 0.25);
+    w3 = mix(w3, float(w3 == maxW), sharp * 0.25);
+    w4 = mix(w4, float(w4 == maxW), sharp * 0.25);
+
+    // Normalize
+    float wSum = w1 + w2 + w3 + w4;
+    w1 /= wSum; w2 /= wSum; w3 /= wSum; w4 /= wSum;
+
+    // Final color: weighted blend of the 4 nearest texels with edge awareness
+    vec3 color = f * w1 + g * w2 + j * w3 + k * w4;
+
+    // Optional: blend in some of the surrounding texels for anti-aliasing
+    float aa = 0.125 * edgeStr;
+    color = mix(color, (b + c + e + h + i + l + n + o) / 8.0, aa * 0.15);
+
+    outColor = vec4(clamp(color, 0.0, 1.0), 1.0);
+}
--- a/assets/shaders/fsr_easu.frag.spv
+++ b/assets/shaders/fsr_easu.frag.spv
--- a/assets/shaders/fsr_rcas.frag.glsl
+++ b/assets/shaders/fsr_rcas.frag.glsl
@ -0,0 +1,43 @@
+#version 450
+// FSR 1.0 RCAS (Robust Contrast Adaptive Sharpening) — Fragment Shader
+// Based on AMD FidelityFX Super Resolution 1.0
+// Applies contrast-adaptive sharpening after EASU upscaling
+
+layout(set = 0, binding = 0) uniform sampler2D uInput;
+
+layout(push_constant) uniform RCASConstants {
+    vec4 con0; // 1/outputSize.xy, outputSize.xy
+    vec4 con1; // sharpness (x), 0, 0, 0
+} rcas;
+
+layout(location = 0) in vec2 TexCoord;
+layout(location = 0) out vec4 outColor;
+
+// Sharpening pass: adds a contrast-weighted multiple of (center - neighbor
+// average) to the center texel. rcas.con0.xy = 1/outputSize, rcas.con1.x = sharpness.
+// NOTE(review): unlike fsr_easu.frag and fsr2_sharpen.frag, this shader samples
+// TexCoord directly without undoing the postprocess.vert Y flip — confirm which
+// vertex shader this pass is paired with.
+void main() {
+    // Fetch center and 4-neighborhood
+    vec2 texelSize = rcas.con0.xy;
+    vec3 c = texture(uInput, TexCoord).rgb;
+    vec3 n = texture(uInput, TexCoord + vec2( 0, -texelSize.y)).rgb;
+    vec3 s = texture(uInput, TexCoord + vec2( 0,  texelSize.y)).rgb;
+    vec3 w = texture(uInput, TexCoord + vec2(-texelSize.x,  0)).rgb;
+    vec3 e = texture(uInput, TexCoord + vec2( texelSize.x,  0)).rgb;
+
+    // Luma of the four neighbors (green channel approximation); the center
+    // does not take part in the contrast estimate.
+    float nL = n.g, sL = s.g, wL = w.g, eL = e.g;
+
+    // Min/max of neighborhood
+    float minL = min(min(nL, sL), min(wL, eL));
+    float maxL = max(max(nL, sL), max(wL, eL));
+
+    // Contrast adaptive sharpening weight
+    // Higher contrast = less sharpening to avoid ringing; the weight reaches
+    // zero once the contrast is 0.3 or more.
+    float contrast = maxL - minL;
+    float sharpness = rcas.con1.x;
+    float w0 = sharpness * (1.0 - smoothstep(0.0, 0.3, contrast));
+
+    // Apply sharpening: center + w0 * (center - average_neighbors)
+    vec3 avg = (n + s + w + e) * 0.25;
+    vec3 sharpened = c + w0 * (c - avg);
+
+    outColor = vec4(clamp(sharpened, 0.0, 1.0), 1.0);
+}
--- a/assets/shaders/fsr_rcas.frag.spv
+++ b/assets/shaders/fsr_rcas.frag.spv
--- a/assets/shaders/wmo.frag.glsl
+++ b/assets/shaders/wmo.frag.glsl
@ -149,21 +149,21 @@ void main() {
    vec3 norm = vertexNormal;
    if (enableNormalMap != 0 && lodFactor < 0.99 && normalMapStrength > 0.001) {
        vec3 mapNormal = texture(uNormalHeightMap, finalUV).rgb * 2.0 - 1.0;
-        // Scale XY by strength to control effect intensity
-        mapNormal.xy *= normalMapStrength;
        mapNormal = normalize(mapNormal);
        vec3 worldNormal = normalize(TBN * mapNormal);
        if (!gl_FrontFacing) worldNormal = -worldNormal;
-        // Blend: strength + LOD both contribute to fade toward vertex normal
-        float blendFactor = max(lodFactor, 1.0 - normalMapStrength);
-        norm = normalize(mix(worldNormal, vertexNormal, blendFactor));
+        // Linear blend: strength controls how much normal map detail shows,
+        // LOD fades out at distance. Both multiply for smooth falloff.
+        float blend = clamp(normalMapStrength, 0.0, 1.0) * (1.0 - lodFactor);
+        norm = normalize(mix(vertexNormal, worldNormal, blend));
    }

    vec3 result;

-    // Sample shadow map — skip for interior WMO groups (no sun indoors)
+    // Sample shadow map for all WMO groups (interior groups with 0x2000 flag
+    // include covered outdoor areas like archways/streets that should receive shadows)
    float shadow = 1.0;
-    if (shadowParams.x > 0.5 && isInterior == 0) {
+    if (shadowParams.x > 0.5) {
        vec3 ldir = normalize(-lightDir.xyz);
        float normalOffset = SHADOW_TEXEL * 2.0 * (1.0 - abs(dot(norm, ldir)));
        vec3 biasedPos = FragPos + norm * normalOffset;
--- a/assets/shaders/wmo.frag.spv
+++ b/assets/shaders/wmo.frag.spv
--- a/include/core/application.hpp
+++ b/include/core/application.hpp
@ -215,7 +215,7 @@ private:
        std::future<PreparedCreatureModel> future;
    };
    std::vector<AsyncCreatureLoad> asyncCreatureLoads_;
-    void processAsyncCreatureResults();
+    void processAsyncCreatureResults(bool unlimited = false);
    static constexpr int MAX_ASYNC_CREATURE_LOADS = 4; // concurrent background loads
    std::unordered_set<uint64_t> deadCreatureGuids_;            // GUIDs that should spawn in corpse/death pose
    std::unordered_map<uint32_t, uint32_t> displayIdModelCache_; // displayId → modelId (model caching)
@ -236,6 +236,11 @@ private:
    std::optional<PendingWorldEntry> pendingWorldEntry_;  // Deferred world entry during loading
    float taxiLandingClampTimer_ = 0.0f;
    float worldEntryMovementGraceTimer_ = 0.0f;
+
+    // Hearth teleport: freeze player until terrain loads at destination
+    bool hearthTeleportPending_ = false;
+    glm::vec3 hearthTeleportPos_{0.0f};  // render coords
+    float hearthTeleportTimer_ = 0.0f;   // timeout safety
    float facingSendCooldown_ = 0.0f;        // Rate-limits MSG_MOVE_SET_FACING
    float lastSentCanonicalYaw_ = 1000.0f;   // Sentinel — triggers first send
    float taxiStreamCooldown_ = 0.0f;
@ -373,7 +378,7 @@ private:
    std::unordered_set<uint64_t> pendingPlayerSpawnGuids_;
    void processPlayerSpawnQueue();
    std::unordered_set<uint64_t> creaturePermanentFailureGuids_;
-    void processCreatureSpawnQueue();
+    void processCreatureSpawnQueue(bool unlimited = false);

    struct PendingGameObjectSpawn {
        uint64_t guid;
--- a/include/game/game_handler.hpp
+++ b/include/game/game_handler.hpp
@ -565,6 +565,8 @@ public:
    void unstuck();
    void setUnstuckGyCallback(UnstuckCallback cb) { unstuckGyCallback_ = std::move(cb); }
    void unstuckGy();
+    void setUnstuckHearthCallback(UnstuckCallback cb) { unstuckHearthCallback_ = std::move(cb); }
+    void unstuckHearth();
    using BindPointCallback = std::function<void(uint32_t mapId, float x, float y, float z)>;
    void setBindPointCallback(BindPointCallback cb) { bindPointCallback_ = std::move(cb); }

@ -1445,6 +1447,7 @@ private:
    WorldEntryCallback worldEntryCallback_;
    UnstuckCallback unstuckCallback_;
    UnstuckCallback unstuckGyCallback_;
+    UnstuckCallback unstuckHearthCallback_;
    BindPointCallback bindPointCallback_;
    CreatureSpawnCallback creatureSpawnCallback_;
    CreatureDespawnCallback creatureDespawnCallback_;
--- a/include/rendering/camera.hpp
+++ b/include/rendering/camera.hpp
@ -23,9 +23,16 @@ public:
    const glm::vec3& getPosition() const { return position; }
    const glm::mat4& getViewMatrix() const { return viewMatrix; }
    const glm::mat4& getProjectionMatrix() const { return projectionMatrix; }
+    const glm::mat4& getUnjitteredProjectionMatrix() const { return unjitteredProjectionMatrix; }
    glm::mat4 getViewProjectionMatrix() const { return projectionMatrix * viewMatrix; }
+    glm::mat4 getUnjitteredViewProjectionMatrix() const { return unjitteredProjectionMatrix * viewMatrix; }
    float getAspectRatio() const { return aspectRatio; }

+    // Sub-pixel jitter for temporal upscaling (FSR 2)
+    void setJitter(float jx, float jy);
+    void clearJitter();
+    glm::vec2 getJitter() const { return jitterOffset; }
+
    glm::vec3 getForward() const;
    glm::vec3 getRight() const;
    glm::vec3 getUp() const;
@ -46,6 +53,8 @@ private:

    glm::mat4 viewMatrix = glm::mat4(1.0f);
    glm::mat4 projectionMatrix = glm::mat4(1.0f);
+    glm::mat4 unjitteredProjectionMatrix = glm::mat4(1.0f);
+    glm::vec2 jitterOffset = glm::vec2(0.0f);  // NDC jitter (applied to projection)
 };

 } // namespace rendering
--- a/include/rendering/character_renderer.hpp
+++ b/include/rendering/character_renderer.hpp
@ -13,6 +13,8 @@
 #include <utility>
 #include <future>
 #include <deque>
+#include <mutex>
+#include <atomic>

 namespace wowee {
 namespace pipeline { class AssetManager; }
@ -64,6 +66,8 @@ public:

    void update(float deltaTime, const glm::vec3& cameraPos = glm::vec3(0.0f));

+    /** Pre-allocate GPU resources (bone SSBOs, descriptors) on main thread before parallel render. */
+    void prepareRender(uint32_t frameIndex);
    void render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera);
    void recreatePipelines();
    bool initializeShadow(VkRenderPass shadowRenderPass);
@ -304,15 +308,23 @@ private:
    std::unique_ptr<VkTexture> generateNormalHeightMap(
        const uint8_t* pixels, uint32_t width, uint32_t height, float& outVariance);

-    // Deferred normal map generation — avoids stalling loadModel
-    struct PendingNormalMap {
+    // Background normal map generation — CPU work on thread pool, GPU upload on main thread
+    struct NormalMapResult {
        std::string cacheKey;
-        std::vector<uint8_t> pixels;  // RGBA pixel data
+        std::vector<uint8_t> pixels;  // RGBA normal map output
        uint32_t width, height;
+        float variance;
    };
-    std::deque<PendingNormalMap> pendingNormalMaps_;
+    // Completed results ready for GPU upload (populated by background threads)
+    std::mutex normalMapResultsMutex_;
+    std::deque<NormalMapResult> completedNormalMaps_;
+    std::atomic<int> pendingNormalMapCount_{0};  // in-flight background tasks
+
+    // Pure CPU normal map generation (thread-safe, no GPU access)
+    static NormalMapResult generateNormalHeightMapCPU(
+        std::string cacheKey, std::vector<uint8_t> pixels, uint32_t width, uint32_t height);
 public:
-    void processPendingNormalMaps(int budget = 2);
+    void processPendingNormalMaps(int budget = 4);
 private:

    // Normal mapping / POM settings
--- a/include/rendering/loading_screen.hpp
+++ b/include/rendering/loading_screen.hpp
@ -24,6 +24,10 @@ public:
    // Render the loading screen with progress bar and status text (pure ImGui)
    void render();

+    // Draw loading screen as ImGui overlay (call within an existing ImGui frame).
+    // Used during warmup to overlay loading screen on top of the rendered world.
+    void renderOverlay();
+
    void setProgress(float progress) { loadProgress = progress; }
    void setStatus(const std::string& status) { statusText = status; }

--- a/include/rendering/m2_renderer.hpp
+++ b/include/rendering/m2_renderer.hpp
@ -122,6 +122,7 @@ struct M2ModelGPU {
    bool isKoboldFlame = false;     // Model name matches kobold+(candle/torch/mine) (precomputed)
    bool isLavaModel = false;       // Model name contains lava/molten/magma (UV scroll fallback)
    bool hasTextureAnimation = false; // True if any batch has UV animation
+    uint8_t availableLODs = 0;  // Bitmask: bit N set if any batch has submeshLevel==N

    // Particle emitter data (kept from M2Model)
    std::vector<pipeline::M2ParticleEmitter> particleEmitters;
@ -193,6 +194,7 @@ struct M2Instance {

    // Frame-skip optimization (update distant animations less frequently)
    uint8_t frameSkipCounter = 0;
+    bool bonesDirty[2] = {false, false};  // Per-frame-index: set when bones recomputed, cleared after upload

    // Per-instance bone SSBO (double-buffered)
    ::VkBuffer boneBuffer[2] = {};
@ -265,6 +267,8 @@ public:
    /**
     * Render all visible instances (Vulkan)
     */
+    /** Pre-allocate GPU resources (bone SSBOs, descriptors) on main thread before parallel render. */
+    void prepareRender(uint32_t frameIndex, const Camera& camera);
    void render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera);

    /**
@ -471,9 +475,7 @@ private:
    static constexpr float SPATIAL_CELL_SIZE = 64.0f;
    std::unordered_map<GridCell, std::vector<uint32_t>, GridCellHash> spatialGrid;
    std::unordered_map<uint32_t, size_t> instanceIndexById;
-    mutable std::vector<size_t> candidateScratch;
-    mutable std::unordered_set<uint32_t> candidateIdScratch;
-    mutable std::vector<uint32_t> collisionTriScratch_;
+    // Collision scratch buffers are thread_local (see m2_renderer.cpp) for thread-safety.

    // Collision query profiling (per frame).
    mutable double queryTimeMs = 0.0;
--- a/include/rendering/renderer.hpp
+++ b/include/rendering/renderer.hpp
@ -4,10 +4,12 @@
 #include <string>
 #include <cstdint>
 #include <vector>
+#include <future>
 #include <glm/glm.hpp>
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include "rendering/vk_frame_data.hpp"
+#include "rendering/vk_utils.hpp"
 #include "rendering/sky_system.hpp"

 namespace wowee {
@ -244,7 +246,7 @@ private:
    glm::vec3 shadowCenter = glm::vec3(0.0f);
    bool shadowCenterInitialized = false;
    bool shadowsEnabled = true;
-    float shadowDistance_ = 72.0f;  // Shadow frustum half-extent (default: 72 units)
+    float shadowDistance_ = 300.0f;  // Shadow frustum half-extent (default: 300 units)
    uint32_t shadowFrameCounter_ = 0;


@ -255,10 +257,20 @@ public:

    void setShadowsEnabled(bool enabled) { shadowsEnabled = enabled; }
    bool areShadowsEnabled() const { return shadowsEnabled; }
-    void setShadowDistance(float dist) { shadowDistance_ = glm::clamp(dist, 40.0f, 200.0f); }
+    void setShadowDistance(float dist) { shadowDistance_ = glm::clamp(dist, 40.0f, 500.0f); }
    float getShadowDistance() const { return shadowDistance_; }
    void setMsaaSamples(VkSampleCountFlagBits samples);

+    // FSR (FidelityFX Super Resolution) upscaling
+    void setFSREnabled(bool enabled);
+    bool isFSREnabled() const { return fsr_.enabled; }
+    void setFSRQuality(float scaleFactor);  // 0.50=Perf, 0.59=Balanced, 0.67=Quality, 0.77=UltraQuality
+    void setFSRSharpness(float sharpness);  // 0.0 - 2.0
+    float getFSRScaleFactor() const { return fsr_.scaleFactor; }
+    float getFSRSharpness() const { return fsr_.sharpness; }
+    void setFSR2Enabled(bool enabled);
+    bool isFSR2Enabled() const { return fsr2_.enabled; }
+
    void setWaterRefractionEnabled(bool enabled);
    bool isWaterRefractionEnabled() const;

@ -312,7 +324,7 @@ private:
    VmaAllocation selCircleIdxAlloc = VK_NULL_HANDLE;
    int selCircleVertCount = 0;
    void initSelectionCircle();
-    void renderSelectionCircle(const glm::mat4& view, const glm::mat4& projection);
+    void renderSelectionCircle(const glm::mat4& view, const glm::mat4& projection, VkCommandBuffer overrideCmd = VK_NULL_HANDLE);
    glm::vec3 selCirclePos{0.0f};
    glm::vec3 selCircleColor{1.0f, 0.0f, 0.0f};
    float selCircleRadius = 1.5f;
@ -322,7 +334,95 @@ private:
    VkPipeline overlayPipeline = VK_NULL_HANDLE;
    VkPipelineLayout overlayPipelineLayout = VK_NULL_HANDLE;
    void initOverlayPipeline();
-    void renderOverlay(const glm::vec4& color);
+    void renderOverlay(const glm::vec4& color, VkCommandBuffer overrideCmd = VK_NULL_HANDLE);
+
+    // FSR 1.0 upscaling state: user settings, the reduced-resolution off-screen scene target, and the upscale pipeline objects
+    struct FSRState {
+        bool enabled = false;  // Reported by isFSREnabled()
+        bool needsRecreate = false;  // NOTE(review): presumably requests a rebuild of the resources below — confirm in renderer.cpp
+        float scaleFactor = 0.77f;  // Ultra Quality default; reported by getFSRScaleFactor()
+        float sharpness = 0.5f;  // Reported by getFSRSharpness(); setFSRSharpness documents the range 0.0 - 2.0
+        uint32_t internalWidth = 0;  // NOTE(review): presumably the reduced render width in pixels — confirm
+        uint32_t internalHeight = 0;  // NOTE(review): presumably the reduced render height in pixels — confirm
+
+        // Off-screen scene target (reduced resolution)
+        AllocatedImage sceneColor{};        // 1x color (non-MSAA render target / MSAA resolve target)
+        AllocatedImage sceneDepth{};        // Depth (matches current MSAA sample count)
+        AllocatedImage sceneMsaaColor{};    // MSAA color target (only when MSAA > 1x)
+        AllocatedImage sceneDepthResolve{}; // Depth resolve (only when MSAA + depth resolve)
+        VkFramebuffer sceneFramebuffer = VK_NULL_HANDLE;
+        VkSampler sceneSampler = VK_NULL_HANDLE;
+
+        // Upscale pipeline (all handles start as VK_NULL_HANDLE)
+        VkPipeline pipeline = VK_NULL_HANDLE;
+        VkPipelineLayout pipelineLayout = VK_NULL_HANDLE;
+        VkDescriptorSetLayout descSetLayout = VK_NULL_HANDLE;
+        VkDescriptorPool descPool = VK_NULL_HANDLE;
+        VkDescriptorSet descSet = VK_NULL_HANDLE;
+    };
+    FSRState fsr_;
+    bool initFSRResources();
+    void destroyFSRResources();
+    void renderFSRUpscale();
+
+    // FSR 2.2 temporal upscaling state: settings, internal-resolution scene targets, motion-vector and history images, the three pass pipelines, and previous-frame data
+    struct FSR2State {
+        bool enabled = false;  // Reported by isFSR2Enabled()
+        bool needsRecreate = false;  // NOTE(review): presumably requests a rebuild of the resources below — confirm in renderer.cpp
+        float scaleFactor = 0.77f;
+        float sharpness = 0.5f;
+        uint32_t internalWidth = 0;  // NOTE(review): presumably the internal render width in pixels — confirm
+        uint32_t internalHeight = 0;  // NOTE(review): presumably the internal render height in pixels — confirm
+
+        // Off-screen scene targets (internal resolution, no MSAA — FSR2 replaces AA)
+        AllocatedImage sceneColor{};
+        AllocatedImage sceneDepth{};
+        VkFramebuffer sceneFramebuffer = VK_NULL_HANDLE;
+
+        // Samplers
+        VkSampler linearSampler = VK_NULL_HANDLE;   // For color
+        VkSampler nearestSampler = VK_NULL_HANDLE;  // For depth / motion vectors
+
+        // Motion vector buffer (internal resolution)
+        AllocatedImage motionVectors{};  // NOTE(review): presumably the rg16f image written by fsr2_motion.comp — confirm
+
+        // History buffers (display resolution, ping-pong)
+        AllocatedImage history[2]{};
+        uint32_t currentHistory = 0;  // Output index (0 or 1)
+
+        // Compute pipelines (motion vector pass, then temporal accumulation pass)
+        VkPipeline motionVecPipeline = VK_NULL_HANDLE;
+        VkPipelineLayout motionVecPipelineLayout = VK_NULL_HANDLE;
+        VkDescriptorSetLayout motionVecDescSetLayout = VK_NULL_HANDLE;
+        VkDescriptorPool motionVecDescPool = VK_NULL_HANDLE;
+        VkDescriptorSet motionVecDescSet = VK_NULL_HANDLE;
+
+        VkPipeline accumulatePipeline = VK_NULL_HANDLE;
+        VkPipelineLayout accumulatePipelineLayout = VK_NULL_HANDLE;
+        VkDescriptorSetLayout accumulateDescSetLayout = VK_NULL_HANDLE;
+        VkDescriptorPool accumulateDescPool = VK_NULL_HANDLE;
+        VkDescriptorSet accumulateDescSets[2] = {};  // Per ping-pong
+
+        // RCAS sharpening pass (display resolution)
+        VkPipeline sharpenPipeline = VK_NULL_HANDLE;
+        VkPipelineLayout sharpenPipelineLayout = VK_NULL_HANDLE;
+        VkDescriptorSetLayout sharpenDescSetLayout = VK_NULL_HANDLE;
+        VkDescriptorPool sharpenDescPool = VK_NULL_HANDLE;
+        VkDescriptorSet sharpenDescSets[2] = {};  // Two sets; the change description says they are double-buffered to avoid racing in-flight command buffers
+
+        // Previous frame state for motion vector reprojection
+        glm::mat4 prevViewProjection = glm::mat4(1.0f);  // NOTE(review): presumably the previous frame's unjittered view-projection — confirm
+        glm::vec2 prevJitter = glm::vec2(0.0f);  // NOTE(review): reprojection is described as unjittered; confirm this is still read
+        uint32_t frameIndex = 0;  // NOTE(review): change description says this wraps at 256 for the Halton sequence — confirm
+        bool needsHistoryReset = true;  // Starts true; NOTE(review): presumably drives the accumulate shader's resetHistory flag — confirm
+    };
+    FSR2State fsr2_;
+    bool initFSR2Resources();
+    void destroyFSR2Resources();
+    void dispatchMotionVectors();
+    void dispatchTemporalAccumulate();
+    void renderFSR2Sharpen();
+    static float halton(uint32_t index, uint32_t base);

    // Footstep event tracking (animation-driven)
    uint32_t footstepLastAnimationId = 0;
@ -411,6 +511,36 @@ private:
    void setupWater1xPass();
    void renderReflectionPass();

+    // ── Multithreaded secondary command buffer recording ──
+    // Indices into secondaryCmds_ arrays
+    static constexpr uint32_t SEC_SKY     = 0;  // sky (main thread)
+    static constexpr uint32_t SEC_TERRAIN = 1;  // terrain (worker 0)
+    static constexpr uint32_t SEC_WMO     = 2;  // WMO (worker 1)
+    static constexpr uint32_t SEC_CHARS   = 3;  // selection circle + characters (main thread)
+    static constexpr uint32_t SEC_M2      = 4;  // M2 + particles + glow (worker 2)
+    static constexpr uint32_t SEC_POST    = 5;  // water + weather + effects (main thread)
+    static constexpr uint32_t SEC_IMGUI   = 6;  // ImGui (main thread, non-FSR only)
+    static constexpr uint32_t NUM_SECONDARIES = 7;
+    static constexpr uint32_t NUM_WORKERS = 3;  // terrain, WMO, M2
+
+    // Per-worker command pools (thread-safe: one pool per thread)
+    VkCommandPool workerCmdPools_[NUM_WORKERS] = {};
+    // Main-thread command pool for its secondary buffers
+    VkCommandPool mainSecondaryCmdPool_ = VK_NULL_HANDLE;
+    // Pre-allocated secondary command buffers [secondaryIndex][frameInFlight]
+    VkCommandBuffer secondaryCmds_[NUM_SECONDARIES][MAX_FRAMES] = {};
+
+    bool parallelRecordingEnabled_ = false;  // set true after pools/buffers created
+    bool createSecondaryCommandResources();
+    void destroySecondaryCommandResources();
+    VkCommandBuffer beginSecondary(uint32_t secondaryIndex);
+    void setSecondaryViewportScissor(VkCommandBuffer cmd);
+
+    // Cached render pass state for secondary buffer inheritance
+    VkRenderPass activeRenderPass_ = VK_NULL_HANDLE;
+    VkFramebuffer activeFramebuffer_ = VK_NULL_HANDLE;
+    VkExtent2D activeRenderExtent_ = {0, 0};
+
    // Active character previews for off-screen rendering
    std::vector<CharacterPreview*> activePreviews_;

--- a/include/rendering/terrain_manager.hpp
+++ b/include/rendering/terrain_manager.hpp
@ -348,6 +348,7 @@ private:
    int unloadRadius = 7;    // Unload tiles beyond this radius
    float updateInterval = 0.033f;  // Check streaming every 33ms (~30 fps)
    float timeSinceLastUpdate = 0.0f;
+    float proactiveStreamTimer_ = 0.0f;
    bool taxiStreamingMode_ = false;

    // Tile size constants (WoW ADT specifications)
--- a/include/rendering/vk_context.hpp
+++ b/include/rendering/vk_context.hpp
@ -84,6 +84,10 @@ public:
    bool isSwapchainDirty() const { return swapchainDirty; }
    void markSwapchainDirty() { swapchainDirty = true; }

+    // VSync (present mode)
+    bool isVsyncEnabled() const { return vsync_; }
+    void setVsync(bool enabled) { vsync_ = enabled; }
+
    bool isDeviceLost() const { return deviceLost_; }

    // MSAA
@ -145,6 +149,7 @@ private:
    std::vector<VkFramebuffer> swapchainFramebuffers;
    bool swapchainDirty = false;
    bool deviceLost_ = false;
+    bool vsync_ = true;

    // Per-frame resources
    FrameData frames[MAX_FRAMES_IN_FLIGHT];
--- a/include/rendering/wmo_renderer.hpp
+++ b/include/rendering/wmo_renderer.hpp
@ -148,6 +148,8 @@ public:
     * @param perFrameSet Per-frame descriptor set (set 0)
     * @param camera Camera for frustum culling
     */
+    /** Pre-update mutable state (frame ID, material UBOs) on main thread before parallel render. */
+    void prepareRender();
    void render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera);

    /**
@ -332,6 +334,9 @@ public:
    // Defer normal/height map generation during streaming to avoid CPU stalls
    void setDeferNormalMaps(bool defer) { deferNormalMaps_ = defer; }

+    // Generate normal/height maps for cached textures that were loaded while deferred
+    void backfillNormalMaps();
+
 private:
    // WMO material UBO — matches WMOMaterial in wmo.frag.glsl
    struct WMOMaterialUBO {
@ -706,9 +711,7 @@ private:
    static constexpr float SPATIAL_CELL_SIZE = 64.0f;
    std::unordered_map<GridCell, std::vector<uint32_t>, GridCellHash> spatialGrid;
    std::unordered_map<uint32_t, size_t> instanceIndexById;
-    mutable std::vector<size_t> candidateScratch;
-    mutable std::vector<uint32_t> triScratch_;  // Scratch for collision grid queries
-    mutable std::unordered_set<uint32_t> candidateIdScratch;
+    // Collision scratch buffers are thread_local (see wmo_renderer.cpp) for thread-safety.

    // Parallel visibility culling
    uint32_t numCullThreads_ = 1;
@ -720,6 +723,8 @@ private:
        uint32_t distanceCulled = 0;
    };
    std::vector<std::future<void>> cullFutures_;
+    std::vector<size_t> visibleInstances_;      // reused per frame
+    std::vector<InstanceDrawList> drawLists_;    // reused per frame

    // Collision query profiling (per frame).
    mutable double queryTimeMs = 0.0;
--- a/include/ui/game_screen.hpp
+++ b/include/ui/game_screen.hpp
@ -87,7 +87,7 @@ private:
    bool pendingVsync = false;
    int pendingResIndex = 0;
    bool pendingShadows = true;
-    float pendingShadowDistance = 72.0f;
+    float pendingShadowDistance = 300.0f;
    bool pendingWaterRefraction = false;
    int pendingMasterVolume = 100;
    int pendingMusicVolume = 30;
@ -116,6 +116,10 @@ private:
    float pendingNormalMapStrength = 0.8f;  // 0.0-2.0
    bool pendingPOM = true;             // on by default
    int pendingPOMQuality = 1;          // 0=Low(16), 1=Medium(32), 2=High(64)
+    bool pendingFSR = false;
+    int pendingFSRQuality = 0;          // 0=UltraQuality, 1=Quality, 2=Balanced, 3=Performance
+    float pendingFSRSharpness = 0.5f;
+    bool fsrSettingsApplied_ = false;

    // UI element transparency (0.0 = fully transparent, 1.0 = fully opaque)
    float uiOpacity_ = 0.65f;
--- a/src/core/application.cpp
+++ b/src/core/application.cpp
@ -49,9 +49,9 @@
 #include <SDL2/SDL.h>
 // GL/glew.h removed — Vulkan migration Phase 1
 #include <cstdlib>
+#include <climits>
 #include <algorithm>
 #include <cctype>
-#include <cctype>
 #include <optional>
 #include <sstream>
 #include <set>
@ -868,7 +868,7 @@ void Application::update(float deltaTime) {
                }
                auto stageEnd = std::chrono::steady_clock::now();
                float stageMs = std::chrono::duration<float, std::milli>(stageEnd - stageStart).count();
-                if (stageMs > 3.0f) {
+                if (stageMs > 50.0f) {
                    LOG_WARNING("SLOW update stage '", stageName, "': ", stageMs, "ms");
                }
            };
@ -913,29 +913,12 @@ void Application::update(float deltaTime) {
            inGameStep = "spawn/equipment queues";
            updateCheckpoint = "in_game: spawn/equipment queues";
            runInGameStage("spawn/equipment queues", [&] {
-                auto t0 = std::chrono::steady_clock::now();
                processPlayerSpawnQueue();
-                auto t1 = std::chrono::steady_clock::now();
                processCreatureSpawnQueue();
-                auto t2 = std::chrono::steady_clock::now();
                processAsyncNpcCompositeResults();
-                auto t3 = std::chrono::steady_clock::now();
                processDeferredEquipmentQueue();
-                auto t4 = std::chrono::steady_clock::now();
-                // Process deferred normal maps (2 per frame to spread CPU cost)
                if (auto* cr = renderer ? renderer->getCharacterRenderer() : nullptr) {
-                    cr->processPendingNormalMaps(2);
-                }
-                auto t5 = std::chrono::steady_clock::now();
-                float pMs = std::chrono::duration<float, std::milli>(t1 - t0).count();
-                float cMs = std::chrono::duration<float, std::milli>(t2 - t1).count();
-                float nMs = std::chrono::duration<float, std::milli>(t3 - t2).count();
-                float eMs = std::chrono::duration<float, std::milli>(t4 - t3).count();
-                float nmMs = std::chrono::duration<float, std::milli>(t5 - t4).count();
-                float total = pMs + cMs + nMs + eMs + nmMs;
-                if (total > 4.0f) {
-                    LOG_WARNING("spawn/equip breakdown: player=", pMs, "ms creature=", cMs,
-                                "ms npcComposite=", nMs, "ms equip=", eMs, "ms normalMaps=", nmMs, "ms");
+                    cr->processPendingNormalMaps(4);
                }
            });
            // Self-heal missing creature visuals: if a nearby UNIT exists in
@ -1032,14 +1015,33 @@ void Application::update(float deltaTime) {
                    if (renderer && renderer->getCameraController())
                        renderer->getCameraController()->clearMovementInputs();
                }
+                // Hearth teleport: keep player frozen until terrain loads at destination (or the safety timer expires)
+                auto* hearthCc = renderer ? renderer->getCameraController() : nullptr;  // nullable, as guarded elsewhere in update()
+                if (hearthTeleportPending_ && hearthCc && renderer->getTerrainManager()) {
+                    hearthTeleportTimer_ -= deltaTime;
+                    auto terrainH = renderer->getTerrainManager()->getHeightAt(
+                        hearthTeleportPos_.x, hearthTeleportPos_.y);
+                    if (terrainH || hearthTeleportTimer_ <= 0.0f) {
+                        // Snap to the floor only when a height is known; on timeout release where the player is
+                        if (terrainH) {
+                            hearthTeleportPos_.z = *terrainH + 0.5f;
+                            hearthCc->teleportTo(hearthTeleportPos_);
+                        }
+                        hearthCc->setExternalFollow(false);
+                        worldEntryMovementGraceTimer_ = 1.0f;
+                        hearthTeleportPending_ = false;
+                        // Report which exit condition fired instead of always claiming the terrain loaded
+                        LOG_INFO("Unstuck hearth: ", terrainH ? "terrain loaded" : "terrain load timed out", ", player released");
+                    }
+                }
                if (renderer && renderer->getCameraController()) {
                const bool externallyDrivenMotion = onTaxi || onWMOTransport || chargeActive_;
                // Keep physics frozen (externalFollow) during landing clamp when terrain
                // hasn't loaded yet — prevents gravity from pulling player through void.
+                bool hearthFreeze = hearthTeleportPending_;
                bool landingClampActive = !onTaxi && taxiLandingClampTimer_ > 0.0f &&
                                          worldEntryMovementGraceTimer_ <= 0.0f &&
                                          !gameHandler->isMounted();
-                renderer->getCameraController()->setExternalFollow(externallyDrivenMotion || landingClampActive);
+                renderer->getCameraController()->setExternalFollow(externallyDrivenMotion || landingClampActive || hearthFreeze);
                renderer->getCameraController()->setExternalMoving(externallyDrivenMotion);
                if (externallyDrivenMotion) {
                    // Drop any stale local movement toggles while server drives taxi motion.
@ -1514,7 +1516,7 @@ void Application::update(float deltaTime) {
        }
        float ruMs = std::chrono::duration<float, std::milli>(
            std::chrono::steady_clock::now() - rendererUpdateStart).count();
-        if (ruMs > 5.0f) {
+        if (ruMs > 50.0f) {
            LOG_WARNING("SLOW update stage 'renderer->update': ", ruMs, "ms");
        }
    }
@ -1894,9 +1896,43 @@ void Application::setupUICallbacks() {
        LOG_INFO("Unstuck: high fallback snap");
    });

+    // /unstuckhearth — teleport to the hearthstone bind point (server-synced) and freeze the player;
+    // the update loop releases the freeze once terrain height is available there or the timer expires.
+    gameHandler->setUnstuckHearthCallback([this, clearStuckMovement, forceServerTeleportCommand]() {
+        if (!renderer || !renderer->getCameraController() || !gameHandler) return;
+
+        uint32_t bindMap = 0;  // NOTE(review): filled by getHomeBind but never compared to the current map
+        glm::vec3 bindPos(0.0f);
+        if (!gameHandler->getHomeBind(bindMap, bindPos)) {
+            LOG_WARNING("Unstuck hearth: no bind point available");
+            return;
+        }
+
+        worldEntryMovementGraceTimer_ = 10.0f;  // long grace — the hearth release path in update() resets it
+        taxiLandingClampTimer_ = 0.0f;
+        lastTaxiFlight_ = false;
+        clearStuckMovement();
+
+        auto* cc = renderer->getCameraController();
+        glm::vec3 renderPos = core::coords::canonicalToRender(bindPos);
+        renderPos.z += 2.0f;  // start slightly above the bind point; the release path snaps to terrain height
+
+        // Freeze player in place (external follow on) and mirror the new position to the server
+        cc->teleportTo(renderPos);
+        cc->setExternalFollow(true);
+        forceServerTeleportCommand(renderPos);
+        clearStuckMovement();  // cleared a second time, now after the teleport
+
+        // Set pending state — update loop will unfreeze once terrain is loaded
+        hearthTeleportPending_ = true;
+        hearthTeleportPos_ = renderPos;
+        hearthTeleportTimer_ = 15.0f;  // 15s safety timeout
+        LOG_INFO("Unstuck hearth: teleporting to bind point, waiting for terrain...");
+    });
+
    // Auto-unstuck: falling for > 5 seconds = void fall, teleport to map entry
    if (renderer->getCameraController()) {
-        renderer->getCameraController()->setAutoUnstuckCallback([this]() {
+        renderer->getCameraController()->setAutoUnstuckCallback([this, forceServerTeleportCommand]() {
            if (!renderer || !renderer->getCameraController()) return;
            auto* cc = renderer->getCameraController();

@ -1904,7 +1940,8 @@ void Application::setupUICallbacks() {
            glm::vec3 spawnPos = cc->getDefaultPosition();
            spawnPos.z += 5.0f;
            cc->teleportTo(spawnPos);
-            LOG_INFO("Auto-unstuck: teleported to map entry point");
+            forceServerTeleportCommand(spawnPos);
+            LOG_INFO("Auto-unstuck: teleported to map entry point (server synced)");
        });
    }

@ -4167,11 +4204,17 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
        });
    }

-    // Hide first-login hitch by draining initial world packets/spawn queues before
-    // dropping the loading screen. Keep this bounded so we don't stall indefinitely.
+    // Keep the loading screen visible until all spawn/equipment/gameobject queues
+    // are fully drained. This ensures the player sees a fully populated world
+    // (character clothed, NPCs placed, game objects loaded) when the screen drops.
    {
-        const float kWarmupMaxSeconds = 2.5f;
+        const float kMinWarmupSeconds = 2.0f;   // minimum time to drain network packets
+        const float kMaxWarmupSeconds = 15.0f;  // hard cap to avoid infinite stall
        const auto warmupStart = std::chrono::high_resolution_clock::now();
+        // Track consecutive idle iterations (all queues empty) to detect convergence
+        int idleIterations = 0;
+        const int kIdleThreshold = 5;  // require 5 consecutive empty loops (~80ms)
+
        while (true) {
            SDL_Event event;
            while (SDL_PollEvent(&event)) {
@ -4185,7 +4228,6 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
                    int w = event.window.data1;
                    int h = event.window.data2;
                    window->setSize(w, h);
-                    // Vulkan viewport set in command buffer
                    if (renderer && renderer->getCamera()) {
                        renderer->getCamera()->setAspectRatio(static_cast<float>(w) / h);
                    }
@ -4207,60 +4249,18 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
            processPlayerSpawnQueue();

            // During load screen warmup: lift per-frame budgets so GPU uploads
-            // happen in bulk while the loading screen is still visible.
-            // Process ALL async creature model uploads (no 3-per-frame cap).
-            {
-                for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) {
-                    if (!it->future.valid() ||
-                        it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
-                        ++it;
-                        continue;
-                    }
-                    auto result = it->future.get();
-                    it = asyncCreatureLoads_.erase(it);
-                    if (result.permanent_failure) {
-                        nonRenderableCreatureDisplayIds_.insert(result.displayId);
-                        creaturePermanentFailureGuids_.insert(result.guid);
-                        pendingCreatureSpawnGuids_.erase(result.guid);
-                        creatureSpawnRetryCounts_.erase(result.guid);
-                        continue;
-                    }
-                    if (!result.valid || !result.model) {
-                        pendingCreatureSpawnGuids_.erase(result.guid);
-                        creatureSpawnRetryCounts_.erase(result.guid);
-                        continue;
-                    }
-                    auto* charRenderer = renderer ? renderer->getCharacterRenderer() : nullptr;
-                    if (!charRenderer) { pendingCreatureSpawnGuids_.erase(result.guid); continue; }
-                    if (!charRenderer->loadModel(*result.model, result.modelId)) {
-                        nonRenderableCreatureDisplayIds_.insert(result.displayId);
-                        creaturePermanentFailureGuids_.insert(result.guid);
-                        pendingCreatureSpawnGuids_.erase(result.guid);
-                        creatureSpawnRetryCounts_.erase(result.guid);
-                        continue;
-                    }
-                    displayIdModelCache_[result.displayId] = result.modelId;
-                    pendingCreatureSpawnGuids_.erase(result.guid);
-                    creatureSpawnRetryCounts_.erase(result.guid);
-                    if (!creatureInstances_.count(result.guid) &&
-                        !creaturePermanentFailureGuids_.count(result.guid)) {
-                        PendingCreatureSpawn s{};
-                        s.guid = result.guid; s.displayId = result.displayId;
-                        s.x = result.x; s.y = result.y; s.z = result.z;
-                        s.orientation = result.orientation;
-                        pendingCreatureSpawns_.push_back(s);
-                        pendingCreatureSpawnGuids_.insert(result.guid);
-                    }
-                }
-            }
-            processCreatureSpawnQueue();
+            // and spawns happen in bulk while the loading screen is still visible.
+            processCreatureSpawnQueue(true);
            processAsyncNpcCompositeResults();
-            processDeferredEquipmentQueue();
+            // Process equipment queue more aggressively during warmup (multiple per iteration)
+            for (int i = 0; i < 8 && (!deferredEquipmentQueue_.empty() || !asyncEquipmentLoads_.empty()); i++) {
+                processDeferredEquipmentQueue();
+            }
            if (auto* cr = renderer ? renderer->getCharacterRenderer() : nullptr) {
-                cr->processPendingNormalMaps(10);  // higher budget during load screen
+                cr->processPendingNormalMaps(INT_MAX);
            }

-            // Process ALL pending game object spawns (no 1-per-frame cap during load screen).
+            // Process ALL pending game object spawns.
            while (!pendingGameObjectSpawns_.empty()) {
                auto& s = pendingGameObjectSpawns_.front();
                spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
@ -4271,14 +4271,42 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
            processPendingMount();
            updateQuestMarkers();

+            // Update renderer (terrain streaming, animations)
+            if (renderer) {
+                renderer->update(1.0f / 60.0f);
+            }
+
            const auto now = std::chrono::high_resolution_clock::now();
            const float elapsed = std::chrono::duration<float>(now - warmupStart).count();
-            const float t = std::clamp(elapsed / kWarmupMaxSeconds, 0.0f, 1.0f);
-            showProgress("Finalizing world sync...", 0.97f + t * 0.025f);

-            if (elapsed >= kWarmupMaxSeconds) {
+            // Check if all queues are drained
+            bool queuesEmpty =
+                pendingCreatureSpawns_.empty() &&
+                asyncCreatureLoads_.empty() &&
+                asyncNpcCompositeLoads_.empty() &&
+                deferredEquipmentQueue_.empty() &&
+                asyncEquipmentLoads_.empty() &&
+                pendingGameObjectSpawns_.empty() &&
+                asyncGameObjectLoads_.empty() &&
+                pendingPlayerSpawns_.empty();
+
+            if (queuesEmpty) {
+                idleIterations++;
+            } else {
+                idleIterations = 0;
+            }
+
+            // Exit when: (min time passed AND queues drained for several iterations) OR hard cap
+            bool readyToExit = (elapsed >= kMinWarmupSeconds && idleIterations >= kIdleThreshold);
+            if (readyToExit || elapsed >= kMaxWarmupSeconds) {
+                if (elapsed >= kMaxWarmupSeconds) {
+                    LOG_WARNING("Warmup hit hard cap (", kMaxWarmupSeconds, "s), entering world with pending work");
+                }
                break;
            }
+
+            const float t = std::clamp(elapsed / kMaxWarmupSeconds, 0.0f, 1.0f);
+            showProgress("Finalizing world sync...", 0.97f + t * 0.025f);
            SDL_Delay(16);
        }
    }
@ -5154,7 +5182,7 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
        {
            auto texEnd = std::chrono::steady_clock::now();
            float texMs = std::chrono::duration<float, std::milli>(texEnd - texStart).count();
-            if (texMs > 3.0f) {
+            if (texMs > 50.0f) {
                LOG_WARNING("spawnCreature texture setup took ", texMs, "ms displayId=", displayId,
                            " hasPreDec=", hasPreDec, " extra=", dispData.extraDisplayId);
            }
@ -6804,9 +6832,10 @@ void Application::spawnOnlineGameObject(uint64_t guid, uint32_t entry, uint32_t
             " displayId=", displayId, " at (", x, ", ", y, ", ", z, ")");
 }

-void Application::processAsyncCreatureResults() {
+void Application::processAsyncCreatureResults(bool unlimited) {
    // Check completed async model loads and finalize on main thread (GPU upload + instance creation).
    // Limit GPU model uploads per frame to avoid spikes, but always drain cheap bookkeeping.
+    // In unlimited mode (load screen), process all pending uploads without cap.
    static constexpr int kMaxModelUploadsPerFrame = 1;
    int modelUploads = 0;

@ -6819,9 +6848,7 @@ void Application::processAsyncCreatureResults() {

        // Peek: if this result needs a NEW model upload (not cached) and we've hit
        // the upload budget, defer to next frame without consuming the future.
-        if (modelUploads >= kMaxModelUploadsPerFrame) {
-            // Check if this displayId already has a cached model (cheap spawn, no GPU upload).
-            // We can't peek the displayId without getting the future, so just break.
+        if (!unlimited && modelUploads >= kMaxModelUploadsPerFrame) {
            break;
        }

@ -6864,7 +6891,7 @@ void Application::processAsyncCreatureResults() {
        {
            auto uploadEnd = std::chrono::steady_clock::now();
            float uploadMs = std::chrono::duration<float, std::milli>(uploadEnd - uploadStart).count();
-            if (uploadMs > 3.0f) {
+            if (uploadMs > 100.0f) {
                LOG_WARNING("charRenderer->loadModel took ", uploadMs, "ms displayId=", result.displayId,
                            " preDecoded=", result.predecodedTextures.size());
            }
@ -6967,17 +6994,18 @@ void Application::processAsyncNpcCompositeResults() {
    }
 }

-void Application::processCreatureSpawnQueue() {
+void Application::processCreatureSpawnQueue(bool unlimited) {
    auto startTime = std::chrono::steady_clock::now();
    // Budget: max 2ms per frame for creature spawning to prevent stutter.
+    // In unlimited mode (load screen), process everything without budget cap.
    static constexpr float kSpawnBudgetMs = 2.0f;

    // First, finalize any async model loads that completed on background threads.
-    processAsyncCreatureResults();
+    processAsyncCreatureResults(unlimited);
    {
        auto now = std::chrono::steady_clock::now();
        float asyncMs = std::chrono::duration<float, std::milli>(now - startTime).count();
-        if (asyncMs > 3.0f) {
+        if (asyncMs > 100.0f) {
            LOG_WARNING("processAsyncCreatureResults took ", asyncMs, "ms");
        }
    }
@ -6992,11 +7020,11 @@ void Application::processCreatureSpawnQueue() {
    int asyncLaunched = 0;
    size_t rotationsLeft = pendingCreatureSpawns_.size();
    while (!pendingCreatureSpawns_.empty() &&
-           processed < MAX_SPAWNS_PER_FRAME &&
+           (unlimited || processed < MAX_SPAWNS_PER_FRAME) &&
           rotationsLeft > 0) {
        // Check time budget every iteration (including first — async results may
        // have already consumed the budget via GPU model uploads).
-        {
+        if (!unlimited) {
            auto now = std::chrono::steady_clock::now();
            float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
            if (elapsedMs >= kSpawnBudgetMs) break;
@ -7017,7 +7045,8 @@ void Application::processCreatureSpawnQueue() {

        // For new models: launch async load on background thread instead of blocking.
        if (needsNewModel) {
-            if (static_cast<int>(asyncCreatureLoads_.size()) + asyncLaunched >= MAX_ASYNC_CREATURE_LOADS) {
+            const int maxAsync = unlimited ? (MAX_ASYNC_CREATURE_LOADS * 4) : MAX_ASYNC_CREATURE_LOADS;
+            if (static_cast<int>(asyncCreatureLoads_.size()) + asyncLaunched >= maxAsync) {
                // Too many in-flight — defer to next frame
                pendingCreatureSpawns_.push_back(s);
                rotationsLeft--;
@ -7273,7 +7302,7 @@ void Application::processCreatureSpawnQueue() {
            spawnOnlineCreature(s.guid, s.displayId, s.x, s.y, s.z, s.orientation);
            auto spawnEnd = std::chrono::steady_clock::now();
            float spawnMs = std::chrono::duration<float, std::milli>(spawnEnd - spawnStart).count();
-            if (spawnMs > 3.0f) {
+            if (spawnMs > 100.0f) {
                LOG_WARNING("spawnOnlineCreature took ", spawnMs, "ms displayId=", s.displayId);
            }
        }
--- a/src/core/window.cpp
+++ b/src/core/window.cpp
@ -84,6 +84,7 @@ bool Window::initialize() {

    // Initialize Vulkan context
    vkContext = std::make_unique<rendering::VkContext>();
+    vkContext->setVsync(vsync);
    if (!vkContext->initialize(window)) {
        LOG_ERROR("Failed to initialize Vulkan context");
        return false;
@ -158,11 +159,13 @@ void Window::setFullscreen(bool enable) {
    }
 }

-void Window::setVsync([[maybe_unused]] bool enable) {
-    // VSync in Vulkan is controlled by present mode (set at swapchain creation)
-    // For now, store the preference — applied on next swapchain recreation
+void Window::setVsync(bool enable) {
    vsync = enable;
-    LOG_INFO("VSync preference set to ", enable ? "on" : "off", " (applied on swapchain recreation)");
+    if (vkContext) {
+        vkContext->setVsync(enable);
+        vkContext->markSwapchainDirty();
+    }
+    LOG_INFO("VSync ", enable ? "enabled" : "disabled");
 }

 void Window::applyResolution(int w, int h) {
--- a/src/game/game_handler.cpp
+++ b/src/game/game_handler.cpp
@ -11435,6 +11435,25 @@ void GameHandler::unstuckGy() {
    }
 }

+// Teleports the player to the hearthstone bind point through the registered
+// callback and reports the outcome in system chat.
+void GameHandler::unstuckHearth() {
+    if (!unstuckHearthCallback_) {
+        addSystemChatMessage("Unstuck: hearthstone teleport is unavailable.");
+        return;
+    }
+    // The callback's result is not observable here, so check the bind point first;
+    // otherwise chat would announce a teleport that never happened.
+    uint32_t bindMap = 0;
+    glm::vec3 bindPos(0.0f);
+    if (!getHomeBind(bindMap, bindPos)) {
+        addSystemChatMessage("No hearthstone bind point set.");
+        return;
+    }
+    unstuckHearthCallback_();
+    addSystemChatMessage("Unstuck: teleported to hearthstone location.");
+}
+
 void GameHandler::handleLootResponse(network::Packet& packet) {
    if (!LootResponseParser::parse(packet, currentLoot)) return;
    lootWindowOpen = true;
--- a/src/rendering/camera.cpp
+++ b/src/rendering/camera.cpp
@ -20,6 +20,13 @@ void Camera::updateProjectionMatrix() {
    projectionMatrix = glm::perspective(glm::radians(fov), aspectRatio, nearPlane, farPlane);
    // Vulkan clip-space has Y pointing down; flip the projection's Y axis.
    projectionMatrix[1][1] *= -1.0f;
+    unjitteredProjectionMatrix = projectionMatrix;
+
+    // Re-apply jitter if active
+    if (jitterOffset.x != 0.0f || jitterOffset.y != 0.0f) {
+        projectionMatrix[2][0] += jitterOffset.x;
+        projectionMatrix[2][1] += jitterOffset.y;
+    }
 }

 glm::vec3 Camera::getForward() const {
@ -40,6 +47,21 @@ glm::vec3 Camera::getUp() const {
    return glm::normalize(glm::cross(getRight(), getForward()));
 }

+void Camera::setJitter(float jx, float jy) {
+    // Swap the stored offset: back the previous one out of projectionMatrix[2][0..1],
+    // then add the new one and remember it for the next call.
+    const glm::vec2 previous = jitterOffset;
+    jitterOffset = glm::vec2(jx, jy);
+    projectionMatrix[2][0] = (projectionMatrix[2][0] - previous.x) + jitterOffset.x;
+    projectionMatrix[2][1] = (projectionMatrix[2][1] - previous.y) + jitterOffset.y;
+}
+
+void Camera::clearJitter() {
+    // Back the stored offset out of projectionMatrix[2][0..1], then forget it.
+    projectionMatrix[2] -= glm::vec4(jitterOffset, 0.0f, 0.0f);
+    jitterOffset = glm::vec2(0.0f);
+}
+
 Ray Camera::screenToWorldRay(float screenX, float screenY, float screenW, float screenH) const {
    float ndcX = (2.0f * screenX / screenW) - 1.0f;
    // Vulkan Y-flip is baked into projectionMatrix, so NDC Y maps directly:
--- a/src/rendering/camera_controller.cpp
+++ b/src/rendering/camera_controller.cpp
@ -1,5 +1,6 @@
 #include "rendering/camera_controller.hpp"
 #include <algorithm>
+#include <future>
 #include <imgui.h>
 #include "rendering/terrain_manager.hpp"
 #include "rendering/wmo_renderer.hpp"
@ -808,25 +809,53 @@ void CameraController::update(float deltaTime) {
                if (useCached) {
                    groundH = cachedFloorHeight_;
                } else {
-                    // Full collision check
+                    // Full collision check — run terrain/WMO/M2 queries in parallel
                    std::optional<float> terrainH;
                    std::optional<float> wmoH;
                    std::optional<float> m2H;
-                    if (terrainManager) {
-                        terrainH = terrainManager->getHeightAt(targetPos.x, targetPos.y);
-                    }
                    // When airborne, anchor probe to last ground level so the
                    // ceiling doesn't rise with the jump and catch roof geometry.
                    float wmoBaseZ = grounded ? std::max(targetPos.z, lastGroundZ) : lastGroundZ;
                    float wmoProbeZ = wmoBaseZ + stepUpBudget + 0.5f;
                    float wmoNormalZ = 1.0f;
+
+                    // Launch WMO + M2 floor queries asynchronously while terrain runs on this thread.
+                    // Collision scratch buffers are thread_local so concurrent calls are safe.
+                    using FloorResult = std::pair<std::optional<float>, float>;
+                    std::future<FloorResult> wmoFuture;
+                    std::future<FloorResult> m2Future;
+                    bool wmoAsync = false, m2Async = false;
+                    float px = targetPos.x, py = targetPos.y;
                    if (wmoRenderer) {
-                        wmoH = wmoRenderer->getFloorHeight(targetPos.x, targetPos.y, wmoProbeZ, &wmoNormalZ);
+                        wmoAsync = true;
+                        wmoFuture = std::async(std::launch::async,
+                            [this, px, py, wmoProbeZ]() -> FloorResult {
+                                float nz = 1.0f;
+                                auto h = wmoRenderer->getFloorHeight(px, py, wmoProbeZ, &nz);
+                                return {h, nz};
+                            });
                    }
                    if (m2Renderer && !externalFollow_) {
-                        float m2NormalZ = 1.0f;
-                        m2H = m2Renderer->getFloorHeight(targetPos.x, targetPos.y, wmoProbeZ, &m2NormalZ);
-                        if (m2H && m2NormalZ < MIN_WALKABLE_NORMAL_M2) {
+                        m2Async = true;
+                        m2Future = std::async(std::launch::async,
+                            [this, px, py, wmoProbeZ]() -> FloorResult {
+                                float nz = 1.0f;
+                                auto h = m2Renderer->getFloorHeight(px, py, wmoProbeZ, &nz);
+                                return {h, nz};
+                            });
+                    }
+                    if (terrainManager) {
+                        terrainH = terrainManager->getHeightAt(targetPos.x, targetPos.y);
+                    }
+                    if (wmoAsync) {
+                        auto [h, nz] = wmoFuture.get();
+                        wmoH = h;
+                        wmoNormalZ = nz;
+                    }
+                    if (m2Async) {
+                        auto [h, nz] = m2Future.get();
+                        m2H = h;
+                        if (m2H && nz < MIN_WALKABLE_NORMAL_M2) {
                            m2H = std::nullopt;
                        }
                    }
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@ -332,6 +332,11 @@ void CharacterRenderer::shutdown() {
    LOG_INFO("CharacterRenderer::shutdown instances=", instances.size(),
             " models=", models.size(), " override=", (void*)renderPassOverride_);

+    // Wait for in-flight background normal map threads; acquire (not relaxed) so the wait can order their writes before teardown
+    while (pendingNormalMapCount_.load(std::memory_order_acquire) > 0) {  // TODO confirm the worker-side decrement is release or stronger
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
+
    vkDeviceWaitIdle(vkCtx_->getDevice());
    VkDevice device = vkCtx_->getDevice();
    VmaAllocator alloc = vkCtx_->getAllocator();
@ -413,6 +418,16 @@ void CharacterRenderer::clear() {
    LOG_INFO("CharacterRenderer::clear instances=", instances.size(),
             " models=", models.size());

+    // Wait for any in-flight background normal map generation threads
+    while (pendingNormalMapCount_.load(std::memory_order_relaxed) > 0) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
+    // Discard any completed results that haven't been uploaded
+    {
+        std::lock_guard<std::mutex> lock(normalMapResultsMutex_);
+        completedNormalMaps_.clear();
+    }
+
    vkDeviceWaitIdle(vkCtx_->getDevice());
    VkDevice device = vkCtx_->getDevice();

@ -509,7 +524,32 @@ std::unique_ptr<VkTexture> CharacterRenderer::generateNormalHeightMap(
        const uint8_t* pixels, uint32_t width, uint32_t height, float& outVariance) {
    if (!vkCtx_ || width == 0 || height == 0) return nullptr;

+    // Use the CPU-only static method, then upload to GPU
+    std::vector<uint8_t> dummy(width * height * 4);
+    std::memcpy(dummy.data(), pixels, dummy.size());
+    auto result = generateNormalHeightMapCPU("", std::move(dummy), width, height);
+    outVariance = result.variance;
+
+    auto tex = std::make_unique<VkTexture>();
+    if (!tex->upload(*vkCtx_, result.pixels.data(), width, height, VK_FORMAT_R8G8B8A8_UNORM, true)) {
+        return nullptr;
+    }
+    tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
+                        VK_SAMPLER_ADDRESS_MODE_REPEAT);
+    return tex;
+}
+
+// Static, thread-safe CPU-only normal map generation (no GPU access)
+CharacterRenderer::NormalMapResult CharacterRenderer::generateNormalHeightMapCPU(
+        std::string cacheKey, std::vector<uint8_t> srcPixels, uint32_t width, uint32_t height) {
+    NormalMapResult result;
+    result.cacheKey = std::move(cacheKey);
+    result.width = width;
+    result.height = height;
+    result.variance = 0.0f;
+
    const uint32_t totalPixels = width * height;
+    const uint8_t* pixels = srcPixels.data();

    // Step 1: Compute height from luminance
    std::vector<float> heightMap(totalPixels);
@ -524,7 +564,7 @@ std::unique_ptr<VkTexture> CharacterRenderer::generateNormalHeightMap(
        sumH2 += h * h;
    }
    double mean = sumH / totalPixels;
-    outVariance = static_cast<float>(sumH2 / totalPixels - mean * mean);
+    result.variance = static_cast<float>(sumH2 / totalPixels - mean * mean);

    // Step 1.5: Box blur the height map to reduce noise from diffuse textures
    auto wrapSample = [&](const std::vector<float>& map, int x, int y) -> float {
@ -545,11 +585,9 @@ std::unique_ptr<VkTexture> CharacterRenderer::generateNormalHeightMap(
        }
    }

-    // Step 2: Sobel 3x3 → normal map (crisp detail from original, blurred for POM alpha)
-    // Higher strength than WMO (2.0) because character/weapon textures are hand-painted
-    // with baked-in lighting that produces low-contrast gradients in the Sobel filter.
+    // Step 2: Sobel 3x3 → normal map
    const float strength = 5.0f;
-    std::vector<uint8_t> output(totalPixels * 4);
+    result.pixels.resize(totalPixels * 4);

    auto sampleH = [&](int x, int y) -> float {
        x = ((x % (int)width) + (int)width) % (int)width;
@ -573,20 +611,14 @@ std::unique_ptr<VkTexture> CharacterRenderer::generateNormalHeightMap(
            if (len > 0.0f) { nx /= len; ny /= len; nz /= len; }

            uint32_t idx = (y * width + x) * 4;
-            output[idx + 0] = static_cast<uint8_t>(std::clamp((nx * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
-            output[idx + 1] = static_cast<uint8_t>(std::clamp((ny * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
-            output[idx + 2] = static_cast<uint8_t>(std::clamp((nz * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
-            output[idx + 3] = static_cast<uint8_t>(std::clamp(blurredHeight[y * width + x] * 255.0f, 0.0f, 255.0f));
+            result.pixels[idx + 0] = static_cast<uint8_t>(std::clamp((nx * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
+            result.pixels[idx + 1] = static_cast<uint8_t>(std::clamp((ny * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
+            result.pixels[idx + 2] = static_cast<uint8_t>(std::clamp((nz * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
+            result.pixels[idx + 3] = static_cast<uint8_t>(std::clamp(blurredHeight[y * width + x] * 255.0f, 0.0f, 255.0f));
        }
    }

-    auto tex = std::make_unique<VkTexture>();
-    if (!tex->upload(*vkCtx_, output.data(), width, height, VK_FORMAT_R8G8B8A8_UNORM, true)) {
-        return nullptr;
-    }
-    tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
-                        VK_SAMPLER_ADDRESS_MODE_REPEAT);
-    return tex;
+    return result;
 }

 VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
@ -687,15 +719,22 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
    e.hasAlpha = hasAlpha;
    e.colorKeyBlack = colorKeyBlackHint;

-    // Defer normal/height map generation to avoid stalling loadModel.
-    // Normal maps are generated in processPendingNormalMaps() at a per-frame budget.
+    // Launch normal map generation on background thread — CPU work is pure compute,
+    // only the GPU upload (in processPendingNormalMaps) needs the main thread (~1-2ms).
    if (blpImage.width >= 32 && blpImage.height >= 32) {
-        PendingNormalMap pending;
-        pending.cacheKey = key;
-        pending.pixels.assign(blpImage.data.begin(), blpImage.data.end());
-        pending.width = blpImage.width;
-        pending.height = blpImage.height;
-        pendingNormalMaps_.push_back(std::move(pending));
+        uint32_t w = blpImage.width, h = blpImage.height;
+        std::string ck = key;
+        std::vector<uint8_t> px(blpImage.data.begin(), blpImage.data.end());
+        pendingNormalMapCount_.fetch_add(1, std::memory_order_relaxed);
+        auto* self = this;
+        std::thread([self, ck = std::move(ck), px = std::move(px), w, h]() mutable {
+            auto result = generateNormalHeightMapCPU(std::move(ck), std::move(px), w, h);
+            {
+                std::lock_guard<std::mutex> lock(self->normalMapResultsMutex_);
+                self->completedNormalMaps_.push_back(std::move(result));
+            }
+            self->pendingNormalMapCount_.fetch_sub(1, std::memory_order_relaxed);
+        }).detach();
        e.normalMapPending = true;
    }

@ -709,30 +748,39 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
 }

 void CharacterRenderer::processPendingNormalMaps(int budget) {
-    if (pendingNormalMaps_.empty() || !vkCtx_) return;
+    if (!vkCtx_) return;

-    int processed = 0;
-    while (!pendingNormalMaps_.empty() && processed < budget) {
-        auto pending = std::move(pendingNormalMaps_.front());
-        pendingNormalMaps_.pop_front();
+    // Collect completed results from background threads
+    std::deque<NormalMapResult> ready;
+    {
+        std::lock_guard<std::mutex> lock(normalMapResultsMutex_);
+        if (completedNormalMaps_.empty()) return;
+        int count = std::min(budget, static_cast<int>(completedNormalMaps_.size()));
+        for (int i = 0; i < count; i++) {
+            ready.push_back(std::move(completedNormalMaps_.front()));
+            completedNormalMaps_.pop_front();
+        }
+    }

-        auto it = textureCache.find(pending.cacheKey);
+    // GPU upload only (~1-2ms each) — CPU work already done on background thread
+    for (auto& result : ready) {
+        auto it = textureCache.find(result.cacheKey);
        if (it == textureCache.end()) continue;  // texture was evicted

-        float nhVariance = 0.0f;
        vkCtx_->beginUploadBatch();
-        auto nhMap = generateNormalHeightMap(pending.pixels.data(),
-            pending.width, pending.height, nhVariance);
-        vkCtx_->endUploadBatch();
-
-        if (nhMap) {
-            it->second.heightMapVariance = nhVariance;
-            it->second.approxBytes += approxTextureBytesWithMips(pending.width, pending.height);
-            textureCacheBytes_ += approxTextureBytesWithMips(pending.width, pending.height);
-            it->second.normalHeightMap = std::move(nhMap);
+        auto tex = std::make_unique<VkTexture>();
+        bool ok = tex->upload(*vkCtx_, result.pixels.data(), result.width, result.height,
+                              VK_FORMAT_R8G8B8A8_UNORM, true);
+        if (ok) {
+            tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
+                               VK_SAMPLER_ADDRESS_MODE_REPEAT);
+            it->second.heightMapVariance = result.variance;
+            it->second.approxBytes += approxTextureBytesWithMips(result.width, result.height);
+            textureCacheBytes_ += approxTextureBytesWithMips(result.width, result.height);
+            it->second.normalHeightMap = std::move(tex);
        }
+        vkCtx_->endUploadBatch();
        it->second.normalMapPending = false;
-        processed++;
    }
 }

@ -1876,6 +1924,61 @@ glm::mat4 CharacterRenderer::getBoneTransform(const pipeline::M2Bone& bone, floa

 // --- Rendering ---

+/// Ensures every instance that has bone matrices owns a bone storage buffer and
+/// a descriptor set pointing at it for the given frame slot.
+///
+/// @param frameIndex  Index into the per-instance boneBuffer / boneAlloc /
+///                    boneMapped / boneSet arrays.
+///
+/// If either the buffer or the descriptor set cannot be allocated, the slot is
+/// left fully cleared so the allocation is attempted again on the next call.
+void CharacterRenderer::prepareRender(uint32_t frameIndex) {
+    if (instances.empty() || !opaquePipeline_) return;
+
+    // Pre-allocate bone SSBOs + descriptor sets on main thread (pool ops not thread-safe)
+    for (auto& [id, instance] : instances) {
+        int numBones = std::min(static_cast<int>(instance.boneMatrices.size()), MAX_BONES);
+        if (numBones <= 0) continue;
+
+        // Slot already provisioned for this frame index.
+        if (instance.boneBuffer[frameIndex]) continue;
+
+        VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
+        bci.size = MAX_BONES * sizeof(glm::mat4);
+        bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+        VmaAllocationCreateInfo aci{};
+        aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+        aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+        VmaAllocationInfo allocInfo{};
+        VkResult bufRes = vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci,
+                                          &instance.boneBuffer[frameIndex],
+                                          &instance.boneAlloc[frameIndex], &allocInfo);
+        if (bufRes != VK_SUCCESS) {
+            // Without this check a descriptor set would be allocated and written
+            // with a null buffer on every call, draining the descriptor pool.
+            LOG_ERROR("CharacterRenderer::prepareRender: bone buffer alloc failed (instance=",
+                      id, ", frame=", frameIndex, ", vk=", static_cast<int>(bufRes), ")");
+            instance.boneBuffer[frameIndex] = VK_NULL_HANDLE;
+            instance.boneAlloc[frameIndex] = VK_NULL_HANDLE;
+            instance.boneMapped[frameIndex] = nullptr;
+            continue;
+        }
+        instance.boneMapped[frameIndex] = allocInfo.pMappedData;
+
+        VkDescriptorSetAllocateInfo ai{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
+        ai.descriptorPool = boneDescPool_;
+        ai.descriptorSetCount = 1;
+        ai.pSetLayouts = &boneSetLayout_;
+        VkResult dsRes = vkAllocateDescriptorSets(vkCtx_->getDevice(), &ai, &instance.boneSet[frameIndex]);
+        if (dsRes != VK_SUCCESS) {
+            LOG_ERROR("CharacterRenderer::prepareRender: bone descriptor alloc failed (instance=",
+                      id, ", frame=", frameIndex, ", vk=", static_cast<int>(dsRes), ")");
+            // Roll back the buffer so the whole slot is retried together later.
+            vmaDestroyBuffer(vkCtx_->getAllocator(),
+                             instance.boneBuffer[frameIndex], instance.boneAlloc[frameIndex]);
+            instance.boneBuffer[frameIndex] = VK_NULL_HANDLE;
+            instance.boneAlloc[frameIndex] = VK_NULL_HANDLE;
+            instance.boneMapped[frameIndex] = nullptr;
+            instance.boneSet[frameIndex] = VK_NULL_HANDLE;
+            continue;
+        }
+
+        // Point binding 0 of the new set at the whole bone buffer.
+        VkDescriptorBufferInfo bufInfo{};
+        bufInfo.buffer = instance.boneBuffer[frameIndex];
+        bufInfo.offset = 0;
+        bufInfo.range = bci.size;
+        VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
+        write.dstSet = instance.boneSet[frameIndex];
+        write.dstBinding = 0;
+        write.descriptorCount = 1;
+        write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+        write.pBufferInfo = &bufInfo;
+        vkUpdateDescriptorSets(vkCtx_->getDevice(), 1, &write, 0, nullptr);
+    }
+}
+
 void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, [[maybe_unused]] const Camera& camera) {
    if (instances.empty() || !opaquePipeline_) {
        return;
--- a/src/rendering/loading_screen.cpp
+++ b/src/rendering/loading_screen.cpp
@ -240,6 +240,66 @@ bool LoadingScreen::loadImage(const std::string& path) {
    return true;
 }

+void LoadingScreen::renderOverlay() {
+    // Emits the loading screen (background image, progress bar, percentage and
+    // status line) as a full-display ImGui window inside a frame the caller has
+    // already begun. Caller is responsible for ImGui NewFrame/Render and Vulkan
+    // frame management.
+    const ImVec2 display = ImGui::GetIO().DisplaySize;
+    const float viewW = display.x;
+    const float viewH = display.y;
+
+    const ImGuiWindowFlags overlayFlags =
+        ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoResize |
+        ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoScrollbar |
+        ImGuiWindowFlags_NoInputs | ImGuiWindowFlags_NoBackground |
+        ImGuiWindowFlags_NoBringToFrontOnFocus;
+
+    ImGui::SetNextWindowPos(ImVec2(0, 0));
+    ImGui::SetNextWindowSize(ImVec2(viewW, viewH));
+    ImGui::Begin("##LoadingScreenOverlay", nullptr, overlayFlags);
+
+    ImDrawList* canvas = ImGui::GetWindowDrawList();
+
+    // Background image stretched over the whole display (only if one is loaded).
+    if (bgDescriptorSet) {
+        canvas->AddImage(reinterpret_cast<ImTextureID>(bgDescriptorSet),
+                         ImVec2(0, 0), ImVec2(viewW, viewH));
+    }
+
+    // Progress bar: dark track, gold fill proportional to loadProgress, thin border.
+    const float barWidthFrac = 0.6f;
+    const float barHeight = 6.0f;
+    const float barY = viewH * 0.06f;
+    const float barX = viewW * (0.5f - barWidthFrac * 0.5f);
+    const float barW = viewW * barWidthFrac;
+    const ImVec2 trackMin(barX, barY);
+
+    canvas->AddRectFilled(trackMin, ImVec2(barX + barW, barY + barHeight),
+                          IM_COL32(25, 25, 25, 200), 2.0f);
+    if (loadProgress > 0.001f) {
+        canvas->AddRectFilled(trackMin, ImVec2(barX + barW * loadProgress, barY + barHeight),
+                              IM_COL32(199, 156, 33, 255), 2.0f);
+    }
+    canvas->AddRect(ImVec2(barX - 1, barY - 1), ImVec2(barX + barW + 1, barY + barHeight + 1),
+                    IM_COL32(140, 110, 25, 255), 2.0f);
+
+    // Draws one black text line horizontally centred at window-local height y.
+    auto drawCentered = [viewW](const char* text, float y) {
+        const ImVec2 extent = ImGui::CalcTextSize(text);
+        ImGui::SetCursorPos(ImVec2((viewW - extent.x) * 0.5f, y));
+        ImGui::TextColored(ImVec4(0.0f, 0.0f, 0.0f, 1.0f), "%s", text);
+    };
+
+    // Percentage label just above the bar.
+    char pctBuf[32];
+    snprintf(pctBuf, sizeof(pctBuf), "%d%%", static_cast<int>(loadProgress * 100.0f));
+    drawCentered(pctBuf, viewH * 0.06f - 20.0f);
+
+    // Status line just below the bar.
+    drawCentered(statusText.c_str(), viewH * 0.06f + 14.0f);
+
+    ImGui::End();
+}
+
 void LoadingScreen::render() {
    // If a frame is already in progress (e.g. called from a UI callback),
    // end it before starting our own
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@ -282,6 +282,14 @@ glm::vec3 closestPointOnTriangle(const glm::vec3& p,

 } // namespace

+// Thread-local scratch buffers for collision queries (allows concurrent getFloorHeight calls)
+static thread_local std::vector<size_t> tl_m2_candidateScratch;
+static thread_local std::unordered_set<uint32_t> tl_m2_candidateIdScratch;
+static thread_local std::vector<uint32_t> tl_m2_collisionTriScratch;
+
+// Forward declaration (defined after animation helpers)
+static void computeBoneMatrices(const M2ModelGPU& model, M2Instance& instance);
+
 void M2Instance::updateModelMatrix() {
    modelMatrix = glm::mat4(1.0f);
    modelMatrix = glm::translate(modelMatrix, position);
@ -1028,10 +1036,9 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
            (lowerName.find("trunk") != std::string::npos) ||
            (lowerName.find("stump") != std::string::npos) ||
            (lowerName.find("log") != std::string::npos);
-        // Only large trees (canopy > 20 model units wide) get trunk collision.
-        // Small/mid trees are walkthrough to avoid getting stuck between them.
-        // Only large trees get trunk collision; all smaller trees are walkthrough.
-        bool treeWithTrunk = treeLike && !hardTreePart && !foliageName && horiz > 40.0f;
+        // Trees with visible trunks get collision. Threshold: canopy wider than 6
+        // model units AND taller than 4 units (filters out small bushes/saplings).
+        bool treeWithTrunk = treeLike && !hardTreePart && !foliageName && horiz > 6.0f && vert > 4.0f;
        bool softTree = treeLike && !hardTreePart && !treeWithTrunk;
        bool forceSolidCurb = gpuModel.collisionSteppedLowPlatform || knownStormwindPlanter || likelyCurbName || gpuModel.collisionPlanter;
        bool narrowVerticalName =
@ -1602,6 +1609,12 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
        }
    }

+    // Pre-compute available LOD levels to avoid per-instance batch iteration
+    gpuModel.availableLODs = 0;
+    for (const auto& b : gpuModel.batches) {
+        if (b.submeshLevel < 8) gpuModel.availableLODs |= (1u << b.submeshLevel);
+    }
+
    models[modelId] = std::move(gpuModel);

    LOG_DEBUG("Loaded M2 model: ", model.name, " (", models[modelId].vertexCount, " vertices, ",
@ -1667,6 +1680,21 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
        instance.animDuration = static_cast<float>(mdl.sequences[0].duration);
        instance.animTime = static_cast<float>(rand() % std::max(1u, mdl.sequences[0].duration));
        instance.variationTimer = 3000.0f + static_cast<float>(rand() % 8000);
+
+        // Seed bone matrices from an existing instance of the same model so the
+        // new instance renders immediately instead of being invisible until the
+        // next update() computes bones (prevents pop-in flash).
+        for (const auto& existing : instances) {
+            if (existing.modelId == modelId && !existing.boneMatrices.empty()) {
+                instance.boneMatrices = existing.boneMatrices;
+                instance.bonesDirty[0] = instance.bonesDirty[1] = true;
+                break;
+            }
+        }
+        // If no sibling exists yet, compute bones immediately
+        if (instance.boneMatrices.empty()) {
+            computeBoneMatrices(mdlRef, instance);
+        }
    }

    // Register in dedup map before pushing (uses original position, not ground-adjusted)
@ -1758,6 +1786,18 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
        instance.animDuration = static_cast<float>(mdl2.sequences[0].duration);
        instance.animTime = static_cast<float>(rand() % std::max(1u, mdl2.sequences[0].duration));
        instance.variationTimer = 3000.0f + static_cast<float>(rand() % 8000);
+
+        // Seed bone matrices from an existing sibling so the instance renders immediately
+        for (const auto& existing : instances) {
+            if (existing.modelId == modelId && !existing.boneMatrices.empty()) {
+                instance.boneMatrices = existing.boneMatrices;
+                instance.bonesDirty[0] = instance.bonesDirty[1] = true;
+                break;
+            }
+        }
+        if (instance.boneMatrices.empty()) {
+            computeBoneMatrices(mdl2, instance);
+        }
    } else {
        instance.animTime = static_cast<float>(rand()) / RAND_MAX * 10000.0f;
    }
@ -1911,6 +1951,7 @@ static void computeBoneMatrices(const M2ModelGPU& model, M2Instance& instance) {
            instance.boneMatrices[i] = local;
        }
    }
+    instance.bonesDirty[0] = instance.bonesDirty[1] = true;
 }

 void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::mat4& viewProjection) {
@ -2172,6 +2213,53 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::

 }

+/// Ensures every animated instance that has bone matrices owns a bone storage
+/// buffer and a descriptor set pointing at it for the given frame slot.
+///
+/// @param frameIndex  Index into the per-instance boneBuffer / boneAlloc /
+///                    boneMapped / boneSet / bonesDirty arrays.
+/// @param camera      Currently unused.
+///
+/// Failed allocations are retried on later calls: a failed buffer creation
+/// leaves the slot cleared, and a missing descriptor set is re-requested even
+/// when the buffer already exists.
+void M2Renderer::prepareRender(uint32_t frameIndex, const Camera& camera) {
+    if (!initialized_ || instances.empty()) return;
+    (void)camera;  // reserved for future frustum-based culling
+
+    // Room for 128 bone matrices per buffer.
+    constexpr VkDeviceSize kBoneBufferBytes = 128 * sizeof(glm::mat4);
+
+    // Pre-allocate bone SSBOs + descriptor sets on main thread (pool ops not thread-safe).
+    // Only iterate animated instances — static doodads don't need bone buffers.
+    for (size_t idx : animatedInstanceIndices_) {
+        if (idx >= instances.size()) continue;
+        auto& instance = instances[idx];
+
+        if (instance.boneMatrices.empty()) continue;
+
+        const bool needsBuffer = !instance.boneBuffer[frameIndex];
+        if (needsBuffer) {
+            VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
+            bci.size = kBoneBufferBytes;
+            bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+            VmaAllocationCreateInfo aci{};
+            aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+            aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+            VmaAllocationInfo allocInfo{};
+            VkResult bufRes = vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci,
+                                              &instance.boneBuffer[frameIndex],
+                                              &instance.boneAlloc[frameIndex], &allocInfo);
+            if (bufRes != VK_SUCCESS) {
+                // Do not go on to allocate/write a descriptor for a null buffer;
+                // leave the slot cleared so the next call tries again.
+                LOG_ERROR("M2Renderer::prepareRender: bone buffer alloc failed (index=",
+                          idx, ", model=", instance.modelId, ", frame=", frameIndex,
+                          ", vk=", static_cast<int>(bufRes), ")");
+                instance.boneBuffer[frameIndex] = VK_NULL_HANDLE;
+                instance.boneAlloc[frameIndex] = VK_NULL_HANDLE;
+                instance.boneMapped[frameIndex] = nullptr;
+                continue;
+            }
+            instance.boneMapped[frameIndex] = allocInfo.pMappedData;
+
+            // Force dirty so current boneMatrices get copied into this
+            // newly-allocated buffer during render (prevents garbage/zero
+            // bones when the other frame index already cleared bonesDirty).
+            instance.bonesDirty[frameIndex] = true;
+        }
+
+        // A fresh buffer always gets a fresh set (as before). Additionally, a
+        // slot whose earlier set allocation failed is retried here instead of
+        // being left without a set forever.
+        if (needsBuffer || !instance.boneSet[frameIndex]) {
+            instance.boneSet[frameIndex] = allocateBoneSet();
+            if (!instance.boneSet[frameIndex]) continue;
+
+            VkDescriptorBufferInfo bufInfo{};
+            bufInfo.buffer = instance.boneBuffer[frameIndex];
+            bufInfo.offset = 0;
+            bufInfo.range = kBoneBufferBytes;
+            VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
+            write.dstSet = instance.boneSet[frameIndex];
+            write.dstBinding = 0;
+            write.descriptorCount = 1;
+            write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+            write.pBufferInfo = &bufInfo;
+            vkUpdateDescriptorSets(vkCtx_->getDevice(), 1, &write, 0, nullptr);
+        }
+    }
+}
+
 void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera) {
    if (instances.empty() || !opaquePipeline_) {
        return;
@ -2254,8 +2342,8 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
    }

    // Sort by modelId to minimize vertex/index buffer rebinds
-    std::stable_sort(sortedVisible_.begin(), sortedVisible_.end(),
-                     [](const VisibleEntry& a, const VisibleEntry& b) { return a.modelId < b.modelId; });
+    std::sort(sortedVisible_.begin(), sortedVisible_.end(),
+              [](const VisibleEntry& a, const VisibleEntry& b) { return a.modelId < b.modelId; });

    uint32_t currentModelId = UINT32_MAX;
    const M2ModelGPU* currentModel = nullptr;
@ -2330,44 +2418,26 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
            }
        }

-        // Upload bone matrices to SSBO if model has skeletal animation
-        bool useBones = model.hasAnimation && !model.disableAnimation && !instance.boneMatrices.empty();
+        // Upload bone matrices to SSBO if model has skeletal animation.
+        // Skip animated instances entirely until bones are computed + buffers allocated
+        // to prevent bind-pose/T-pose flash on first appearance.
+        bool modelNeedsAnimation = model.hasAnimation && !model.disableAnimation;
+        if (modelNeedsAnimation && instance.boneMatrices.empty()) {
+            continue;  // Bones not yet computed — skip to avoid bind-pose flash
+        }
+        bool needsBones = modelNeedsAnimation && !instance.boneMatrices.empty();
+        if (needsBones && (!instance.boneBuffer[frameIndex] || !instance.boneSet[frameIndex])) {
+            continue;  // Bone buffers not yet allocated — skip to avoid bind-pose flash
+        }
+        bool useBones = needsBones;
        if (useBones) {
-            // Lazy-allocate bone SSBO on first use
-            if (!instance.boneBuffer[frameIndex]) {
-                VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
-                bci.size = 128 * sizeof(glm::mat4); // max 128 bones
-                bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
-                VmaAllocationCreateInfo aci{};
-                aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
-                aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
-                VmaAllocationInfo allocInfo{};
-                vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci,
-                                &instance.boneBuffer[frameIndex], &instance.boneAlloc[frameIndex], &allocInfo);
-                instance.boneMapped[frameIndex] = allocInfo.pMappedData;
-
-                // Allocate descriptor set for bone SSBO
-                instance.boneSet[frameIndex] = allocateBoneSet();
-                if (instance.boneSet[frameIndex]) {
-                    VkDescriptorBufferInfo bufInfo{};
-                    bufInfo.buffer = instance.boneBuffer[frameIndex];
-                    bufInfo.offset = 0;
-                    bufInfo.range = bci.size;
-                    VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
-                    write.dstSet = instance.boneSet[frameIndex];
-                    write.dstBinding = 0;
-                    write.descriptorCount = 1;
-                    write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
-                    write.pBufferInfo = &bufInfo;
-                    vkUpdateDescriptorSets(vkCtx_->getDevice(), 1, &write, 0, nullptr);
-                }
-            }
-
-            // Upload bone matrices
-            if (instance.boneMapped[frameIndex]) {
+            // Upload bone matrices only when recomputed (per-frame-index tracking
+            // ensures both double-buffered SSBOs get the latest bone data)
+            if (instance.bonesDirty[frameIndex] && instance.boneMapped[frameIndex]) {
                int numBones = std::min(static_cast<int>(instance.boneMatrices.size()), 128);
                memcpy(instance.boneMapped[frameIndex], instance.boneMatrices.data(),
                       numBones * sizeof(glm::mat4));
+                instance.bonesDirty[frameIndex] = false;
            }

            // Bind bone descriptor set (set 2)
@ -2384,12 +2454,8 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
        else if (entry.distSq > 40.0f * 40.0f) desiredLOD = 1;

        uint16_t targetLOD = desiredLOD;
-        if (desiredLOD > 0) {
-            bool hasDesiredLOD = false;
-            for (const auto& b : model.batches) {
-                if (b.submeshLevel == desiredLOD) { hasDesiredLOD = true; break; }
-            }
-            if (!hasDesiredLOD) targetLOD = 0;
+        if (desiredLOD > 0 && !(model.availableLODs & (1u << desiredLOD))) {
+            targetLOD = 0;
        }

        const bool foliageLikeModel = model.isFoliageLike;
@ -3597,7 +3663,7 @@ void M2Renderer::rebuildSpatialIndex() {
 void M2Renderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& queryMax,
                                  std::vector<size_t>& outIndices) const {
    outIndices.clear();
-    candidateIdScratch.clear();
+    tl_m2_candidateIdScratch.clear();

    GridCell minCell = toCell(queryMin);
    GridCell maxCell = toCell(queryMax);
@ -3607,7 +3673,7 @@ void M2Renderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& qu
                auto it = spatialGrid.find(GridCell{x, y, z});
                if (it == spatialGrid.end()) continue;
                for (uint32_t id : it->second) {
-                    if (!candidateIdScratch.insert(id).second) continue;
+                    if (!tl_m2_candidateIdScratch.insert(id).second) continue;
                    auto idxIt = instanceIndexById.find(id);
                    if (idxIt != instanceIndexById.end()) {
                        outIndices.push_back(idxIt->second);
@ -3780,9 +3846,9 @@ std::optional<float> M2Renderer::getFloorHeight(float glX, float glY, float glZ,

    glm::vec3 queryMin(glX - 2.0f, glY - 2.0f, glZ - 6.0f);
    glm::vec3 queryMax(glX + 2.0f, glY + 2.0f, glZ + 8.0f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_m2_candidateScratch);

-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_m2_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3804,14 +3870,14 @@ std::optional<float> M2Renderer::getFloorHeight(float glX, float glY, float glZ,
            model.collision.getFloorTrisInRange(
                localPos.x - 1.0f, localPos.y - 1.0f,
                localPos.x + 1.0f, localPos.y + 1.0f,
-                collisionTriScratch_);
+                tl_m2_collisionTriScratch);

            glm::vec3 rayOrigin(localPos.x, localPos.y, localPos.z + 5.0f);
            glm::vec3 rayDir(0.0f, 0.0f, -1.0f);
            float bestHitZ = -std::numeric_limits<float>::max();
            bool hitAny = false;

-            for (uint32_t ti : collisionTriScratch_) {
+            for (uint32_t ti : tl_m2_collisionTriScratch) {
                if (ti >= model.collision.triCount) continue;
                if (model.collision.triBounds[ti].maxZ < localPos.z - 10.0f ||
                    model.collision.triBounds[ti].minZ > localPos.z + 5.0f) continue;
@ -3926,10 +3992,10 @@ bool M2Renderer::checkCollision(const glm::vec3& from, const glm::vec3& to,

    glm::vec3 queryMin = glm::min(from, to) - glm::vec3(7.0f, 7.0f, 5.0f);
    glm::vec3 queryMax = glm::max(from, to) + glm::vec3(7.0f, 7.0f, 5.0f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_m2_candidateScratch);

    // Check against all M2 instances in local space (rotation-aware).
-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_m2_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3962,14 +4028,14 @@ bool M2Renderer::checkCollision(const glm::vec3& from, const glm::vec3& to,
                std::min(localFrom.y, localPos.y) - localRadius - 1.0f,
                std::max(localFrom.x, localPos.x) + localRadius + 1.0f,
                std::max(localFrom.y, localPos.y) + localRadius + 1.0f,
-                collisionTriScratch_);
+                tl_m2_collisionTriScratch);

            constexpr float PLAYER_HEIGHT = 2.0f;
            constexpr float MAX_TOTAL_PUSH = 0.02f; // Cap total push per instance
            bool pushed = false;
            float totalPushX = 0.0f, totalPushY = 0.0f;

-            for (uint32_t ti : collisionTriScratch_) {
+            for (uint32_t ti : tl_m2_collisionTriScratch) {
                if (ti >= model.collision.triCount) continue;
                if (localPos.z + PLAYER_HEIGHT < model.collision.triBounds[ti].minZ ||
                    localPos.z > model.collision.triBounds[ti].maxZ) continue;
@ -4167,9 +4233,9 @@ float M2Renderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3&
    glm::vec3 rayEnd = origin + direction * maxDistance;
    glm::vec3 queryMin = glm::min(origin, rayEnd) - glm::vec3(1.0f);
    glm::vec3 queryMax = glm::max(origin, rayEnd) + glm::vec3(1.0f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_m2_candidateScratch);

-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_m2_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
--- a/src/rendering/performance_hud.cpp
+++ b/src/rendering/performance_hud.cpp
@ -1,5 +1,6 @@
 #include "rendering/performance_hud.hpp"
 #include "rendering/renderer.hpp"
+#include "rendering/vk_context.hpp"
 #include "rendering/terrain_renderer.hpp"
 #include "rendering/terrain_manager.hpp"
 #include "rendering/water_renderer.hpp"
@ -187,6 +188,19 @@ void PerformanceHUD::render(const Renderer* renderer, const Camera* camera) {
                           0, nullptr, 0.0f, 33.33f, ImVec2(200, 40));
        }

+        // FSR info: status line plus the scaled -> swapchain resolution and scale percentage
+        if (renderer->isFSREnabled()) {
+            ImGui::TextColored(ImVec4(0.4f, 1.0f, 0.4f, 1.0f), "FSR 1.0: ON");  // NOTE(review): label is hard-coded to "FSR 1.0" — confirm it is still correct when the FSR2 path is the one enabled
+            auto* ctx = renderer->getVkContext();
+            if (ctx) {  // resolution line is skipped when no Vulkan context is available
+                auto ext = ctx->getSwapchainExtent();
+                float sf = renderer->getFSRScaleFactor();  // multiplied by 100 below and printed as a percentage
+                uint32_t iw = static_cast<uint32_t>(ext.width * sf) & ~1u;   // scaled width, low bit cleared (rounded down to an even value)
+                uint32_t ih = static_cast<uint32_t>(ext.height * sf) & ~1u;  // scaled height, same even rounding; presumably mirrors the renderer's internal size — TODO confirm
+                ImGui::Text("  %ux%u -> %ux%u (%.0f%%)", iw, ih, ext.width, ext.height, sf * 100.0f);
+            }
+        }
+
        ImGui::Spacing();
    }

--- a/src/rendering/renderer.cpp
+++ b/src/rendering/renderer.cpp
--- a/src/rendering/terrain_manager.cpp
+++ b/src/rendering/terrain_manager.cpp
@ -199,13 +199,29 @@ void TerrainManager::update(const Camera& camera, float deltaTime) {
        currentTile = newTile;
    }

-    // Stream tiles if we've moved significantly or initial load
+    // Stream tiles when player crosses a tile boundary
    if (newTile.x != lastStreamTile.x || newTile.y != lastStreamTile.y) {
        LOG_DEBUG("Streaming: cam=(", camPos.x, ",", camPos.y, ",", camPos.z,
                 ") tile=[", newTile.x, ",", newTile.y,
                 "] loaded=", loadedTiles.size());
        streamTiles();
        lastStreamTile = newTile;
+    } else {
+        // Proactive loading: when workers are idle, periodically re-check for
+        // unloaded tiles within range. Throttled to avoid hitching right after
+        // world load when many tiles finalize simultaneously.
+        proactiveStreamTimer_ += deltaTime;
+        if (proactiveStreamTimer_ >= 2.0f) {
+            proactiveStreamTimer_ = 0.0f;
+            bool workersIdle;
+            {
+                std::lock_guard<std::mutex> lock(queueMutex);
+                workersIdle = loadQueue.empty();
+            }
+            if (workersIdle) {
+                streamTiles();
+            }
+        }
    }
 }

@ -800,7 +816,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
            }
            bool allDone = terrainRenderer->loadTerrainIncremental(
                pending->mesh, pending->terrain.textures, x, y,
-                ft.terrainChunkNext, 32);
+                ft.terrainChunkNext, 16);
            if (!allDone) {
                return false; // More chunks remain — yield to time budget
            }
@ -830,11 +846,19 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
    }

    case FinalizationPhase::M2_MODELS: {
-        // Upload multiple M2 models per call (batched GPU uploads)
+        // Upload multiple M2 models per call (batched GPU uploads).
+        // When no more tiles are queued for background parsing, increase the
+        // per-frame budget so idle workers don't waste time waiting for the
+        // main thread to trickle-upload models.
        if (m2Renderer && ft.m2ModelIndex < pending->m2Models.size()) {
            // Set pre-decoded BLP cache so loadTexture() skips main-thread BLP decode
            m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
-            constexpr size_t kModelsPerStep = 4;
+            bool workersIdle;
+            {
+                std::lock_guard<std::mutex> lk(queueMutex);
+                workersIdle = loadQueue.empty() && readyQueue.empty();
+            }
+            const size_t kModelsPerStep = workersIdle ? 6 : 4;
            size_t uploaded = 0;
            while (ft.m2ModelIndex < pending->m2Models.size() && uploaded < kModelsPerStep) {
                auto& m2Ready = pending->m2Models[ft.m2ModelIndex];
@ -896,7 +920,12 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
            wmoRenderer->setPredecodedBLPCache(&pending->preloadedWMOTextures);
            wmoRenderer->setDeferNormalMaps(true);

-            constexpr size_t kWmosPerStep = 1;
+            bool wmoWorkersIdle;
+            {
+                std::lock_guard<std::mutex> lk(queueMutex);
+                wmoWorkersIdle = loadQueue.empty() && readyQueue.empty();
+            }
+            const size_t kWmosPerStep = wmoWorkersIdle ? 2 : 1;
            size_t uploaded = 0;
            while (ft.wmoModelIndex < pending->wmoModels.size() && uploaded < kWmosPerStep) {
                auto& wmoReady = pending->wmoModels[ft.wmoModelIndex];
@ -911,6 +940,8 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
            wmoRenderer->setDeferNormalMaps(false);
            wmoRenderer->setPredecodedBLPCache(nullptr);
            if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
+            // All WMO models loaded — backfill normal/height maps that were skipped during streaming
+            wmoRenderer->backfillNormalMaps();
        }
        ft.phase = FinalizationPhase::WMO_INSTANCES;
        return false;
@ -1176,7 +1207,7 @@ void TerrainManager::processReadyTiles() {
    // Async upload batch: record GPU copies into a command buffer, submit with
    // a fence, but DON'T wait.  The fence is polled on subsequent frames.
    // This eliminates the main-thread stall from vkWaitForFences entirely.
-    const int maxSteps = taxiStreamingMode_ ? 8 : 2;
+    const int maxSteps = taxiStreamingMode_ ? 4 : 1;
    int steps = 0;

    if (vkCtx) vkCtx->beginUploadBatch();
--- a/src/rendering/vk_context.cpp
+++ b/src/rendering/vk_context.cpp
@ -252,14 +252,22 @@ bool VkContext::createAllocator() {
 bool VkContext::createSwapchain(int width, int height) {
    vkb::SwapchainBuilder swapchainBuilder{physicalDevice, device, surface};

-    auto swapRet = swapchainBuilder
+    auto& builder = swapchainBuilder
        .set_desired_format({VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR})
-        .set_desired_present_mode(VK_PRESENT_MODE_FIFO_KHR) // VSync
        .set_desired_extent(static_cast<uint32_t>(width), static_cast<uint32_t>(height))
        .set_image_usage_flags(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT)
        .set_desired_min_image_count(2)
-        .set_old_swapchain(swapchain) // For recreation
-        .build();
+        .set_old_swapchain(swapchain);
+
+    if (vsync_) {
+        builder.set_desired_present_mode(VK_PRESENT_MODE_FIFO_KHR);
+    } else {
+        builder.set_desired_present_mode(VK_PRESENT_MODE_IMMEDIATE_KHR);
+        builder.add_fallback_present_mode(VK_PRESENT_MODE_MAILBOX_KHR);
+        builder.add_fallback_present_mode(VK_PRESENT_MODE_FIFO_RELAXED_KHR);
+    }
+
+    auto swapRet = builder.build();

    if (!swapRet) {
        LOG_ERROR("Failed to create Vulkan swapchain: ", swapRet.error().message());
@ -1026,14 +1034,22 @@ bool VkContext::recreateSwapchain(int width, int height) {
    VkSwapchainKHR oldSwapchain = swapchain;

    vkb::SwapchainBuilder swapchainBuilder{physicalDevice, device, surface};
-    auto swapRet = swapchainBuilder
+    auto& builder = swapchainBuilder
        .set_desired_format({VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR})
-        .set_desired_present_mode(VK_PRESENT_MODE_FIFO_KHR)
        .set_desired_extent(static_cast<uint32_t>(width), static_cast<uint32_t>(height))
        .set_image_usage_flags(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT)
        .set_desired_min_image_count(2)
-        .set_old_swapchain(oldSwapchain)
-        .build();
+        .set_old_swapchain(oldSwapchain);
+
+    if (vsync_) {
+        builder.set_desired_present_mode(VK_PRESENT_MODE_FIFO_KHR);
+    } else {
+        builder.set_desired_present_mode(VK_PRESENT_MODE_IMMEDIATE_KHR);
+        builder.add_fallback_present_mode(VK_PRESENT_MODE_MAILBOX_KHR);
+        builder.add_fallback_present_mode(VK_PRESENT_MODE_FIFO_RELAXED_KHR);
+    }
+
+    auto swapRet = builder.build();

    if (oldSwapchain) {
        vkDestroySwapchainKHR(device, oldSwapchain, nullptr);
--- a/src/rendering/wmo_renderer.cpp
+++ b/src/rendering/wmo_renderer.cpp
@ -48,6 +48,11 @@ size_t envSizeOrDefault(const char* name, size_t defValue) {
 }
 } // namespace

+// Thread-local scratch buffers for collision queries (allows concurrent getFloorHeight/checkWallCollision calls)
+static thread_local std::vector<size_t> tl_candidateScratch;             // instance indices written by gatherCandidates(); overwritten by every query on this thread
+static thread_local std::vector<uint32_t> tl_triScratch;                 // triangle start offsets filled by getTrianglesInRange()/getWallTrianglesInRange()
+static thread_local std::unordered_set<uint32_t> tl_candidateIdScratch;  // instance ids already seen during one gatherCandidates() call; cleared at the start of each call
+
 static void transformAABB(const glm::mat4& modelMatrix,
                          const glm::vec3& localMin,
                          const glm::vec3& localMax,
@ -787,8 +792,8 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) {
            }

            // Build doodad's local transform (WoW coordinates)
-            // WMO doodads use quaternion rotation (X/Y swapped for correct orientation)
-            glm::quat fixedRotation(doodad.rotation.w, doodad.rotation.y, doodad.rotation.x, doodad.rotation.z);
+            // WMO doodads use quaternion rotation
+            glm::quat fixedRotation(doodad.rotation.w, doodad.rotation.x, doodad.rotation.y, doodad.rotation.z);

            glm::mat4 localTransform(1.0f);
            localTransform = glm::translate(localTransform, doodad.position);
@ -1288,7 +1293,7 @@ void WMORenderer::rebuildSpatialIndex() {
 void WMORenderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& queryMax,
                                   std::vector<size_t>& outIndices) const {
    outIndices.clear();
-    candidateIdScratch.clear();
+    tl_candidateIdScratch.clear();

    GridCell minCell = toCell(queryMin);
    GridCell maxCell = toCell(queryMax);
@ -1298,7 +1303,7 @@ void WMORenderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& q
                auto it = spatialGrid.find(GridCell{x, y, z});
                if (it == spatialGrid.end()) continue;
                for (uint32_t id : it->second) {
-                    if (!candidateIdScratch.insert(id).second) continue;
+                    if (!tl_candidateIdScratch.insert(id).second) continue;
                    auto idxIt = instanceIndexById.find(id);
                    if (idxIt != instanceIndexById.end()) {
                        outIndices.push_back(idxIt->second);
@ -1318,15 +1323,10 @@ void WMORenderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& q
    }
 }

-void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera) {
+void WMORenderer::prepareRender() {
    ++currentFrameId;

-    if (!opaquePipeline_ || instances.empty()) {
-        lastDrawCalls = 0;
-        return;
-    }
-
-    // Update material UBOs if settings changed
+    // Update material UBOs if settings changed (mapped memory writes — main thread only)
    if (materialSettingsDirty_) {
        materialSettingsDirty_ = false;
        static const int pomSampleTable[] = { 16, 32, 64 };
@ -1335,7 +1335,6 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
            for (auto& group : model.groups) {
                for (auto& mb : group.mergedBatches) {
                    if (!mb.materialUBO) continue;
-                    // Read existing UBO data, update normal/POM fields
                    VmaAllocationInfo allocInfo{};
                    vmaGetAllocationInfo(vkCtx_->getAllocator(), mb.materialUBOAlloc, &allocInfo);
                    if (allocInfo.pMappedData) {
@ -1351,6 +1350,13 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
            }
        }
    }
+}
+
+void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera) {
+    if (!opaquePipeline_ || instances.empty()) {
+        lastDrawCalls = 0;
+        return;
+    }

    lastDrawCalls = 0;

@ -1362,43 +1368,45 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
    lastPortalCulledGroups = 0;
    lastDistanceCulledGroups = 0;

-    // ── Phase 1: Parallel visibility culling ──────────────────────────
-    std::vector<size_t> visibleInstances;
-    visibleInstances.reserve(instances.size());
+    // ── Phase 1: Visibility culling ──────────────────────────
+    visibleInstances_.clear();
    for (size_t i = 0; i < instances.size(); ++i) {
-        const auto& instance = instances[i];
-        if (loadedModels.find(instance.modelId) == loadedModels.end())
-            continue;
-        visibleInstances.push_back(i);
+        if (loadedModels.count(instances[i].modelId))
+            visibleInstances_.push_back(i);
    }

    glm::vec3 camPos = camera.getPosition();
    bool doPortalCull = portalCulling;
-    bool doFrustumCull = false; // Temporarily disabled: can over-cull world WMOs
    bool doDistanceCull = distanceCulling;

-    auto cullInstance = [&](size_t instIdx) -> InstanceDrawList {
-        if (instIdx >= instances.size()) return InstanceDrawList{};
+    auto cullInstance = [&](size_t instIdx, InstanceDrawList& result) {
+        if (instIdx >= instances.size()) return;
        const auto& instance = instances[instIdx];
        auto mdlIt = loadedModels.find(instance.modelId);
-        if (mdlIt == loadedModels.end()) return InstanceDrawList{};
+        if (mdlIt == loadedModels.end()) return;
        const ModelData& model = mdlIt->second;

-        InstanceDrawList result;
        result.instanceIndex = instIdx;
+        result.visibleGroups.clear();
+        result.portalCulled = 0;
+        result.distanceCulled = 0;

-        // Portal-based visibility
-        std::unordered_set<uint32_t> portalVisibleGroups;
+        // Portal-based visibility — use a flat sorted vector instead of unordered_set
+        std::vector<uint32_t> portalVisibleGroups;
        bool usePortalCulling = doPortalCull && !model.portals.empty() && !model.portalRefs.empty();
        if (usePortalCulling) {
+            std::unordered_set<uint32_t> pvgSet;
            glm::vec4 localCamPos = instance.invModelMatrix * glm::vec4(camPos, 1.0f);
            getVisibleGroupsViaPortals(model, glm::vec3(localCamPos), frustum,
-                                       instance.modelMatrix, portalVisibleGroups);
+                                       instance.modelMatrix, pvgSet);
+            portalVisibleGroups.assign(pvgSet.begin(), pvgSet.end());
+            std::sort(portalVisibleGroups.begin(), portalVisibleGroups.end());
        }

        for (size_t gi = 0; gi < model.groups.size(); ++gi) {
            if (usePortalCulling &&
-                portalVisibleGroups.find(static_cast<uint32_t>(gi)) == portalVisibleGroups.end()) {
+                !std::binary_search(portalVisibleGroups.begin(), portalVisibleGroups.end(),
+                                    static_cast<uint32_t>(gi))) {
                result.portalCulled++;
                continue;
            }
@ -1414,62 +1422,18 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
                        continue;
                    }
                }
-
-                if (doFrustumCull && !frustum.intersectsAABB(gMin, gMax))
-                    continue;
            }

            result.visibleGroups.push_back(static_cast<uint32_t>(gi));
        }
-        return result;
    };

-    // Dispatch culling — parallel when enough instances, sequential otherwise.
-    std::vector<InstanceDrawList> drawLists;
-    drawLists.reserve(visibleInstances.size());
+    // Resize drawLists to match (reuses previous capacity)
+    drawLists_.resize(visibleInstances_.size());

-    static const size_t minParallelCullInstances = std::max<size_t>(
-        4, envSizeOrDefault("WOWEE_WMO_CULL_MT_MIN", 128));
-    if (visibleInstances.size() >= minParallelCullInstances && numCullThreads_ > 1) {
-        static const size_t minCullWorkPerThread = std::max<size_t>(
-            16, envSizeOrDefault("WOWEE_WMO_CULL_WORK_PER_THREAD", 64));
-        const size_t maxUsefulThreads = std::max<size_t>(
-            1, (visibleInstances.size() + minCullWorkPerThread - 1) / minCullWorkPerThread);
-        const size_t numThreads = std::min(static_cast<size_t>(numCullThreads_), maxUsefulThreads);
-        if (numThreads <= 1) {
-            for (size_t idx : visibleInstances) {
-                drawLists.push_back(cullInstance(idx));
-            }
-        } else {
-            const size_t chunkSize = visibleInstances.size() / numThreads;
-            const size_t remainder = visibleInstances.size() % numThreads;
-
-            drawLists.resize(visibleInstances.size());
-
-            cullFutures_.clear();
-            if (cullFutures_.capacity() < numThreads) {
-                cullFutures_.reserve(numThreads);
-            }
-
-            size_t start = 0;
-            for (size_t t = 0; t < numThreads; ++t) {
-                const size_t end = start + chunkSize + (t < remainder ? 1 : 0);
-                cullFutures_.push_back(std::async(std::launch::async,
-                    [&, start, end]() {
-                        for (size_t j = start; j < end; ++j) {
-                            drawLists[j] = cullInstance(visibleInstances[j]);
-                        }
-                    }));
-                start = end;
-            }
-
-            for (auto& f : cullFutures_) {
-                f.get();
-            }
-        }
-    } else {
-        for (size_t idx : visibleInstances)
-            drawLists.push_back(cullInstance(idx));
+    // Sequential culling (parallel dispatch overhead > savings for typical instance counts)
+    for (size_t j = 0; j < visibleInstances_.size(); ++j) {
+        cullInstance(visibleInstances_[j], drawLists_[j]);
    }

    // ── Phase 2: Vulkan draw ────────────────────────────────
@ -1484,7 +1448,7 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
    // Track which pipeline is currently bound: 0=opaque, 1=transparent, 2=glass
    int currentPipelineKind = 0;

-    for (const auto& dl : drawLists) {
+    for (const auto& dl : drawLists_) {
        if (dl.instanceIndex >= instances.size()) continue;
        const auto& instance = instances[dl.instanceIndex];
        auto modelIt = loadedModels.find(instance.modelId);
@ -2412,6 +2376,69 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
    return rawPtr;
 }

+// Generate normal/height maps for cached WMO textures that do not have one yet
+// and attach them to every merged batch that uses such a texture.
+//
+// Does nothing when both normal mapping and POM are disabled, or when no asset
+// manager is set. For each cache entry lacking a map, the BLP is loaded again
+// through the asset manager (using the cache key as the path) to obtain pixel
+// data. If at least one map was generated, batches without a map are pointed at
+// the cached one, binding 2 of their material descriptor set is rewritten, and
+// materialSettingsDirty_ is raised.
+//
+// NOTE(review): vkUpdateDescriptorSets is called on existing material sets;
+// confirm the caller only runs this when no in-flight command buffer still
+// references those sets.
+void WMORenderer::backfillNormalMaps() {
+    if (!normalMappingEnabled_ && !pomEnabled_) return;
+
+    if (!assetManager) return;
+
+    int generated = 0;
+    for (auto& [key, entry] : textureCache) {
+        if (entry.normalHeightMap) continue;  // already has one
+        if (!entry.texture) continue;
+
+        // Re-load the BLP from MPQ to get pixel data for normal map generation
+        pipeline::BLPImage blp = assetManager->loadTexture(key);
+        if (!blp.isValid() || blp.width == 0 || blp.height == 0) continue;
+
+        float variance = 0.0f;
+        auto nhMap = generateNormalHeightMap(blp.data.data(), blp.width, blp.height, variance);
+        if (!nhMap) continue;
+
+        entry.normalHeightMap = std::move(nhMap);
+        entry.heightMapVariance = variance;
+        ++generated;
+    }
+
+    if (generated == 0) return;
+
+    // Index the cache by texture pointer once, so each batch below resolves its
+    // cache entry with a single hash lookup instead of scanning the whole cache.
+    // emplace keeps the first entry seen for a given pointer, matching the
+    // first-match behaviour of a linear scan over the same container.
+    using CacheEntry = decltype(textureCache)::mapped_type;
+    std::unordered_map<const void*, const CacheEntry*> entryByTexture;
+    entryByTexture.reserve(textureCache.size());
+    for (const auto& [cacheKey, cacheEntry] : textureCache) {
+        if (cacheEntry.texture) {
+            entryByTexture.emplace(cacheEntry.texture.get(), &cacheEntry);
+        }
+    }
+
+    VkDevice device = vkCtx_->getDevice();
+    int rebound = 0;
+    // Update merged batches: assign normal map pointer and rebind descriptor set
+    for (auto& [modelId, model] : loadedModels) {
+        for (auto& group : model.groups) {
+            for (auto& mb : group.mergedBatches) {
+                if (mb.normalHeightMap) continue;  // already set
+                if (!mb.texture) continue;
+
+                const auto hit = entryByTexture.find(mb.texture);
+                if (hit == entryByTexture.end()) continue;   // texture not owned by the cache
+                const CacheEntry& cacheEntry = *hit->second;
+                if (!cacheEntry.normalHeightMap) continue;   // generation failed or was skipped
+
+                mb.normalHeightMap = cacheEntry.normalHeightMap.get();
+                mb.heightMapVariance = cacheEntry.heightMapVariance;
+
+                // Rebind descriptor set binding 2 to the real normal/height map
+                if (!mb.materialSet) continue;
+                VkDescriptorImageInfo nhImgInfo = mb.normalHeightMap->descriptorInfo();
+                VkWriteDescriptorSet write{};
+                write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+                write.dstSet = mb.materialSet;
+                write.dstBinding = 2;
+                write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+                write.descriptorCount = 1;
+                write.pImageInfo = &nhImgInfo;
+                vkUpdateDescriptorSets(device, 1, &write, 0, nullptr);
+                ++rebound;
+            }
+        }
+    }
+    materialSettingsDirty_ = true;
+    LOG_INFO("Backfilled ", generated, " normal/height maps (", rebound, " descriptor sets rebound) for deferred WMO textures");
+}
+
 // Ray-AABB intersection (slab method)
 // Returns true if the ray intersects the axis-aligned bounding box
 static bool rayIntersectsAABB(const glm::vec3& origin, const glm::vec3& dir,
@ -2808,9 +2835,9 @@ std::optional<float> WMORenderer::getFloorHeight(float glX, float glY, float glZ
        group.getTrianglesInRange(
            localOrigin.x - 1.0f, localOrigin.y - 1.0f,
            localOrigin.x + 1.0f, localOrigin.y + 1.0f,
-            triScratch_);
+            tl_triScratch);

-        for (uint32_t triStart : triScratch_) {
+        for (uint32_t triStart : tl_triScratch) {
            const glm::vec3& v0 = verts[indices[triStart]];
            const glm::vec3& v1 = verts[indices[triStart + 1]];
            const glm::vec3& v2 = verts[indices[triStart + 2]];
@ -2884,9 +2911,9 @@ std::optional<float> WMORenderer::getFloorHeight(float glX, float glY, float glZ
    // early-returned because overlapping WMO instances need full coverage).
    glm::vec3 queryMin(glX - 2.0f, glY - 2.0f, glZ - 8.0f);
    glm::vec3 queryMax(glX + 2.0f, glY + 2.0f, glZ + 10.0f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_candidateScratch);

-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3059,9 +3086,9 @@ bool WMORenderer::checkWallCollision(const glm::vec3& from, const glm::vec3& to,

    glm::vec3 queryMin = glm::min(from, to) - glm::vec3(8.0f, 8.0f, 5.0f);
    glm::vec3 queryMax = glm::max(from, to) + glm::vec3(8.0f, 8.0f, 5.0f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_candidateScratch);

-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3127,9 +3154,9 @@ bool WMORenderer::checkWallCollision(const glm::vec3& from, const glm::vec3& to,
            float rangeMinY = std::min(localFrom.y, localTo.y) - PLAYER_RADIUS - 1.5f;
            float rangeMaxX = std::max(localFrom.x, localTo.x) + PLAYER_RADIUS + 1.5f;
            float rangeMaxY = std::max(localFrom.y, localTo.y) + PLAYER_RADIUS + 1.5f;
-            group.getTrianglesInRange(rangeMinX, rangeMinY, rangeMaxX, rangeMaxY, triScratch_);
+            group.getTrianglesInRange(rangeMinX, rangeMinY, rangeMaxX, rangeMaxY, tl_triScratch);

-            for (uint32_t triStart : triScratch_) {
+            for (uint32_t triStart : tl_triScratch) {
                // Use pre-computed Z bounds for fast vertical reject
                const auto& tb = group.triBounds[triStart / 3];

@ -3145,18 +3172,13 @@ bool WMORenderer::checkWallCollision(const glm::vec3& from, const glm::vec3& to,
                if (triHeight < 1.0f && tb.maxZ <= localFeetZ + 1.2f) continue;

                // Use MOPY flags to filter wall collision.
-                // Collidable triangles (flag 0x01) block the player — including
-                // invisible collision walls (0x01 without 0x20) used in tunnels.
-                // Skip detail/decorative geometry (0x04) and render-only surfaces.
+                // Collide with triangles that have the collision flag (0x08) or no flags at all.
+                // Skip detail/decorative (0x04) and render-only (0x20 without 0x08) surfaces.
                uint32_t triIdx = triStart / 3;
                if (!group.triMopyFlags.empty() && triIdx < group.triMopyFlags.size()) {
                    uint8_t mopy = group.triMopyFlags[triIdx];
                    if (mopy != 0) {
-                        bool collidable = (mopy & 0x01) != 0;
-                        bool detail = (mopy & 0x04) != 0;
-                        if (!collidable || detail) {
-                            continue;
-                        }
+                        if ((mopy & 0x04) || !(mopy & 0x08)) continue;
                    }
                }

@ -3217,8 +3239,8 @@ bool WMORenderer::checkWallCollision(const glm::vec3& from, const glm::vec3& to,
                    if (absNz >= 0.35f) continue;

                    const float SKIN = 0.005f;        // small separation so we don't re-collide immediately
-                    // Stronger push when inside WMO for more responsive indoor collision
-                    const float MAX_PUSH = insideWMO ? 0.35f : 0.15f;
+                    // Push must cover full penetration to prevent gradual clip-through
+                    const float MAX_PUSH = PLAYER_RADIUS;
                    float penetration = (PLAYER_RADIUS - horizDist);
                    float pushDist = glm::clamp(penetration + SKIN, 0.0f, MAX_PUSH);
                    glm::vec2 pushDir2;
@ -3302,9 +3324,9 @@ void WMORenderer::updateActiveGroup(float glX, float glY, float glZ) {

    glm::vec3 queryMin(glX - 0.5f, glY - 0.5f, glZ - 0.5f);
    glm::vec3 queryMax(glX + 0.5f, glY + 0.5f, glZ + 0.5f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_candidateScratch);

-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_candidateScratch) {
        const auto& instance = instances[idx];
        if (glX < instance.worldBoundsMin.x || glX > instance.worldBoundsMax.x ||
            glY < instance.worldBoundsMin.y || glY > instance.worldBoundsMax.y ||
@ -3348,9 +3370,9 @@ bool WMORenderer::isInsideWMO(float glX, float glY, float glZ, uint32_t* outMode
    QueryTimer timer(&queryTimeMs, &queryCallCount);
    glm::vec3 queryMin(glX - 0.5f, glY - 0.5f, glZ - 0.5f);
    glm::vec3 queryMax(glX + 0.5f, glY + 0.5f, glZ + 0.5f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_candidateScratch);

-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3397,9 +3419,9 @@ bool WMORenderer::isInsideWMO(float glX, float glY, float glZ, uint32_t* outMode
 bool WMORenderer::isInsideInteriorWMO(float glX, float glY, float glZ) const {
    glm::vec3 queryMin(glX - 0.5f, glY - 0.5f, glZ - 0.5f);
    glm::vec3 queryMax(glX + 0.5f, glY + 0.5f, glZ + 0.5f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_candidateScratch);

-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3453,9 +3475,9 @@ float WMORenderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3
    glm::vec3 rayEnd = origin + direction * maxDistance;
    glm::vec3 queryMin = glm::min(origin, rayEnd) - glm::vec3(1.0f);
    glm::vec3 queryMax = glm::max(origin, rayEnd) + glm::vec3(1.0f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_candidateScratch);

-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3509,9 +3531,9 @@ float WMORenderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3
            float rMinY = std::min(localOrigin.y, localEnd.y) - 1.0f;
            float rMaxX = std::max(localOrigin.x, localEnd.x) + 1.0f;
            float rMaxY = std::max(localOrigin.y, localEnd.y) + 1.0f;
-            group.getWallTrianglesInRange(rMinX, rMinY, rMaxX, rMaxY, triScratch_);
+            group.getWallTrianglesInRange(rMinX, rMinY, rMaxX, rMaxY, tl_triScratch);

-            for (uint32_t triStart : triScratch_) {
+            for (uint32_t triStart : tl_triScratch) {
                const glm::vec3& v0 = verts[indices[triStart]];
                const glm::vec3& v1 = verts[indices[triStart + 1]];
                const glm::vec3& v2 = verts[indices[triStart + 2]];
--- a/src/ui/game_screen.cpp
+++ b/src/ui/game_screen.cpp
@ -317,6 +317,20 @@ void GameScreen::render(game::GameHandler& gameHandler) {
        }
    }

+    // Apply saved FSR setting once when renderer is available.
+    // fsrSettingsApplied_ is a one-shot latch: it is set after the settings are
+    // pushed to the renderer, or immediately when FSR is not requested. While
+    // FSR is requested but the renderer is still null, the latch stays clear so
+    // the next frame retries.
+    if (!fsrSettingsApplied_ && pendingFSR) {
+        auto* renderer = core::Application::getInstance().getRenderer();
+        if (renderer) {
+            static const float fsrScales[] = { 0.77f, 0.67f, 0.59f, 0.50f };
+            constexpr int kFsrScaleCount = static_cast<int>(sizeof(fsrScales) / sizeof(fsrScales[0]));
+            // pendingFSRQuality indexes the table directly; fall back to the first
+            // preset when it is out of range so a bad saved value can never read
+            // past the array.
+            // NOTE(review): harmless if the settings loader already clamps it.
+            const int qualityIdx =
+                (pendingFSRQuality >= 0 && pendingFSRQuality < kFsrScaleCount)
+                    ? static_cast<int>(pendingFSRQuality)
+                    : 0;
+            renderer->setFSRQuality(fsrScales[qualityIdx]);
+            renderer->setFSRSharpness(pendingFSRSharpness);
+            renderer->setFSREnabled(true);
+            fsrSettingsApplied_ = true;
+        }
+    } else {
+        fsrSettingsApplied_ = true;
+    }
+
    // Apply auto-loot setting to GameHandler every frame (cheap bool sync)
    gameHandler.setAutoLoot(pendingAutoLoot);

@ -2687,6 +2701,12 @@ void GameScreen::sendChatMessage(game::GameHandler& gameHandler) {
                chatInputBuffer[0] = '\0';
                return;
            }
+            // /unstuckhearth command — teleport to hearthstone bind point
+            if (cmdLower == "unstuckhearth") {
+                gameHandler.unstuckHearth();
+                chatInputBuffer[0] = '\0';
+                return;
+            }

            // /transport board — board test transport
            if (cmdLower == "transport board") {
@ -6250,7 +6270,7 @@ void GameScreen::renderSettingsWindow() {
                if (pendingShadows) {
                    ImGui::SameLine();
                    ImGui::SetNextItemWidth(150.0f);
-                    if (ImGui::SliderFloat("Distance##shadow", &pendingShadowDistance, 40.0f, 200.0f, "%.0f")) {
+                    if (ImGui::SliderFloat("Distance##shadow", &pendingShadowDistance, 40.0f, 500.0f, "%.0f")) {
                        if (renderer) renderer->setShadowDistance(pendingShadowDistance);
                        saveSettings();
                    }
@ -6261,7 +6281,13 @@ void GameScreen::renderSettingsWindow() {
                }
                {
                    const char* aaLabels[] = { "Off", "2x MSAA", "4x MSAA", "8x MSAA" };
-                    if (ImGui::Combo("Anti-Aliasing", &pendingAntiAliasing, aaLabels, 4)) {
+                    bool fsr2Active = renderer && renderer->isFSR2Enabled();
+                    if (fsr2Active) {
+                        ImGui::BeginDisabled();
+                        int disabled = 0;
+                        ImGui::Combo("Anti-Aliasing (FSR2)", &disabled, "Off (FSR2 active)\0", 1);
+                        ImGui::EndDisabled();
+                    } else if (ImGui::Combo("Anti-Aliasing", &pendingAntiAliasing, aaLabels, 4)) {
                        static const VkSampleCountFlagBits aaSamples[] = {
                            VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_2_BIT,
                            VK_SAMPLE_COUNT_4_BIT, VK_SAMPLE_COUNT_8_BIT
@ -6270,6 +6296,33 @@ void GameScreen::renderSettingsWindow() {
                        saveSettings();
                    }
                }
+                // FSR Upscaling
+                {
+                    // FSR mode selection: Off, FSR 1.0 (Spatial), FSR 2.2 (Temporal)
+                    const char* fsrModeLabels[] = { "Off", "FSR 1.0 (Spatial)", "FSR 2.2 (Temporal)" };
+                    int fsrMode = pendingFSR ? 1 : 0;
+                    if (renderer && renderer->isFSR2Enabled()) fsrMode = 2;
+                    if (ImGui::Combo("Upscaling", &fsrMode, fsrModeLabels, 3)) {
+                        pendingFSR = (fsrMode == 1);
+                        if (renderer) {
+                            renderer->setFSREnabled(fsrMode == 1);
+                            renderer->setFSR2Enabled(fsrMode == 2);
+                        }
+                        saveSettings();
+                    }
+                    if (fsrMode > 0) {
+                        const char* fsrQualityLabels[] = { "Ultra Quality (77%)", "Quality (67%)", "Balanced (59%)", "Performance (50%)" };
+                        static const float fsrScaleFactors[] = { 0.77f, 0.67f, 0.59f, 0.50f };
+                        if (ImGui::Combo("FSR Quality", &pendingFSRQuality, fsrQualityLabels, 4)) {
+                            if (renderer) renderer->setFSRQuality(fsrScaleFactors[pendingFSRQuality]);
+                            saveSettings();
+                        }
+                        if (ImGui::SliderFloat("FSR Sharpness", &pendingFSRSharpness, 0.0f, 2.0f, "%.1f")) {
+                            if (renderer) renderer->setFSRSharpness(pendingFSRSharpness);
+                            saveSettings();
+                        }
+                    }
+                }
                if (ImGui::SliderInt("Ground Clutter Density", &pendingGroundClutterDensity, 0, 150, "%d%%")) {
                    if (renderer) {
                        if (auto* tm = renderer->getTerrainManager()) {
@ -6348,7 +6401,7 @@ void GameScreen::renderSettingsWindow() {
                    pendingFullscreen = kDefaultFullscreen;
                    pendingVsync = kDefaultVsync;
                    pendingShadows = kDefaultShadows;
-                    pendingShadowDistance = 72.0f;
+                    pendingShadowDistance = 300.0f;
                    pendingGroundClutterDensity = kDefaultGroundClutterDensity;
                    pendingAntiAliasing = 0;
                    pendingNormalMapping = true;
@ -7384,6 +7437,9 @@ void GameScreen::saveSettings() {
    out << "normal_map_strength=" << pendingNormalMapStrength << "\n";
    out << "pom=" << (pendingPOM ? 1 : 0) << "\n";
    out << "pom_quality=" << pendingPOMQuality << "\n";
+    out << "fsr=" << (pendingFSR ? 1 : 0) << "\n";
+    out << "fsr_quality=" << pendingFSRQuality << "\n";
+    out << "fsr_sharpness=" << pendingFSRSharpness << "\n";

    // Controls
    out << "mouse_sensitivity=" << pendingMouseSensitivity << "\n";
@ -7463,13 +7519,16 @@ void GameScreen::loadSettings() {
            else if (key == "auto_loot") pendingAutoLoot = (std::stoi(val) != 0);
            else if (key == "ground_clutter_density") pendingGroundClutterDensity = std::clamp(std::stoi(val), 0, 150);
            else if (key == "shadows") pendingShadows = (std::stoi(val) != 0);
-            else if (key == "shadow_distance") pendingShadowDistance = std::clamp(std::stof(val), 40.0f, 200.0f);
+            else if (key == "shadow_distance") pendingShadowDistance = std::clamp(std::stof(val), 40.0f, 500.0f);
            else if (key == "water_refraction") pendingWaterRefraction = (std::stoi(val) != 0);
            else if (key == "antialiasing") pendingAntiAliasing = std::clamp(std::stoi(val), 0, 3);
            else if (key == "normal_mapping") pendingNormalMapping = (std::stoi(val) != 0);
            else if (key == "normal_map_strength") pendingNormalMapStrength = std::clamp(std::stof(val), 0.0f, 2.0f);
            else if (key == "pom") pendingPOM = (std::stoi(val) != 0);
            else if (key == "pom_quality") pendingPOMQuality = std::clamp(std::stoi(val), 0, 2);
+            else if (key == "fsr") pendingFSR = (std::stoi(val) != 0);
+            else if (key == "fsr_quality") pendingFSRQuality = std::clamp(std::stoi(val), 0, 3);
+            else if (key == "fsr_sharpness") pendingFSRSharpness = std::clamp(std::stof(val), 0.0f, 2.0f);
            // Controls
            else if (key == "mouse_sensitivity") pendingMouseSensitivity = std::clamp(std::stof(val), 0.05f, 1.0f);
            else if (key == "invert_mouse") pendingInvertMouse = (std::stoi(val) != 0);
Author	SHA1	Message	Date
Kelsi	e94eb7f2d1	FSR2 temporal upscaling fixes: unjittered reprojection, sharpen Y-flip, MSAA guard, descriptor double-buffering (CI: some checks are pending — Build / Build (arm64), Build / Build (x86-64), Build / Build (macOS arm64), Build / Build (windows-arm64), Build / Build (windows-x86-64), Security / CodeQL (C/C++), Security / Semgrep, Security / Sanitizer Build (ASan/UBSan); all triggered by push and waiting to run) - Motion vectors: single unjittered reprojection matrix (80 bytes) instead of two jittered matrices (160 bytes), eliminating numerical instability from jitter amplification through large world coordinates - Sharpen pass: fix Y-flip for correct UV sampling, double-buffer descriptor sets to avoid race with in-flight command buffers - MSAA: auto-disable when FSR2 enabled, grey out AA setting in UI - Accumulation: variance-based neighborhood clamping in YCoCg space, correct history layout transitions - Frame index: wrap at 256 for stable Halton sequence	2026-03-08 01:22:15 -08:00
Kelsi	52317d1edd	Implement FSR 2.2 temporal upscaling Full FSR 2.2 pipeline with depth-based motion vector reprojection, temporal accumulation with YCoCg neighborhood clamping, and RCAS contrast-adaptive sharpening. Architecture (designed for FSR 3.x frame generation readiness): - Camera: Halton(2,3) sub-pixel jitter with unjittered projection stored separately for motion vector computation - Motion vectors: compute shader reconstructs world position from depth + inverse VP, reprojects with previous frame's VP - Temporal accumulation: compute shader blends 5-10% current frame with 90-95% clamped history, adaptive blend for disocclusion - History: ping-pong R16G16B16A16 buffers at display resolution - Sharpening: RCAS fragment pass with contrast-adaptive weights Integration: - FSR2 replaces both FSR1 and MSAA when enabled - Scene renders to internal resolution framebuffer (no MSAA) - Compute passes run between scene and swapchain render passes - Camera cut detection resets history on teleport - Quality presets shared with FSR1 (0.50-0.77 scale factors) - UI: "Upscaling" combo with Off/FSR 1.0/FSR 2.2 options	2026-03-07 23:13:01 -08:00
Kelsi	0ffeabd4ed	Revert "Further reduce tile streaming aggressiveness" This reverts commit `f681a8b361`.	2026-03-07 23:02:25 -08:00
Kelsi	f681a8b361	Further reduce tile streaming aggressiveness - Load radius: 4→3 (normal), 6→5 (taxi) - Terrain chunks per step: 16→8 - M2 models per step: 6→2 (removed idle boost) - WMO models per step: 2→1 (removed idle boost) - WMO doodads per step: 4→2 - All budgets now constant (no idle-vs-busy branching)	2026-03-07 22:55:02 -08:00
Kelsi	7f573fc06b	Reduce tile finalization aggressiveness to prevent spawn hitching - Reduce max finalization steps per frame: 2→1 (normal), 8→4 (taxi) - Reduce terrain chunk upload batch: 32→16 chunks per step - Reduce idle M2 model upload budget: 16→6 per step - Reduce idle WMO model upload budget: 4→2 per step Tiles still stream in quickly but spread GPU upload work across more frames, eliminating the frame spikes right after spawning.	2026-03-07 22:51:59 -08:00
Kelsi	ac3c90dd75	Fix M2 animated instance flashing (deer/bird/critter pop-in) Root cause: bonesDirty was a single bool shared across both double-buffered frame indices. When bones were copied to frame 0's SSBO and bonesDirty cleared, frame 1's newly-allocated SSBO would contain garbage/zeros and never get populated — causing animated M2 instances to flash invisible on alternating frames. Fix: Make bonesDirty per-frame-index (bool[2]) so each buffer independently tracks whether it needs bone data uploaded. When bones are recomputed, both indices are marked dirty. When uploaded during render, only the current frame index is cleared. New buffer allocations in prepareRender force their frame index dirty.	2026-03-07 22:47:07 -08:00
Kelsi	6cf08fbaa6	Throttle proactive tile streaming to reduce post-load hitching Add 2-second cooldown timer before re-checking for unloaded tiles when workers are idle, preventing excessive streamTiles() calls that caused frame hitches right after world load.	2026-03-07 22:40:07 -08:00
Kelsi	c13dbf2198	Proactive tile streaming, faster finalization, tree trunk collision - Re-check for unloaded tiles when workers are idle (no tile boundary needed) - Increase M2 upload budget 4→16 and WMO 1→4 per frame when not under pressure - Lower tree collision threshold from 40 to 6 units so large trees block movement	2026-03-07 22:35:18 -08:00
Kelsi	4cb03c38fe	Parallel animation updates, thread-safe collision, M2 pop-in fix, shadow stabilization - Overlap M2 and character animation updates via std::async (~2-5ms saved) - Thread-local collision scratch buffers for concurrent floor queries - Parallel terrain/WMO/M2 floor queries in camera controller - Seed new M2 instance bones from existing siblings to eliminate pop-in flash - Fix shadow flicker: snap center along stable light axes instead of in view space - Increase shadow distance default to 300 units (slider max 500)	2026-03-07 22:29:06 -08:00
Kelsi	a4966e486f	Fix WMO wall collision, normal mapping, POM backfill, and M2/WMO rendering performance - Fix MOPY flag check (0x08 not 0x01) for proper wall collision detection - Cap MAX_PUSH to PLAYER_RADIUS to prevent gradual clip-through - Fix WMO doodad quaternion component ordering (X/Y swap) - Linear normal map strength blend in shader for smooth slider control - Enable shadow sampling for interior WMO groups (covered outdoor areas) - Backfill deferred normal/height maps after streaming with descriptor rebind - M2: prepareRender only iterates animated instances, bone dirty flag - M2: remove worker thread VMA allocation, skip unready bone instances - WMO: persistent visibility vectors, sequential culling - Add FSR EASU/RCAS shaders	2026-03-07 22:03:28 -08:00
Kelsi	16c6c2b6a0	Raise diagnostic log thresholds to reduce log noise SLOW update stages: 3ms → 50ms, renderer update: 5ms → 50ms, loadModel/processAsync/spawnCreature: 3ms → 100ms, terrain/camera: 3-5ms → 50ms. Remove per-frame spawn breakdown.	2026-03-07 18:43:13 -08:00
Kelsi	02cf0e4df3	Background normal map generation, queue-draining load screen warmup - Normal map CPU work (luminance→blur→Sobel) moved to background threads, main thread only does GPU upload (~1-2ms vs 15-22ms per texture) - Load screen warmup now waits until ALL spawn/equipment/gameobject queues are drained before transitioning (prevents naked character, NPC pop-in) - Exit condition: min 2s + 5 consecutive empty iterations, hard cap 15s - Equipment queue processes 8 items per warmup iteration instead of 1 - Added LoadingScreen::renderOverlay() for future world-behind-loading use	2026-03-07 18:40:24 -08:00
Kelsi	63efac9fa6	Unlimited creature model uploads during load screen, remove duplicate code Loading screen now calls processCreatureSpawnQueue(unlimited=true) which removes the 1-upload-per-frame cap and 2ms time budget, allowing all pending creature models to upload to GPU in bulk. Also increases concurrent async background loads from 4 to 16 during load screen. Replaces 40-line inline duplicate of processAsyncCreatureResults with the shared function.	2026-03-07 17:31:47 -08:00