FSR2 temporal upscaling fixes: unjittered reprojection, sharpen Y-flip, MSAA guard, descriptor double-buffering

- Motion vectors: single unjittered reprojection matrix (80 bytes) instead of two jittered matrices (160 bytes), eliminating numerical instability from jitter amplification through large world coordinates
- Sharpen pass: fix Y-flip for correct UV sampling, double-buffer descriptor sets to avoid race with in-flight command buffers
- MSAA: auto-disable when FSR2 enabled, grey out AA setting in UI
- Accumulation: variance-based neighborhood clamping in YCoCg space, correct history layout transitions
- Frame index: wrap at 256 for stable Halton sequence
Implement FSR 2.2 temporal upscaling
2026-03-22 23:30:14 +00:00 · 2026-03-08 01:22:15 -08:00 · 2026-03-07 23:13:01 -08:00 · 2026-03-07 23:02:25 -08:00 · 2026-03-07 22:55:02 -08:00 · 2026-03-07 22:51:59 -08:00
37 changed files with 2840 additions and 586 deletions
--- a/assets/shaders/fsr2_accumulate.comp.glsl
+++ b/assets/shaders/fsr2_accumulate.comp.glsl
@ -0,0 +1,85 @@
 #version 450
 layout(local_size_x = 8, local_size_y = 8) in;
 layout(set = 0, binding = 0) uniform sampler2D sceneColor;
 layout(set = 0, binding = 1) uniform sampler2D depthBuffer;
 layout(set = 0, binding = 2) uniform sampler2D motionVectors;
 layout(set = 0, binding = 3) uniform sampler2D historyInput;
 layout(set = 0, binding = 4, rgba16f) uniform writeonly image2D historyOutput;
 // Push constants for the temporal accumulation pass.
 // Only internalSize.zw, displaySize and params.x are read by main() in this shader.
 layout(push_constant) uniform PushConstants {
    vec4 internalSize;   // xy = internal resolution, zw = 1/internal
    vec4 displaySize;    // xy = display resolution, zw = 1/display
    vec4 jitterOffset;   // xy = current jitter (NDC-space), zw = unused; NOTE(review): never read by main()
    vec4 params;         // x = resetHistory (1=reset), y = sharpness (not read by main()), zw = unused
 } pc;
 // Converts an RGB colour to YCoCg: x = Y (luma), y = Co, z = Cg.
 vec3 rgbToYCoCg(vec3 rgb) {
    return vec3(
        0.25 * rgb.r + 0.5 * rgb.g + 0.25 * rgb.b,    // Y
        0.5  * rgb.r                - 0.5  * rgb.b,   // Co
        -0.25 * rgb.r + 0.5 * rgb.g - 0.25 * rgb.b);  // Cg
 }
 // Converts a YCoCg colour (x = Y, y = Co, z = Cg) back to RGB; inverse of rgbToYCoCg.
 vec3 yCoCgToRgb(vec3 ycocg) {
    return vec3(
        ycocg.x + ycocg.y - ycocg.z,   // R = Y + Co - Cg
        ycocg.x + ycocg.z,             // G = Y + Cg
        ycocg.x - ycocg.y - ycocg.z);  // B = Y - Co - Cg
 }
 // Temporal accumulation pass: one invocation per display-resolution pixel.
 // Reprojects the previous history through the motion-vector buffer, clamps it
 // to the current frame's 3x3 colour statistics (YCoCg space) and blends it
 // with the current colour into historyOutput.
 void main() {
    // Threads outside the display rectangle (partial 8x8 workgroups) do nothing.
    ivec2 outPixel = ivec2(gl_GlobalInvocationID.xy);
    ivec2 outSize = ivec2(pc.displaySize.xy);
    if (outPixel.x >= outSize.x || outPixel.y >= outSize.y) return;
    vec2 outUV = (vec2(outPixel) + 0.5) * pc.displaySize.zw;
    vec3 currentColor = texture(sceneColor, outUV).rgb;
    // History reset requested: seed the history with the current frame only.
    if (pc.params.x > 0.5) {
        imageStore(historyOutput, outPixel, vec4(currentColor, 1.0));
        return;
    }
    vec2 motion = texture(motionVectors, outUV).rg;
    vec2 historyUV = outUV + motion;
    // Reject history that reprojects off-screen. The comparisons are all false
    // for a NaN UV, so a NaN motion vector is rejected as well.
    bool historyValid = historyUV.x >= 0.0 && historyUV.x <= 1.0 &&
                        historyUV.y >= 0.0 && historyUV.y <= 1.0;
    if (!historyValid) {
        // Same result as blending with weight 1.0 toward the current colour, but
        // the history texel never enters the arithmetic, so a NaN/Inf history
        // value cannot leak into the output via NaN * 0.
        imageStore(historyOutput, outPixel, vec4(currentColor, 1.0));
        return;
    }
    vec3 historyColor = texture(historyInput, historyUV).rgb;
    // Neighborhood clamping in YCoCg space: 3x3 taps spaced one internal-resolution texel apart
    vec2 texelSize = pc.internalSize.zw;
    vec3 s0 = rgbToYCoCg(currentColor);
    vec3 s1 = rgbToYCoCg(texture(sceneColor, outUV + vec2(-texelSize.x, 0.0)).rgb);
    vec3 s2 = rgbToYCoCg(texture(sceneColor, outUV + vec2( texelSize.x, 0.0)).rgb);
    vec3 s3 = rgbToYCoCg(texture(sceneColor, outUV + vec2(0.0, -texelSize.y)).rgb);
    vec3 s4 = rgbToYCoCg(texture(sceneColor, outUV + vec2(0.0,  texelSize.y)).rgb);
    vec3 s5 = rgbToYCoCg(texture(sceneColor, outUV + vec2(-texelSize.x, -texelSize.y)).rgb);
    vec3 s6 = rgbToYCoCg(texture(sceneColor, outUV + vec2( texelSize.x, -texelSize.y)).rgb);
    vec3 s7 = rgbToYCoCg(texture(sceneColor, outUV + vec2(-texelSize.x,  texelSize.y)).rgb);
    vec3 s8 = rgbToYCoCg(texture(sceneColor, outUV + vec2( texelSize.x,  texelSize.y)).rgb);
    // First and second moments of the neighbourhood give mean and variance per channel.
    vec3 m1 = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8;
    vec3 m2 = s0*s0 + s1*s1 + s2*s2 + s3*s3 + s4*s4 + s5*s5 + s6*s6 + s7*s7 + s8*s8;
    vec3 mean = m1 / 9.0;
    // max() guards against a slightly negative variance from rounding before sqrt().
    vec3 variance = max(m2 / 9.0 - mean * mean, vec3(0.0));
    vec3 stddev = sqrt(variance);
    // Clamp box is mean +/- gamma standard deviations.
    float gamma = 1.5;
    vec3 boxMin = mean - gamma * stddev;
    vec3 boxMax = mean + gamma * stddev;
    vec3 historyYCoCg = rgbToYCoCg(historyColor);
    vec3 clampedHistory = clamp(historyYCoCg, boxMin, boxMax);
    historyColor = yCoCgToRgb(clampedHistory);
    // The further the history had to be pulled into the box, the more weight
    // the current frame gets (0.05 when untouched, up to 0.30).
    float clampDist = length(historyYCoCg - clampedHistory);
    float blendFactor = mix(0.05, 0.30, clamp(clampDist * 2.0, 0.0, 1.0));
    vec3 result = mix(historyColor, currentColor, blendFactor);
    imageStore(historyOutput, outPixel, vec4(result, 1.0));
 }
--- a/assets/shaders/fsr2_accumulate.comp.spv
+++ b/assets/shaders/fsr2_accumulate.comp.spv
--- a/assets/shaders/fsr2_motion.comp.glsl
+++ b/assets/shaders/fsr2_motion.comp.glsl
@ -0,0 +1,35 @@
 #version 450
 layout(local_size_x = 8, local_size_y = 8) in;
 layout(set = 0, binding = 0) uniform sampler2D depthBuffer;
 layout(set = 0, binding = 1, rg16f) uniform writeonly image2D motionVectors;
 layout(push_constant) uniform PushConstants {
    mat4 reprojMatrix;      // prevUnjitteredVP * inverse(currentUnjitteredVP)
    vec4 resolution;        // xy = internal size, zw = 1/internal size
 } pc;
 // Motion-vector pass: one invocation per internal-resolution pixel.
 // Reprojects the pixel (with its depth) through pc.reprojMatrix and stores
 // the UV-space offset from the current position to the previous position.
 void main() {
    // Threads outside the image (partial 8x8 workgroups) do nothing.
    ivec2 pixelCoord = ivec2(gl_GlobalInvocationID.xy);
    ivec2 imgSize = ivec2(pc.resolution.xy);
    if (pixelCoord.x >= imgSize.x || pixelCoord.y >= imgSize.y) return;
    // Sample depth (Vulkan: 0 = near, 1 = far)
    float depth = texelFetch(depthBuffer, pixelCoord, 0).r;
    // Pixel center in UV [0,1] and NDC [-1,1]
    vec2 uv = (vec2(pixelCoord) + 0.5) * pc.resolution.zw;
    vec2 ndc = uv * 2.0 - 1.0;
    // Clip-to-clip reprojection: current unjittered clip → previous unjittered clip
    vec4 clipPos = vec4(ndc, depth, 1.0);
    vec4 prevClip = pc.reprojMatrix * clipPos;
    // A non-positive w means the point lies on or behind the previous camera
    // plane: the perspective divide would yield Inf/NaN or a mirrored position
    // that can masquerade as valid motion. Store an offset of 2.0 UV instead, so
    // uv + motion falls outside [0,1] and the [0,1] bounds check on the
    // reprojected UV in fsr2_accumulate.comp rejects the history for this pixel.
    if (prevClip.w <= 1e-6) {
        imageStore(motionVectors, pixelCoord, vec4(2.0, 2.0, 0.0, 0.0));
        return;
    }
    vec2 prevNdc = prevClip.xy / prevClip.w;
    vec2 prevUV = prevNdc * 0.5 + 0.5;
    // Motion = previous position - current position (both unjittered, in UV space)
    vec2 motion = prevUV - uv;
    imageStore(motionVectors, pixelCoord, vec4(motion, 0.0, 0.0));
 }
--- a/assets/shaders/fsr2_motion.comp.spv
+++ b/assets/shaders/fsr2_motion.comp.spv
--- a/assets/shaders/fsr2_sharpen.frag.glsl
+++ b/assets/shaders/fsr2_sharpen.frag.glsl
@ -0,0 +1,50 @@
 #version 450
 layout(location = 0) in vec2 TexCoord;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 0) uniform sampler2D inputImage;
 layout(push_constant) uniform PushConstants {
    vec4 params;  // x = 1/width, y = 1/height, z = sharpness (0-2), w = unused
 } pc;
 // Contrast-adaptive sharpening pass: 5-tap cross unsharp mask whose strength
 // falls off with local contrast, clamped to the neighbourhood min/max.
 void main() {
    // Undo the vertex shader Y flip (postprocess.vert flips for Vulkan overlay,
    // but we need standard UV coords for texture sampling)
    vec2 tc = vec2(TexCoord.x, 1.0 - TexCoord.y);
    vec2 texelSize = pc.params.xy;
    float sharpness = pc.params.z;
    // 5-tap cross pattern
    vec3 center = texture(inputImage, tc).rgb;
    vec3 north  = texture(inputImage, tc + vec2(0.0, -texelSize.y)).rgb;
    vec3 south  = texture(inputImage, tc + vec2(0.0,  texelSize.y)).rgb;
    vec3 west   = texture(inputImage, tc + vec2(-texelSize.x, 0.0)).rgb;
    vec3 east   = texture(inputImage, tc + vec2( texelSize.x, 0.0)).rgb;
    // Compute local contrast (min/max of neighborhood)
    vec3 minRGB = min(center, min(min(north, south), min(west, east)));
    vec3 maxRGB = max(center, max(max(north, south), max(west, east)));
    // Adaptive sharpening weight based on local contrast
    // High contrast = less sharpening (prevent ringing)
    vec3 range = maxRGB - minRGB;
    // Largest per-channel range drives the weight; it reaches zero at a range of 0.5.
    float lumaRange = max(range.r, max(range.g, range.b));
    float w = clamp(1.0 - lumaRange * 2.0, 0.0, 1.0) * sharpness * 0.25;
    // Apply sharpening via unsharp mask
    vec3 avg = (north + south + west + east) * 0.25;
    vec3 sharpened = center + (center - avg) * w;
    // Clamp to prevent ringing artifacts
    sharpened = clamp(sharpened, minRGB, maxRGB);
    FragColor = vec4(sharpened, 1.0);
 }
--- a/assets/shaders/fsr2_sharpen.frag.spv
+++ b/assets/shaders/fsr2_sharpen.frag.spv
--- a/assets/shaders/fsr_easu.frag.glsl
+++ b/assets/shaders/fsr_easu.frag.glsl
@ -0,0 +1,102 @@
 #version 450
 // FSR 1.0 EASU (Edge Adaptive Spatial Upsampling) — Fragment Shader
 // Based on AMD FidelityFX Super Resolution 1.0
 // Implements edge-adaptive bilinear upsampling with directional filtering
 layout(set = 0, binding = 0) uniform sampler2D uInput;
 layout(push_constant) uniform FSRConstants {
    vec4 con0; // inputSize.xy, 1/inputSize.xy
    vec4 con1; // inputSize.xy / outputSize.xy, 0.5 * inputSize.xy / outputSize.xy
    vec4 con2; // outputSize.xy, 1/outputSize.xy
    vec4 con3; // sharpness, 0, 0, 0
 } fsr;
 layout(location = 0) in vec2 TexCoord;
 layout(location = 0) out vec4 outColor;
 // Samples mip 0 of the input at the centre of texel (p + off), where p and
 // off are expressed in input pixels and con0.zw holds 1/inputSize.
 vec3 fsrFetch(vec2 p, vec2 off) {
    vec2 texelCenterUV = (p + off + 0.5) * fsr.con0.zw;
    return textureLod(uInput, texelCenterUV, 0.0).rgb;
 }
 // Upscaling pass: bilinear blend of the 4 nearest input texels whose weights
 // are pulled toward the nearest texel in proportion to local edge strength,
 // plus a small contribution from the surrounding 8 texels on edges.
 void main() {
    // Undo the vertex shader Y flip (postprocess.vert flips for Vulkan overlay,
    // but we need standard UV coords for texture sampling)
    vec2 tc = vec2(TexCoord.x, 1.0 - TexCoord.y);
    // Map output pixel to input space
    vec2 pp = tc * fsr.con2.xy; // output pixel position
    vec2 ip = pp * fsr.con1.xy - 0.5; // input pixel position (centered)
    vec2 fp = floor(ip);
    vec2 ff = ip - fp;
    // 12-tap filter: 4x3 grid around the pixel
    //  b c
    // e f g h
    // i j k l
    //  n o
    vec3 b = fsrFetch(fp, vec2( 0, -1));
    vec3 c = fsrFetch(fp, vec2( 1, -1));
    vec3 e = fsrFetch(fp, vec2(-1,  0));
    vec3 f = fsrFetch(fp, vec2( 0,  0));
    vec3 g = fsrFetch(fp, vec2( 1,  0));
    vec3 h = fsrFetch(fp, vec2( 2,  0));
    vec3 i = fsrFetch(fp, vec2(-1,  1));
    vec3 j = fsrFetch(fp, vec2( 0,  1));
    vec3 k = fsrFetch(fp, vec2( 1,  1));
    vec3 l = fsrFetch(fp, vec2( 2,  1));
    vec3 n = fsrFetch(fp, vec2( 0,  2));
    vec3 o = fsrFetch(fp, vec2( 1,  2));
    // Luma (use green channel as good perceptual approximation).
    // Only the taps that feed the edge-length sums below need a luma value.
    float bL = b.g, cL = c.g, eL = e.g, fL = f.g;
    float gL = g.g, iL = i.g, jL = j.g, kL = k.g;
    // Sum of absolute luma differences along the horizontal and vertical directions
    float lenH = abs(eL - fL) + abs(fL - gL) + abs(iL - jL) + abs(jL - kL);
    float lenV = abs(bL - fL) + abs(fL - jL) + abs(cL - gL) + abs(gL - kL);
    // Bilinear weights
    float w1 = (1.0 - ff.x) * (1.0 - ff.y);
    float w2 = ff.x * (1.0 - ff.y);
    float w3 = (1.0 - ff.x) * ff.y;
    float w4 = ff.x * ff.y;
    // Edge-aware sharpening: boost weights along edges.
    // edgeStr is the normalised H/V imbalance; 1e-7 avoids 0/0 on flat areas.
    float sharpness = fsr.con3.x;
    float edgeStr = max(abs(lenH - lenV) / (lenH + lenV + 1e-7), 0.0);
    float sharp = mix(0.0, sharpness, edgeStr);
    // Sharpen bilinear by pulling toward nearest texel
    float maxW = max(max(w1, w2), max(w3, w4));
    w1 = mix(w1, float(w1 == maxW), sharp * 0.25);
    w2 = mix(w2, float(w2 == maxW), sharp * 0.25);
    w3 = mix(w3, float(w3 == maxW), sharp * 0.25);
    w4 = mix(w4, float(w4 == maxW), sharp * 0.25);
    // Normalize
    float wSum = w1 + w2 + w3 + w4;
    w1 /= wSum; w2 /= wSum; w3 /= wSum; w4 /= wSum;
    // Final color: weighted blend of the 4 nearest texels with edge awareness
    vec3 color = f * w1 + g * w2 + j * w3 + k * w4;
    // Optional: blend in some of the surrounding texels for anti-aliasing
    float aa = 0.125 * edgeStr;
    color = mix(color, (b + c + e + h + i + l + n + o) / 8.0, aa * 0.15);
    outColor = vec4(clamp(color, 0.0, 1.0), 1.0);
 }
--- a/assets/shaders/fsr_easu.frag.spv
+++ b/assets/shaders/fsr_easu.frag.spv
--- a/assets/shaders/fsr_rcas.frag.glsl
+++ b/assets/shaders/fsr_rcas.frag.glsl
@ -0,0 +1,43 @@
 #version 450
 // FSR 1.0 RCAS (Robust Contrast Adaptive Sharpening) — Fragment Shader
 // Based on AMD FidelityFX Super Resolution 1.0
 // Applies contrast-adaptive sharpening after EASU upscaling
 layout(set = 0, binding = 0) uniform sampler2D uInput;
 layout(push_constant) uniform RCASConstants {
    vec4 con0; // 1/outputSize.xy, outputSize.xy
    vec4 con1; // sharpness (x), 0, 0, 0
 } rcas;
 layout(location = 0) in vec2 TexCoord;
 layout(location = 0) out vec4 outColor;
 // Sharpening pass: adds a contrast-weighted unsharp mask built from the
 // 4-neighbourhood to the centre texel, then clamps the result to [0,1].
 // NOTE(review): unlike fsr_easu.frag and fsr2_sharpen.frag, this shader samples
 // with TexCoord directly and does not undo a vertex-stage Y flip — confirm
 // which vertex shader it is paired with.
 void main() {
    // Fetch center and 4-neighborhood
    vec2 texelSize = rcas.con0.xy;
    vec3 c = texture(uInput, TexCoord).rgb;
    vec3 n = texture(uInput, TexCoord + vec2( 0, -texelSize.y)).rgb;
    vec3 s = texture(uInput, TexCoord + vec2( 0,  texelSize.y)).rgb;
    vec3 w = texture(uInput, TexCoord + vec2(-texelSize.x,  0)).rgb;
    vec3 e = texture(uInput, TexCoord + vec2( texelSize.x,  0)).rgb;
    // Luma (green channel approximation) of the four neighbours
    float nL = n.g, sL = s.g, wL = w.g, eL = e.g;
    // Min/max of neighborhood (the centre texel is not included)
    float minL = min(min(nL, sL), min(wL, eL));
    float maxL = max(max(nL, sL), max(wL, eL));
    // Contrast adaptive sharpening weight
    // Higher contrast = less sharpening to avoid ringing
    float contrast = maxL - minL;
    float sharpness = rcas.con1.x;
    float w0 = sharpness * (1.0 - smoothstep(0.0, 0.3, contrast));
    // Apply sharpening: center + w0 * (center - average_neighbors)
    vec3 avg = (n + s + w + e) * 0.25;
    vec3 sharpened = c + w0 * (c - avg);
    outColor = vec4(clamp(sharpened, 0.0, 1.0), 1.0);
 }
--- a/assets/shaders/fsr_rcas.frag.spv
+++ b/assets/shaders/fsr_rcas.frag.spv
--- a/assets/shaders/wmo.frag.glsl
+++ b/assets/shaders/wmo.frag.glsl
@ -149,21 +149,21 @@ void main() {
    vec3 norm = vertexNormal;
    if (enableNormalMap != 0 && lodFactor < 0.99 && normalMapStrength > 0.001) {
        vec3 mapNormal = texture(uNormalHeightMap, finalUV).rgb * 2.0 - 1.0;
        // Scale XY by strength to control effect intensity
        mapNormal.xy *= normalMapStrength;
        mapNormal = normalize(mapNormal);
        vec3 worldNormal = normalize(TBN * mapNormal);
        if (!gl_FrontFacing) worldNormal = -worldNormal;
-        // Blend: strength + LOD both contribute to fade toward vertex normal
+        // Linear blend: strength controls how much normal map detail shows,
-        float blendFactor = max(lodFactor, 1.0 - normalMapStrength);
+        // LOD fades out at distance. Both multiply for smooth falloff.
-        norm = normalize(mix(worldNormal, vertexNormal, blendFactor));
+        float blend = clamp(normalMapStrength, 0.0, 1.0) * (1.0 - lodFactor);
        norm = normalize(mix(vertexNormal, worldNormal, blend));
    }
    vec3 result;
-    // Sample shadow map — skip for interior WMO groups (no sun indoors)
+    // Sample shadow map for all WMO groups (interior groups with 0x2000 flag
    // include covered outdoor areas like archways/streets that should receive shadows)
    float shadow = 1.0;
-    if (shadowParams.x > 0.5 && isInterior == 0) {
+    if (shadowParams.x > 0.5) {
        vec3 ldir = normalize(-lightDir.xyz);
        float normalOffset = SHADOW_TEXEL * 2.0 * (1.0 - abs(dot(norm, ldir)));
        vec3 biasedPos = FragPos + norm * normalOffset;
--- a/assets/shaders/wmo.frag.spv
+++ b/assets/shaders/wmo.frag.spv
--- a/include/core/application.hpp
+++ b/include/core/application.hpp
@ -215,7 +215,7 @@ private:
        std::future<PreparedCreatureModel> future;
    };
    std::vector<AsyncCreatureLoad> asyncCreatureLoads_;
-    void processAsyncCreatureResults();
+    void processAsyncCreatureResults(bool unlimited = false);
    static constexpr int MAX_ASYNC_CREATURE_LOADS = 4; // concurrent background loads
    std::unordered_set<uint64_t> deadCreatureGuids_;            // GUIDs that should spawn in corpse/death pose
    std::unordered_map<uint32_t, uint32_t> displayIdModelCache_; // displayId → modelId (model caching)
@ -236,6 +236,11 @@ private:
    std::optional<PendingWorldEntry> pendingWorldEntry_;  // Deferred world entry during loading
    float taxiLandingClampTimer_ = 0.0f;
    float worldEntryMovementGraceTimer_ = 0.0f;
    // Hearth teleport: freeze player until terrain loads at destination
    bool hearthTeleportPending_ = false;
    glm::vec3 hearthTeleportPos_{0.0f};  // render coords
    float hearthTeleportTimer_ = 0.0f;   // timeout safety
    float facingSendCooldown_ = 0.0f;        // Rate-limits MSG_MOVE_SET_FACING
    float lastSentCanonicalYaw_ = 1000.0f;   // Sentinel — triggers first send
    float taxiStreamCooldown_ = 0.0f;
@ -373,7 +378,7 @@ private:
    std::unordered_set<uint64_t> pendingPlayerSpawnGuids_;
    void processPlayerSpawnQueue();
    std::unordered_set<uint64_t> creaturePermanentFailureGuids_;
-    void processCreatureSpawnQueue();
+    void processCreatureSpawnQueue(bool unlimited = false);
    struct PendingGameObjectSpawn {
        uint64_t guid;
--- a/include/game/game_handler.hpp
+++ b/include/game/game_handler.hpp
@ -565,6 +565,8 @@ public:
    void unstuck();
    void setUnstuckGyCallback(UnstuckCallback cb) { unstuckGyCallback_ = std::move(cb); }
    void unstuckGy();
    void setUnstuckHearthCallback(UnstuckCallback cb) { unstuckHearthCallback_ = std::move(cb); }
    void unstuckHearth();
    using BindPointCallback = std::function<void(uint32_t mapId, float x, float y, float z)>;
    void setBindPointCallback(BindPointCallback cb) { bindPointCallback_ = std::move(cb); }
@ -1445,6 +1447,7 @@ private:
    WorldEntryCallback worldEntryCallback_;
    UnstuckCallback unstuckCallback_;
    UnstuckCallback unstuckGyCallback_;
    UnstuckCallback unstuckHearthCallback_;
    BindPointCallback bindPointCallback_;
    CreatureSpawnCallback creatureSpawnCallback_;
    CreatureDespawnCallback creatureDespawnCallback_;
--- a/include/rendering/camera.hpp
+++ b/include/rendering/camera.hpp
@ -23,9 +23,16 @@ public:
    const glm::vec3& getPosition() const { return position; }
    const glm::mat4& getViewMatrix() const { return viewMatrix; }
    const glm::mat4& getProjectionMatrix() const { return projectionMatrix; }
    const glm::mat4& getUnjitteredProjectionMatrix() const { return unjitteredProjectionMatrix; }
    glm::mat4 getViewProjectionMatrix() const { return projectionMatrix * viewMatrix; }
    glm::mat4 getUnjitteredViewProjectionMatrix() const { return unjitteredProjectionMatrix * viewMatrix; }
    float getAspectRatio() const { return aspectRatio; }
    // Sub-pixel jitter for temporal upscaling (FSR 2)
    void setJitter(float jx, float jy);
    void clearJitter();
    glm::vec2 getJitter() const { return jitterOffset; }
    glm::vec3 getForward() const;
    glm::vec3 getRight() const;
    glm::vec3 getUp() const;
@ -46,6 +53,8 @@ private:
    glm::mat4 viewMatrix = glm::mat4(1.0f);
    glm::mat4 projectionMatrix = glm::mat4(1.0f);
    glm::mat4 unjitteredProjectionMatrix = glm::mat4(1.0f);
    glm::vec2 jitterOffset = glm::vec2(0.0f);  // NDC jitter (applied to projection)
 };
 } // namespace rendering
--- a/include/rendering/character_renderer.hpp
+++ b/include/rendering/character_renderer.hpp
@ -13,6 +13,8 @@
 #include <utility>
 #include <future>
 #include <deque>
 #include <mutex>
 #include <atomic>
 namespace wowee {
 namespace pipeline { class AssetManager; }
@ -64,6 +66,8 @@ public:
    void update(float deltaTime, const glm::vec3& cameraPos = glm::vec3(0.0f));
    /** Pre-allocate GPU resources (bone SSBOs, descriptors) on main thread before parallel render. */
    void prepareRender(uint32_t frameIndex);
    void render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera);
    void recreatePipelines();
    bool initializeShadow(VkRenderPass shadowRenderPass);
@ -304,15 +308,23 @@ private:
    std::unique_ptr<VkTexture> generateNormalHeightMap(
        const uint8_t* pixels, uint32_t width, uint32_t height, float& outVariance);
-    // Deferred normal map generation — avoids stalling loadModel
+    // Background normal map generation — CPU work on thread pool, GPU upload on main thread
-    struct PendingNormalMap {
+    struct NormalMapResult {
        std::string cacheKey;
-        std::vector<uint8_t> pixels;  // RGBA pixel data
+        std::vector<uint8_t> pixels;  // RGBA normal map output
        uint32_t width, height;
        float variance;
    };
-    std::deque<PendingNormalMap> pendingNormalMaps_;
+    // Completed results ready for GPU upload (populated by background threads)
    std::mutex normalMapResultsMutex_;
    std::deque<NormalMapResult> completedNormalMaps_;
    std::atomic<int> pendingNormalMapCount_{0};  // in-flight background tasks
    // Pure CPU normal map generation (thread-safe, no GPU access)
    static NormalMapResult generateNormalHeightMapCPU(
        std::string cacheKey, std::vector<uint8_t> pixels, uint32_t width, uint32_t height);
 public:
-    void processPendingNormalMaps(int budget = 2);
+    void processPendingNormalMaps(int budget = 4);
 private:
    // Normal mapping / POM settings
--- a/include/rendering/loading_screen.hpp
+++ b/include/rendering/loading_screen.hpp
@ -24,6 +24,10 @@ public:
    // Render the loading screen with progress bar and status text (pure ImGui)
    void render();
    // Draw loading screen as ImGui overlay (call within an existing ImGui frame).
    // Used during warmup to overlay loading screen on top of the rendered world.
    void renderOverlay();
    void setProgress(float progress) { loadProgress = progress; }
    void setStatus(const std::string& status) { statusText = status; }
--- a/include/rendering/m2_renderer.hpp
+++ b/include/rendering/m2_renderer.hpp
@ -122,6 +122,7 @@ struct M2ModelGPU {
    bool isKoboldFlame = false;     // Model name matches kobold+(candle/torch/mine) (precomputed)
    bool isLavaModel = false;       // Model name contains lava/molten/magma (UV scroll fallback)
    bool hasTextureAnimation = false; // True if any batch has UV animation
    uint8_t availableLODs = 0;  // Bitmask: bit N set if any batch has submeshLevel==N
    // Particle emitter data (kept from M2Model)
    std::vector<pipeline::M2ParticleEmitter> particleEmitters;
@ -193,6 +194,7 @@ struct M2Instance {
    // Frame-skip optimization (update distant animations less frequently)
    uint8_t frameSkipCounter = 0;
    bool bonesDirty[2] = {false, false};  // Per-frame-index: set when bones recomputed, cleared after upload
    // Per-instance bone SSBO (double-buffered)
    ::VkBuffer boneBuffer[2] = {};
@ -265,6 +267,8 @@ public:
    /**
     * Render all visible instances (Vulkan)
     */
    /** Pre-allocate GPU resources (bone SSBOs, descriptors) on main thread before parallel render. */
    void prepareRender(uint32_t frameIndex, const Camera& camera);
    void render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera);
    /**
@ -471,9 +475,7 @@ private:
    static constexpr float SPATIAL_CELL_SIZE = 64.0f;
    std::unordered_map<GridCell, std::vector<uint32_t>, GridCellHash> spatialGrid;
    std::unordered_map<uint32_t, size_t> instanceIndexById;
-    mutable std::vector<size_t> candidateScratch;
+    // Collision scratch buffers are thread_local (see m2_renderer.cpp) for thread-safety.
    mutable std::unordered_set<uint32_t> candidateIdScratch;
    mutable std::vector<uint32_t> collisionTriScratch_;
    // Collision query profiling (per frame).
    mutable double queryTimeMs = 0.0;
--- a/include/rendering/renderer.hpp
+++ b/include/rendering/renderer.hpp
@ -4,10 +4,12 @@
 #include <string>
 #include <cstdint>
 #include <vector>
 #include <future>
 #include <glm/glm.hpp>
 #include <vulkan/vulkan.h>
 #include <vk_mem_alloc.h>
 #include "rendering/vk_frame_data.hpp"
 #include "rendering/vk_utils.hpp"
 #include "rendering/sky_system.hpp"
 namespace wowee {
@ -244,7 +246,7 @@ private:
    glm::vec3 shadowCenter = glm::vec3(0.0f);
    bool shadowCenterInitialized = false;
    bool shadowsEnabled = true;
-    float shadowDistance_ = 72.0f;  // Shadow frustum half-extent (default: 72 units)
+    float shadowDistance_ = 300.0f;  // Shadow frustum half-extent (default: 300 units)
    uint32_t shadowFrameCounter_ = 0;
@ -255,10 +257,20 @@ public:
    void setShadowsEnabled(bool enabled) { shadowsEnabled = enabled; }
    bool areShadowsEnabled() const { return shadowsEnabled; }
-    void setShadowDistance(float dist) { shadowDistance_ = glm::clamp(dist, 40.0f, 200.0f); }
+    void setShadowDistance(float dist) { shadowDistance_ = glm::clamp(dist, 40.0f, 500.0f); }
    float getShadowDistance() const { return shadowDistance_; }
    void setMsaaSamples(VkSampleCountFlagBits samples);
    // FSR (FidelityFX Super Resolution) upscaling
    void setFSREnabled(bool enabled);
    bool isFSREnabled() const { return fsr_.enabled; }
    void setFSRQuality(float scaleFactor);  // 0.50=Perf, 0.59=Balanced, 0.67=Quality, 0.77=UltraQuality
    void setFSRSharpness(float sharpness);  // 0.0 - 2.0
    float getFSRScaleFactor() const { return fsr_.scaleFactor; }
    float getFSRSharpness() const { return fsr_.sharpness; }
    void setFSR2Enabled(bool enabled);
    bool isFSR2Enabled() const { return fsr2_.enabled; }
    void setWaterRefractionEnabled(bool enabled);
    bool isWaterRefractionEnabled() const;
@ -312,7 +324,7 @@ private:
    VmaAllocation selCircleIdxAlloc = VK_NULL_HANDLE;
    int selCircleVertCount = 0;
    void initSelectionCircle();
-    void renderSelectionCircle(const glm::mat4& view, const glm::mat4& projection);
+    void renderSelectionCircle(const glm::mat4& view, const glm::mat4& projection, VkCommandBuffer overrideCmd = VK_NULL_HANDLE);
    glm::vec3 selCirclePos{0.0f};
    glm::vec3 selCircleColor{1.0f, 0.0f, 0.0f};
    float selCircleRadius = 1.5f;
@ -322,7 +334,95 @@ private:
    VkPipeline overlayPipeline = VK_NULL_HANDLE;
    VkPipelineLayout overlayPipelineLayout = VK_NULL_HANDLE;
    void initOverlayPipeline();
-    void renderOverlay(const glm::vec4& color);
+    void renderOverlay(const glm::vec4& color, VkCommandBuffer overrideCmd = VK_NULL_HANDLE);
    // FSR 1.0 upscaling state
    // Settings and Vulkan objects for the FSR 1.0 upscaling path.
    // The renderer holds a single instance in fsr_.
    struct FSRState {
        bool enabled = false;       // Reported by isFSREnabled()
        bool needsRecreate = false; // NOTE(review): presumably requests a rebuild of the resources below — confirm in renderer.cpp
        float scaleFactor = 0.77f;  // Ultra Quality default
        float sharpness = 0.5f;     // Reported by getFSRSharpness()
        uint32_t internalWidth = 0;
        uint32_t internalHeight = 0;
        // Off-screen scene target (reduced resolution)
        AllocatedImage sceneColor{};        // 1x color (non-MSAA render target / MSAA resolve target)
        AllocatedImage sceneDepth{};        // Depth (matches current MSAA sample count)
        AllocatedImage sceneMsaaColor{};    // MSAA color target (only when MSAA > 1x)
        AllocatedImage sceneDepthResolve{}; // Depth resolve (only when MSAA + depth resolve)
        VkFramebuffer sceneFramebuffer = VK_NULL_HANDLE;
        VkSampler sceneSampler = VK_NULL_HANDLE;
        // Upscale pipeline
        VkPipeline pipeline = VK_NULL_HANDLE;
        VkPipelineLayout pipelineLayout = VK_NULL_HANDLE;
        VkDescriptorSetLayout descSetLayout = VK_NULL_HANDLE;
        VkDescriptorPool descPool = VK_NULL_HANDLE;
        VkDescriptorSet descSet = VK_NULL_HANDLE;
    };
    FSRState fsr_;
    bool initFSRResources();
    void destroyFSRResources();
    void renderFSRUpscale();
    // FSR 2.2 temporal upscaling state
    // Settings, Vulkan objects and previous-frame data for the FSR 2.2 temporal
    // upscaling path. The renderer holds a single instance in fsr2_.
    struct FSR2State {
        bool enabled = false;       // Reported by isFSR2Enabled()
        bool needsRecreate = false; // NOTE(review): presumably requests a rebuild of the resources below — confirm in renderer.cpp
        float scaleFactor = 0.77f;
        float sharpness = 0.5f;
        uint32_t internalWidth = 0;
        uint32_t internalHeight = 0;
        // Off-screen scene targets (internal resolution, no MSAA — FSR2 replaces AA)
        AllocatedImage sceneColor{};
        AllocatedImage sceneDepth{};
        VkFramebuffer sceneFramebuffer = VK_NULL_HANDLE;
        // Samplers
        VkSampler linearSampler = VK_NULL_HANDLE;   // For color
        VkSampler nearestSampler = VK_NULL_HANDLE;  // For depth / motion vectors
        // Motion vector buffer (internal resolution)
        AllocatedImage motionVectors{};
        // History buffers (display resolution, ping-pong)
        AllocatedImage history[2]{};
        uint32_t currentHistory = 0;  // Output index (0 or 1)
        // Compute pipelines
        // Motion-vector pass
        VkPipeline motionVecPipeline = VK_NULL_HANDLE;
        VkPipelineLayout motionVecPipelineLayout = VK_NULL_HANDLE;
        VkDescriptorSetLayout motionVecDescSetLayout = VK_NULL_HANDLE;
        VkDescriptorPool motionVecDescPool = VK_NULL_HANDLE;
        VkDescriptorSet motionVecDescSet = VK_NULL_HANDLE;
        // Temporal accumulation pass
        VkPipeline accumulatePipeline = VK_NULL_HANDLE;
        VkPipelineLayout accumulatePipelineLayout = VK_NULL_HANDLE;
        VkDescriptorSetLayout accumulateDescSetLayout = VK_NULL_HANDLE;
        VkDescriptorPool accumulateDescPool = VK_NULL_HANDLE;
        VkDescriptorSet accumulateDescSets[2] = {};  // Per ping-pong
        // RCAS sharpening pass (display resolution)
        VkPipeline sharpenPipeline = VK_NULL_HANDLE;
        VkPipelineLayout sharpenPipelineLayout = VK_NULL_HANDLE;
        VkDescriptorSetLayout sharpenDescSetLayout = VK_NULL_HANDLE;
        VkDescriptorPool sharpenDescPool = VK_NULL_HANDLE;
        VkDescriptorSet sharpenDescSets[2] = {};  // NOTE(review): two sets, presumably alternated so an in-flight command buffer's set is not rewritten — confirm
        // Previous frame state for motion vector reprojection
        glm::mat4 prevViewProjection = glm::mat4(1.0f);
        glm::vec2 prevJitter = glm::vec2(0.0f);
        uint32_t frameIndex = 0;        // NOTE(review): presumably the index fed to halton() for jitter — confirm in renderer.cpp
        bool needsHistoryReset = true;  // Starts true, so history begins in the reset state
    };
    FSR2State fsr2_;
    bool initFSR2Resources();
    void destroyFSR2Resources();
    void dispatchMotionVectors();
    void dispatchTemporalAccumulate();
    void renderFSR2Sharpen();
    static float halton(uint32_t index, uint32_t base);
    // Footstep event tracking (animation-driven)
    uint32_t footstepLastAnimationId = 0;
@ -411,6 +511,36 @@ private:
    void setupWater1xPass();
    void renderReflectionPass();
    // ── Multithreaded secondary command buffer recording ──
    // Indices into secondaryCmds_ arrays
    static constexpr uint32_t SEC_SKY     = 0;  // sky (main thread)
    static constexpr uint32_t SEC_TERRAIN = 1;  // terrain (worker 0)
    static constexpr uint32_t SEC_WMO     = 2;  // WMO (worker 1)
    static constexpr uint32_t SEC_CHARS   = 3;  // selection circle + characters (main thread)
    static constexpr uint32_t SEC_M2      = 4;  // M2 + particles + glow (worker 2)
    static constexpr uint32_t SEC_POST    = 5;  // water + weather + effects (main thread)
    static constexpr uint32_t SEC_IMGUI   = 6;  // ImGui (main thread, non-FSR only)
    static constexpr uint32_t NUM_SECONDARIES = 7;
    static constexpr uint32_t NUM_WORKERS = 3;  // terrain, WMO, M2
    // Per-worker command pools (thread-safe: one pool per thread)
    VkCommandPool workerCmdPools_[NUM_WORKERS] = {};
    // Main-thread command pool for its secondary buffers
    VkCommandPool mainSecondaryCmdPool_ = VK_NULL_HANDLE;
    // Pre-allocated secondary command buffers [secondaryIndex][frameInFlight]
    VkCommandBuffer secondaryCmds_[NUM_SECONDARIES][MAX_FRAMES] = {};
    bool parallelRecordingEnabled_ = false;  // set true after pools/buffers created
    bool createSecondaryCommandResources();
    void destroySecondaryCommandResources();
    VkCommandBuffer beginSecondary(uint32_t secondaryIndex);
    void setSecondaryViewportScissor(VkCommandBuffer cmd);
    // Cached render pass state for secondary buffer inheritance
    VkRenderPass activeRenderPass_ = VK_NULL_HANDLE;
    VkFramebuffer activeFramebuffer_ = VK_NULL_HANDLE;
    VkExtent2D activeRenderExtent_ = {0, 0};
    // Active character previews for off-screen rendering
    std::vector<CharacterPreview*> activePreviews_;
--- a/include/rendering/terrain_manager.hpp
+++ b/include/rendering/terrain_manager.hpp
@ -348,6 +348,7 @@ private:
    int unloadRadius = 7;    // Unload tiles beyond this radius
    float updateInterval = 0.033f;  // Check streaming every 33ms (~30 fps)
    float timeSinceLastUpdate = 0.0f;
    float proactiveStreamTimer_ = 0.0f;
    bool taxiStreamingMode_ = false;
    // Tile size constants (WoW ADT specifications)
--- a/include/rendering/vk_context.hpp
+++ b/include/rendering/vk_context.hpp
@ -84,6 +84,10 @@ public:
    bool isSwapchainDirty() const { return swapchainDirty; }
    void markSwapchainDirty() { swapchainDirty = true; }
    // VSync (present mode)
    bool isVsyncEnabled() const { return vsync_; }
    void setVsync(bool enabled) { vsync_ = enabled; }
    bool isDeviceLost() const { return deviceLost_; }
    // MSAA
@ -145,6 +149,7 @@ private:
    std::vector<VkFramebuffer> swapchainFramebuffers;
    bool swapchainDirty = false;
    bool deviceLost_ = false;
    bool vsync_ = true;
    // Per-frame resources
    FrameData frames[MAX_FRAMES_IN_FLIGHT];
--- a/include/rendering/wmo_renderer.hpp
+++ b/include/rendering/wmo_renderer.hpp
@ -148,6 +148,8 @@ public:
     * @param perFrameSet Per-frame descriptor set (set 0)
     * @param camera Camera for frustum culling
     */
    void render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera);
    /** Pre-update mutable state (frame ID, material UBOs) on main thread before parallel render. */
    void prepareRender();
    /**
@ -332,6 +334,9 @@ public:
    // Defer normal/height map generation during streaming to avoid CPU stalls
    void setDeferNormalMaps(bool defer) { deferNormalMaps_ = defer; }
    // Generate normal/height maps for cached textures that were loaded while deferred
    void backfillNormalMaps();
 private:
    // WMO material UBO — matches WMOMaterial in wmo.frag.glsl
    struct WMOMaterialUBO {
@ -706,9 +711,7 @@ private:
    static constexpr float SPATIAL_CELL_SIZE = 64.0f;
    std::unordered_map<GridCell, std::vector<uint32_t>, GridCellHash> spatialGrid;
    std::unordered_map<uint32_t, size_t> instanceIndexById;
-    mutable std::vector<size_t> candidateScratch;
+    // Collision scratch buffers are thread_local (see wmo_renderer.cpp) for thread-safety.
    mutable std::vector<uint32_t> triScratch_;  // Scratch for collision grid queries
    mutable std::unordered_set<uint32_t> candidateIdScratch;
    // Parallel visibility culling
    uint32_t numCullThreads_ = 1;
@ -720,6 +723,8 @@ private:
        uint32_t distanceCulled = 0;
    };
    std::vector<std::future<void>> cullFutures_;
    std::vector<size_t> visibleInstances_;      // reused per frame
    std::vector<InstanceDrawList> drawLists_;    // reused per frame
    // Collision query profiling (per frame).
    mutable double queryTimeMs = 0.0;
--- a/include/ui/game_screen.hpp
+++ b/include/ui/game_screen.hpp
@ -87,7 +87,7 @@ private:
    bool pendingVsync = false;
    int pendingResIndex = 0;
    bool pendingShadows = true;
-    float pendingShadowDistance = 72.0f;
+    float pendingShadowDistance = 300.0f;
    bool pendingWaterRefraction = false;
    int pendingMasterVolume = 100;
    int pendingMusicVolume = 30;
@ -116,6 +116,10 @@ private:
    float pendingNormalMapStrength = 0.8f;  // 0.0-2.0
    bool pendingPOM = true;             // on by default
    int pendingPOMQuality = 1;          // 0=Low(16), 1=Medium(32), 2=High(64)
    bool pendingFSR = false;
    int pendingFSRQuality = 0;          // 0=UltraQuality, 1=Quality, 2=Balanced, 3=Performance
    float pendingFSRSharpness = 0.5f;
    bool fsrSettingsApplied_ = false;
    // UI element transparency (0.0 = fully transparent, 1.0 = fully opaque)
    float uiOpacity_ = 0.65f;
--- a/src/core/application.cpp
+++ b/src/core/application.cpp
@ -49,9 +49,9 @@
 #include <SDL2/SDL.h>
 // GL/glew.h removed — Vulkan migration Phase 1
 #include <cstdlib>
 #include <climits>
 #include <algorithm>
 #include <cctype>
 #include <cctype>
 #include <optional>
 #include <sstream>
 #include <set>
@ -868,7 +868,7 @@ void Application::update(float deltaTime) {
                }
                auto stageEnd = std::chrono::steady_clock::now();
                float stageMs = std::chrono::duration<float, std::milli>(stageEnd - stageStart).count();
-                if (stageMs > 3.0f) {
+                if (stageMs > 50.0f) {
                    LOG_WARNING("SLOW update stage '", stageName, "': ", stageMs, "ms");
                }
            };
@ -913,29 +913,12 @@ void Application::update(float deltaTime) {
            inGameStep = "spawn/equipment queues";
            updateCheckpoint = "in_game: spawn/equipment queues";
            runInGameStage("spawn/equipment queues", [&] {
                auto t0 = std::chrono::steady_clock::now();
                processPlayerSpawnQueue();
                auto t1 = std::chrono::steady_clock::now();
                processCreatureSpawnQueue();
                auto t2 = std::chrono::steady_clock::now();
                processAsyncNpcCompositeResults();
                auto t3 = std::chrono::steady_clock::now();
                processDeferredEquipmentQueue();
                auto t4 = std::chrono::steady_clock::now();
                // Process deferred normal maps (2 per frame to spread CPU cost)
                if (auto* cr = renderer ? renderer->getCharacterRenderer() : nullptr) {
-                    cr->processPendingNormalMaps(2);
+                    cr->processPendingNormalMaps(4);
                }
                auto t5 = std::chrono::steady_clock::now();
                float pMs = std::chrono::duration<float, std::milli>(t1 - t0).count();
                float cMs = std::chrono::duration<float, std::milli>(t2 - t1).count();
                float nMs = std::chrono::duration<float, std::milli>(t3 - t2).count();
                float eMs = std::chrono::duration<float, std::milli>(t4 - t3).count();
                float nmMs = std::chrono::duration<float, std::milli>(t5 - t4).count();
                float total = pMs + cMs + nMs + eMs + nmMs;
                if (total > 4.0f) {
                    LOG_WARNING("spawn/equip breakdown: player=", pMs, "ms creature=", cMs,
                                "ms npcComposite=", nMs, "ms equip=", eMs, "ms normalMaps=", nmMs, "ms");
                }
            });
            // Self-heal missing creature visuals: if a nearby UNIT exists in
@ -1032,14 +1015,33 @@ void Application::update(float deltaTime) {
                    if (renderer && renderer->getCameraController())
                        renderer->getCameraController()->clearMovementInputs();
                }
                // Hearth teleport: keep player frozen until terrain loads at destination.
                // Resolved here once a terrain height is available at the target position
                // or the safety timer runs out.
                if (hearthTeleportPending_ && renderer && renderer->getTerrainManager()) {
                    hearthTeleportTimer_ -= deltaTime;
                    const auto terrainH = renderer->getTerrainManager()->getHeightAt(
                        hearthTeleportPos_.x, hearthTeleportPos_.y);
                    if (terrainH || hearthTeleportTimer_ <= 0.0f) {
                        // The camera controller is null-checked by the neighbouring code,
                        // so do not assume it exists here either.
                        auto* cameraController = renderer->getCameraController();
                        if (terrainH) {
                            // Terrain loaded — snap slightly above the floor.
                            hearthTeleportPos_.z = *terrainH + 0.5f;
                            if (cameraController) cameraController->teleportTo(hearthTeleportPos_);
                        }
                        if (cameraController) cameraController->setExternalFollow(false);
                        worldEntryMovementGraceTimer_ = 1.0f;
                        hearthTeleportPending_ = false;
                        // Report which condition released the player; the timeout path
                        // must not claim that terrain was loaded.
                        LOG_INFO(terrainH ? "Unstuck hearth: terrain loaded, player released"
                                          : "Unstuck hearth: timed out waiting for terrain, player released");
                    }
                }
                if (renderer && renderer->getCameraController()) {
                const bool externallyDrivenMotion = onTaxi || onWMOTransport || chargeActive_;
                // Keep physics frozen (externalFollow) during landing clamp when terrain
                // hasn't loaded yet — prevents gravity from pulling player through void.
                bool hearthFreeze = hearthTeleportPending_;
                bool landingClampActive = !onTaxi && taxiLandingClampTimer_ > 0.0f &&
                                          worldEntryMovementGraceTimer_ <= 0.0f &&
                                          !gameHandler->isMounted();
-                renderer->getCameraController()->setExternalFollow(externallyDrivenMotion || landingClampActive);
+                renderer->getCameraController()->setExternalFollow(externallyDrivenMotion || landingClampActive || hearthFreeze);
                renderer->getCameraController()->setExternalMoving(externallyDrivenMotion);
                if (externallyDrivenMotion) {
                    // Drop any stale local movement toggles while server drives taxi motion.
@ -1514,7 +1516,7 @@ void Application::update(float deltaTime) {
        }
        float ruMs = std::chrono::duration<float, std::milli>(
            std::chrono::steady_clock::now() - rendererUpdateStart).count();
-        if (ruMs > 5.0f) {
+        if (ruMs > 50.0f) {
            LOG_WARNING("SLOW update stage 'renderer->update': ", ruMs, "ms");
        }
    }
@ -1894,9 +1896,43 @@ void Application::setupUICallbacks() {
        LOG_INFO("Unstuck: high fallback snap");
    });
    // /unstuckhearth — teleport to hearthstone bind point (server-synced).
    // Freezes player until terrain loads at destination to prevent falling through world.
    gameHandler->setUnstuckHearthCallback([this, clearStuckMovement, forceServerTeleportCommand]() {
        if (!renderer || !renderer->getCameraController() || !gameHandler) return;
        uint32_t bindMap = 0;
        glm::vec3 bindPos(0.0f);
        if (!gameHandler->getHomeBind(bindMap, bindPos)) {
            LOG_WARNING("Unstuck hearth: no bind point available");
            return;
        }
        worldEntryMovementGraceTimer_ = 10.0f;  // long grace — terrain load check will clear it
        taxiLandingClampTimer_ = 0.0f;
        lastTaxiFlight_ = false;
        clearStuckMovement();
        auto* cc = renderer->getCameraController();
        glm::vec3 renderPos = core::coords::canonicalToRender(bindPos);
        renderPos.z += 2.0f;
        // Freeze player in place (no gravity/movement) until terrain loads
        cc->teleportTo(renderPos);
        cc->setExternalFollow(true);
        forceServerTeleportCommand(renderPos);
        clearStuckMovement();
        // Set pending state — update loop will unfreeze once terrain is loaded
        hearthTeleportPending_ = true;
        hearthTeleportPos_ = renderPos;
        hearthTeleportTimer_ = 15.0f;  // 15s safety timeout
        LOG_INFO("Unstuck hearth: teleporting to bind point, waiting for terrain...");
    });
    // Auto-unstuck: falling for > 5 seconds = void fall, teleport to map entry
    if (renderer->getCameraController()) {
-        renderer->getCameraController()->setAutoUnstuckCallback([this]() {
+        renderer->getCameraController()->setAutoUnstuckCallback([this, forceServerTeleportCommand]() {
            if (!renderer || !renderer->getCameraController()) return;
            auto* cc = renderer->getCameraController();
@ -1904,7 +1940,8 @@ void Application::setupUICallbacks() {
            glm::vec3 spawnPos = cc->getDefaultPosition();
            spawnPos.z += 5.0f;
            cc->teleportTo(spawnPos);
-            LOG_INFO("Auto-unstuck: teleported to map entry point");
+            forceServerTeleportCommand(spawnPos);
            LOG_INFO("Auto-unstuck: teleported to map entry point (server synced)");
        });
    }
@ -4167,11 +4204,17 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
        });
    }
-    // Hide first-login hitch by draining initial world packets/spawn queues before
+    // Keep the loading screen visible until all spawn/equipment/gameobject queues
-    // dropping the loading screen. Keep this bounded so we don't stall indefinitely.
+    // are fully drained. This ensures the player sees a fully populated world
    // (character clothed, NPCs placed, game objects loaded) when the screen drops.
    {
-        const float kWarmupMaxSeconds = 2.5f;
+        const float kMinWarmupSeconds = 2.0f;   // minimum time to drain network packets
        const float kMaxWarmupSeconds = 15.0f;  // hard cap to avoid infinite stall
        const auto warmupStart = std::chrono::high_resolution_clock::now();
        // Track consecutive idle iterations (all queues empty) to detect convergence
        int idleIterations = 0;
        const int kIdleThreshold = 5;  // require 5 consecutive empty loops (~80ms)
        while (true) {
            SDL_Event event;
            while (SDL_PollEvent(&event)) {
@ -4185,7 +4228,6 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
                    int w = event.window.data1;
                    int h = event.window.data2;
                    window->setSize(w, h);
                    // Vulkan viewport set in command buffer
                    if (renderer && renderer->getCamera()) {
                        renderer->getCamera()->setAspectRatio(static_cast<float>(w) / h);
                    }
@ -4207,60 +4249,18 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
            processPlayerSpawnQueue();
            // During load screen warmup: lift per-frame budgets so GPU uploads
-            // happen in bulk while the loading screen is still visible.
+            // and spawns happen in bulk while the loading screen is still visible.
-            // Process ALL async creature model uploads (no 3-per-frame cap).
+            processCreatureSpawnQueue(true);
            {
                for (auto it = asyncCreatureLoads_.begin(); it != asyncCreatureLoads_.end(); ) {
                    if (!it->future.valid() ||
                        it->future.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) {
                        ++it;
                        continue;
                    }
                    auto result = it->future.get();
                    it = asyncCreatureLoads_.erase(it);
                    if (result.permanent_failure) {
                        nonRenderableCreatureDisplayIds_.insert(result.displayId);
                        creaturePermanentFailureGuids_.insert(result.guid);
                        pendingCreatureSpawnGuids_.erase(result.guid);
                        creatureSpawnRetryCounts_.erase(result.guid);
                        continue;
                    }
                    if (!result.valid || !result.model) {
                        pendingCreatureSpawnGuids_.erase(result.guid);
                        creatureSpawnRetryCounts_.erase(result.guid);
                        continue;
                    }
                    auto* charRenderer = renderer ? renderer->getCharacterRenderer() : nullptr;
                    if (!charRenderer) { pendingCreatureSpawnGuids_.erase(result.guid); continue; }
                    if (!charRenderer->loadModel(*result.model, result.modelId)) {
                        nonRenderableCreatureDisplayIds_.insert(result.displayId);
                        creaturePermanentFailureGuids_.insert(result.guid);
                        pendingCreatureSpawnGuids_.erase(result.guid);
                        creatureSpawnRetryCounts_.erase(result.guid);
                        continue;
                    }
                    displayIdModelCache_[result.displayId] = result.modelId;
                    pendingCreatureSpawnGuids_.erase(result.guid);
                    creatureSpawnRetryCounts_.erase(result.guid);
                    if (!creatureInstances_.count(result.guid) &&
                        !creaturePermanentFailureGuids_.count(result.guid)) {
                        PendingCreatureSpawn s{};
                        s.guid = result.guid; s.displayId = result.displayId;
                        s.x = result.x; s.y = result.y; s.z = result.z;
                        s.orientation = result.orientation;
                        pendingCreatureSpawns_.push_back(s);
                        pendingCreatureSpawnGuids_.insert(result.guid);
                    }
                }
            }
            processCreatureSpawnQueue();
            processAsyncNpcCompositeResults();
-            processDeferredEquipmentQueue();
+            // Process equipment queue more aggressively during warmup (multiple per iteration)
            for (int i = 0; i < 8 && (!deferredEquipmentQueue_.empty() || !asyncEquipmentLoads_.empty()); i++) {
                processDeferredEquipmentQueue();
            }
            if (auto* cr = renderer ? renderer->getCharacterRenderer() : nullptr) {
-                cr->processPendingNormalMaps(10);  // higher budget during load screen
+                cr->processPendingNormalMaps(INT_MAX);
            }
-            // Process ALL pending game object spawns (no 1-per-frame cap during load screen).
+            // Process ALL pending game object spawns.
            while (!pendingGameObjectSpawns_.empty()) {
                auto& s = pendingGameObjectSpawns_.front();
                spawnOnlineGameObject(s.guid, s.entry, s.displayId, s.x, s.y, s.z, s.orientation);
@ -4271,14 +4271,42 @@ void Application::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
            processPendingMount();
            updateQuestMarkers();
            // Update renderer (terrain streaming, animations)
            if (renderer) {
                renderer->update(1.0f / 60.0f);
            }
            const auto now = std::chrono::high_resolution_clock::now();
            const float elapsed = std::chrono::duration<float>(now - warmupStart).count();
            const float t = std::clamp(elapsed / kWarmupMaxSeconds, 0.0f, 1.0f);
            showProgress("Finalizing world sync...", 0.97f + t * 0.025f);
-            if (elapsed >= kWarmupMaxSeconds) {
+            // Check if all queues are drained
            bool queuesEmpty =
                pendingCreatureSpawns_.empty() &&
                asyncCreatureLoads_.empty() &&
                asyncNpcCompositeLoads_.empty() &&
                deferredEquipmentQueue_.empty() &&
                asyncEquipmentLoads_.empty() &&
                pendingGameObjectSpawns_.empty() &&
                asyncGameObjectLoads_.empty() &&
                pendingPlayerSpawns_.empty();
            if (queuesEmpty) {
                idleIterations++;
            } else {
                idleIterations = 0;
            }
            // Exit when: (min time passed AND queues drained for several iterations) OR hard cap
            bool readyToExit = (elapsed >= kMinWarmupSeconds && idleIterations >= kIdleThreshold);
            if (readyToExit || elapsed >= kMaxWarmupSeconds) {
                if (elapsed >= kMaxWarmupSeconds) {
                    LOG_WARNING("Warmup hit hard cap (", kMaxWarmupSeconds, "s), entering world with pending work");
                }
                break;
            }
            const float t = std::clamp(elapsed / kMaxWarmupSeconds, 0.0f, 1.0f);
            showProgress("Finalizing world sync...", 0.97f + t * 0.025f);
            SDL_Delay(16);
        }
    }
@ -5154,7 +5182,7 @@ void Application::spawnOnlineCreature(uint64_t guid, uint32_t displayId, float x
        {
            auto texEnd = std::chrono::steady_clock::now();
            float texMs = std::chrono::duration<float, std::milli>(texEnd - texStart).count();
-            if (texMs > 3.0f) {
+            if (texMs > 50.0f) {
                LOG_WARNING("spawnCreature texture setup took ", texMs, "ms displayId=", displayId,
                            " hasPreDec=", hasPreDec, " extra=", dispData.extraDisplayId);
            }
@ -6804,9 +6832,10 @@ void Application::spawnOnlineGameObject(uint64_t guid, uint32_t entry, uint32_t
             " displayId=", displayId, " at (", x, ", ", y, ", ", z, ")");
 }
-void Application::processAsyncCreatureResults() {
+void Application::processAsyncCreatureResults(bool unlimited) {
    // Check completed async model loads and finalize on main thread (GPU upload + instance creation).
    // Limit GPU model uploads per frame to avoid spikes, but always drain cheap bookkeeping.
    // In unlimited mode (load screen), process all pending uploads without cap.
    static constexpr int kMaxModelUploadsPerFrame = 1;
    int modelUploads = 0;
@ -6819,9 +6848,7 @@ void Application::processAsyncCreatureResults() {
        // Peek: if this result needs a NEW model upload (not cached) and we've hit
        // the upload budget, defer to next frame without consuming the future.
-        if (modelUploads >= kMaxModelUploadsPerFrame) {
+        if (!unlimited && modelUploads >= kMaxModelUploadsPerFrame) {
            // Check if this displayId already has a cached model (cheap spawn, no GPU upload).
            // We can't peek the displayId without getting the future, so just break.
            break;
        }
@ -6864,7 +6891,7 @@ void Application::processAsyncCreatureResults() {
        {
            auto uploadEnd = std::chrono::steady_clock::now();
            float uploadMs = std::chrono::duration<float, std::milli>(uploadEnd - uploadStart).count();
-            if (uploadMs > 3.0f) {
+            if (uploadMs > 100.0f) {
                LOG_WARNING("charRenderer->loadModel took ", uploadMs, "ms displayId=", result.displayId,
                            " preDecoded=", result.predecodedTextures.size());
            }
@ -6967,17 +6994,18 @@ void Application::processAsyncNpcCompositeResults() {
    }
 }
-void Application::processCreatureSpawnQueue() {
+void Application::processCreatureSpawnQueue(bool unlimited) {
    auto startTime = std::chrono::steady_clock::now();
    // Budget: max 2ms per frame for creature spawning to prevent stutter.
    // In unlimited mode (load screen), process everything without budget cap.
    static constexpr float kSpawnBudgetMs = 2.0f;
    // First, finalize any async model loads that completed on background threads.
-    processAsyncCreatureResults();
+    processAsyncCreatureResults(unlimited);
    {
        auto now = std::chrono::steady_clock::now();
        float asyncMs = std::chrono::duration<float, std::milli>(now - startTime).count();
-        if (asyncMs > 3.0f) {
+        if (asyncMs > 100.0f) {
            LOG_WARNING("processAsyncCreatureResults took ", asyncMs, "ms");
        }
    }
@ -6992,11 +7020,11 @@ void Application::processCreatureSpawnQueue() {
    int asyncLaunched = 0;
    size_t rotationsLeft = pendingCreatureSpawns_.size();
    while (!pendingCreatureSpawns_.empty() &&
-           processed < MAX_SPAWNS_PER_FRAME &&
+           (unlimited || processed < MAX_SPAWNS_PER_FRAME) &&
           rotationsLeft > 0) {
        // Check time budget every iteration (including first — async results may
        // have already consumed the budget via GPU model uploads).
-        {
+        if (!unlimited) {
            auto now = std::chrono::steady_clock::now();
            float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
            if (elapsedMs >= kSpawnBudgetMs) break;
@ -7017,7 +7045,8 @@ void Application::processCreatureSpawnQueue() {
        // For new models: launch async load on background thread instead of blocking.
        if (needsNewModel) {
-            if (static_cast<int>(asyncCreatureLoads_.size()) + asyncLaunched >= MAX_ASYNC_CREATURE_LOADS) {
+            const int maxAsync = unlimited ? (MAX_ASYNC_CREATURE_LOADS * 4) : MAX_ASYNC_CREATURE_LOADS;
            if (static_cast<int>(asyncCreatureLoads_.size()) + asyncLaunched >= maxAsync) {
                // Too many in-flight — defer to next frame
                pendingCreatureSpawns_.push_back(s);
                rotationsLeft--;
@ -7273,7 +7302,7 @@ void Application::processCreatureSpawnQueue() {
            spawnOnlineCreature(s.guid, s.displayId, s.x, s.y, s.z, s.orientation);
            auto spawnEnd = std::chrono::steady_clock::now();
            float spawnMs = std::chrono::duration<float, std::milli>(spawnEnd - spawnStart).count();
-            if (spawnMs > 3.0f) {
+            if (spawnMs > 100.0f) {
                LOG_WARNING("spawnOnlineCreature took ", spawnMs, "ms displayId=", s.displayId);
            }
        }
--- a/src/core/window.cpp
+++ b/src/core/window.cpp
@ -84,6 +84,7 @@ bool Window::initialize() {
    // Initialize Vulkan context
    vkContext = std::make_unique<rendering::VkContext>();
    vkContext->setVsync(vsync);
    if (!vkContext->initialize(window)) {
        LOG_ERROR("Failed to initialize Vulkan context");
        return false;
@ -158,11 +159,13 @@ void Window::setFullscreen(bool enable) {
    }
 }
-void Window::setVsync([[maybe_unused]] bool enable) {
+void Window::setVsync(bool enable) {
    // VSync in Vulkan is controlled by present mode (set at swapchain creation)
    // For now, store the preference — applied on next swapchain recreation
    vsync = enable;
-    LOG_INFO("VSync preference set to ", enable ? "on" : "off", " (applied on swapchain recreation)");
+    if (vkContext) {
        vkContext->setVsync(enable);
        vkContext->markSwapchainDirty();
    }
    LOG_INFO("VSync ", enable ? "enabled" : "disabled");
 }
 void Window::applyResolution(int w, int h) {
--- a/src/game/game_handler.cpp
+++ b/src/game/game_handler.cpp
@ -11435,6 +11435,15 @@ void GameHandler::unstuckGy() {
    }
 }
 // Invokes the registered unstuck-hearth callback, if any, and posts a system chat message either way.
 void GameHandler::unstuckHearth() {
    // No callback registered: nothing to invoke, just tell the player.
    if (!unstuckHearthCallback_) {
        addSystemChatMessage("No hearthstone bind point set.");
        return;
    }
    // Hand the teleport off to the registered handler, then confirm in chat.
    unstuckHearthCallback_();
    addSystemChatMessage("Unstuck: teleported to hearthstone location.");
 }
 void GameHandler::handleLootResponse(network::Packet& packet) {
    if (!LootResponseParser::parse(packet, currentLoot)) return;
    lootWindowOpen = true;
--- a/src/rendering/camera.cpp
+++ b/src/rendering/camera.cpp
@ -20,6 +20,13 @@ void Camera::updateProjectionMatrix() {
    projectionMatrix = glm::perspective(glm::radians(fov), aspectRatio, nearPlane, farPlane);
    // Vulkan clip-space has Y pointing down; flip the projection's Y axis.
    projectionMatrix[1][1] *= -1.0f;
    unjitteredProjectionMatrix = projectionMatrix;
    // Re-apply jitter if active
    if (jitterOffset.x != 0.0f || jitterOffset.y != 0.0f) {
        projectionMatrix[2][0] += jitterOffset.x;
        projectionMatrix[2][1] += jitterOffset.y;
    }
 }
 glm::vec3 Camera::getForward() const {
@ -40,6 +47,21 @@ glm::vec3 Camera::getUp() const {
    return glm::normalize(glm::cross(getRight(), getForward()));
 }
 // Replaces the jitter currently applied to the projection matrix with (jx, jy).
 void Camera::setJitter(float jx, float jy) {
    // projectionMatrix[2] holds the two entries the jitter offset is added to.
    auto& offsetColumn = projectionMatrix[2];
    // Take the previously applied offset back out first.
    offsetColumn[0] -= jitterOffset.x;
    offsetColumn[1] -= jitterOffset.y;
    // Remember the new offset, then apply it.
    jitterOffset = glm::vec2(jx, jy);
    offsetColumn[0] += jitterOffset.x;
    offsetColumn[1] += jitterOffset.y;
 }
 // Removes the currently applied jitter from the projection matrix and zeroes the stored offset.
 void Camera::clearJitter() {
    auto& offsetColumn = projectionMatrix[2];
    offsetColumn[0] -= jitterOffset.x;
    offsetColumn[1] -= jitterOffset.y;
    jitterOffset = glm::vec2(0.0f);
 }
 Ray Camera::screenToWorldRay(float screenX, float screenY, float screenW, float screenH) const {
    float ndcX = (2.0f * screenX / screenW) - 1.0f;
    // Vulkan Y-flip is baked into projectionMatrix, so NDC Y maps directly:
--- a/src/rendering/camera_controller.cpp
+++ b/src/rendering/camera_controller.cpp
@ -1,5 +1,6 @@
 #include "rendering/camera_controller.hpp"
 #include <algorithm>
 #include <future>
 #include <imgui.h>
 #include "rendering/terrain_manager.hpp"
 #include "rendering/wmo_renderer.hpp"
@ -808,25 +809,53 @@ void CameraController::update(float deltaTime) {
                if (useCached) {
                    groundH = cachedFloorHeight_;
                } else {
-                    // Full collision check
+                    // Full collision check — run terrain/WMO/M2 queries in parallel
                    std::optional<float> terrainH;
                    std::optional<float> wmoH;
                    std::optional<float> m2H;
                    if (terrainManager) {
                        terrainH = terrainManager->getHeightAt(targetPos.x, targetPos.y);
                    }
                    // When airborne, anchor probe to last ground level so the
                    // ceiling doesn't rise with the jump and catch roof geometry.
                    float wmoBaseZ = grounded ? std::max(targetPos.z, lastGroundZ) : lastGroundZ;
                    float wmoProbeZ = wmoBaseZ + stepUpBudget + 0.5f;
                    float wmoNormalZ = 1.0f;
                    // Launch WMO + M2 floor queries asynchronously while terrain runs on this thread.
                    // Collision scratch buffers are thread_local so concurrent calls are safe.
                    using FloorResult = std::pair<std::optional<float>, float>;
                    std::future<FloorResult> wmoFuture;
                    std::future<FloorResult> m2Future;
                    bool wmoAsync = false, m2Async = false;
                    float px = targetPos.x, py = targetPos.y;
                    if (wmoRenderer) {
-                        wmoH = wmoRenderer->getFloorHeight(targetPos.x, targetPos.y, wmoProbeZ, &wmoNormalZ);
+                        wmoAsync = true;
                        wmoFuture = std::async(std::launch::async,
                            [this, px, py, wmoProbeZ]() -> FloorResult {
                                float nz = 1.0f;
                                auto h = wmoRenderer->getFloorHeight(px, py, wmoProbeZ, &nz);
                                return {h, nz};
                            });
                    }
                    if (m2Renderer && !externalFollow_) {
-                        float m2NormalZ = 1.0f;
+                        m2Async = true;
-                        m2H = m2Renderer->getFloorHeight(targetPos.x, targetPos.y, wmoProbeZ, &m2NormalZ);
+                        m2Future = std::async(std::launch::async,
-                        if (m2H && m2NormalZ < MIN_WALKABLE_NORMAL_M2) {
+                            [this, px, py, wmoProbeZ]() -> FloorResult {
                                float nz = 1.0f;
                                auto h = m2Renderer->getFloorHeight(px, py, wmoProbeZ, &nz);
                                return {h, nz};
                            });
                    }
                    if (terrainManager) {
                        terrainH = terrainManager->getHeightAt(targetPos.x, targetPos.y);
                    }
                    if (wmoAsync) {
                        auto [h, nz] = wmoFuture.get();
                        wmoH = h;
                        wmoNormalZ = nz;
                    }
                    if (m2Async) {
                        auto [h, nz] = m2Future.get();
                        m2H = h;
                        if (m2H && nz < MIN_WALKABLE_NORMAL_M2) {
                            m2H = std::nullopt;
                        }
                    }
--- a/src/rendering/character_renderer.cpp
+++ b/src/rendering/character_renderer.cpp
@ -332,6 +332,11 @@ void CharacterRenderer::shutdown() {
    LOG_INFO("CharacterRenderer::shutdown instances=", instances.size(),
             " models=", models.size(), " override=", (void*)renderPassOverride_);
    // Wait for any in-flight background normal map generation threads
    while (pendingNormalMapCount_.load(std::memory_order_relaxed) > 0) {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
    vkDeviceWaitIdle(vkCtx_->getDevice());
    VkDevice device = vkCtx_->getDevice();
    VmaAllocator alloc = vkCtx_->getAllocator();
@ -413,6 +418,16 @@ void CharacterRenderer::clear() {
    LOG_INFO("CharacterRenderer::clear instances=", instances.size(),
             " models=", models.size());
    // Wait for any in-flight background normal map generation threads
    while (pendingNormalMapCount_.load(std::memory_order_relaxed) > 0) {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
    // Discard any completed results that haven't been uploaded
    {
        std::lock_guard<std::mutex> lock(normalMapResultsMutex_);
        completedNormalMaps_.clear();
    }
    vkDeviceWaitIdle(vkCtx_->getDevice());
    VkDevice device = vkCtx_->getDevice();
@ -509,7 +524,32 @@ std::unique_ptr<VkTexture> CharacterRenderer::generateNormalHeightMap(
        const uint8_t* pixels, uint32_t width, uint32_t height, float& outVariance) {
    if (!vkCtx_ || width == 0 || height == 0) return nullptr;
    // Use the CPU-only static method, then upload to GPU
    std::vector<uint8_t> dummy(width * height * 4);
    std::memcpy(dummy.data(), pixels, dummy.size());
    auto result = generateNormalHeightMapCPU("", std::move(dummy), width, height);
    outVariance = result.variance;
    auto tex = std::make_unique<VkTexture>();
    if (!tex->upload(*vkCtx_, result.pixels.data(), width, height, VK_FORMAT_R8G8B8A8_UNORM, true)) {
        return nullptr;
    }
    tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
                        VK_SAMPLER_ADDRESS_MODE_REPEAT);
    return tex;
 }
 // Static, thread-safe CPU-only normal map generation (no GPU access)
 CharacterRenderer::NormalMapResult CharacterRenderer::generateNormalHeightMapCPU(
        std::string cacheKey, std::vector<uint8_t> srcPixels, uint32_t width, uint32_t height) {
    NormalMapResult result;
    result.cacheKey = std::move(cacheKey);
    result.width = width;
    result.height = height;
    result.variance = 0.0f;
    const uint32_t totalPixels = width * height;
    const uint8_t* pixels = srcPixels.data();
    // Step 1: Compute height from luminance
    std::vector<float> heightMap(totalPixels);
@ -524,7 +564,7 @@ std::unique_ptr<VkTexture> CharacterRenderer::generateNormalHeightMap(
        sumH2 += h * h;
    }
    double mean = sumH / totalPixels;
-    outVariance = static_cast<float>(sumH2 / totalPixels - mean * mean);
+    result.variance = static_cast<float>(sumH2 / totalPixels - mean * mean);
    // Step 1.5: Box blur the height map to reduce noise from diffuse textures
    auto wrapSample = [&](const std::vector<float>& map, int x, int y) -> float {
@ -545,11 +585,9 @@ std::unique_ptr<VkTexture> CharacterRenderer::generateNormalHeightMap(
        }
    }
-    // Step 2: Sobel 3x3 → normal map (crisp detail from original, blurred for POM alpha)
+    // Step 2: Sobel 3x3 → normal map
    // Higher strength than WMO (2.0) because character/weapon textures are hand-painted
    // with baked-in lighting that produces low-contrast gradients in the Sobel filter.
    const float strength = 5.0f;
-    std::vector<uint8_t> output(totalPixels * 4);
+    result.pixels.resize(totalPixels * 4);
    auto sampleH = [&](int x, int y) -> float {
        x = ((x % (int)width) + (int)width) % (int)width;
@ -573,20 +611,14 @@ std::unique_ptr<VkTexture> CharacterRenderer::generateNormalHeightMap(
            if (len > 0.0f) { nx /= len; ny /= len; nz /= len; }
            uint32_t idx = (y * width + x) * 4;
-            output[idx + 0] = static_cast<uint8_t>(std::clamp((nx * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
+            result.pixels[idx + 0] = static_cast<uint8_t>(std::clamp((nx * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
-            output[idx + 1] = static_cast<uint8_t>(std::clamp((ny * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
+            result.pixels[idx + 1] = static_cast<uint8_t>(std::clamp((ny * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
-            output[idx + 2] = static_cast<uint8_t>(std::clamp((nz * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
+            result.pixels[idx + 2] = static_cast<uint8_t>(std::clamp((nz * 0.5f + 0.5f) * 255.0f, 0.0f, 255.0f));
-            output[idx + 3] = static_cast<uint8_t>(std::clamp(blurredHeight[y * width + x] * 255.0f, 0.0f, 255.0f));
+            result.pixels[idx + 3] = static_cast<uint8_t>(std::clamp(blurredHeight[y * width + x] * 255.0f, 0.0f, 255.0f));
        }
    }
-    auto tex = std::make_unique<VkTexture>();
+    return result;
    if (!tex->upload(*vkCtx_, output.data(), width, height, VK_FORMAT_R8G8B8A8_UNORM, true)) {
        return nullptr;
    }
    tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
                        VK_SAMPLER_ADDRESS_MODE_REPEAT);
    return tex;
 }
 VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
@ -687,15 +719,22 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
    e.hasAlpha = hasAlpha;
    e.colorKeyBlack = colorKeyBlackHint;
-    // Defer normal/height map generation to avoid stalling loadModel.
+    // Launch normal map generation on background thread — CPU work is pure compute,
-    // Normal maps are generated in processPendingNormalMaps() at a per-frame budget.
+    // only the GPU upload (in processPendingNormalMaps) needs the main thread (~1-2ms).
    if (blpImage.width >= 32 && blpImage.height >= 32) {
-        PendingNormalMap pending;
+        uint32_t w = blpImage.width, h = blpImage.height;
-        pending.cacheKey = key;
+        std::string ck = key;
-        pending.pixels.assign(blpImage.data.begin(), blpImage.data.end());
+        std::vector<uint8_t> px(blpImage.data.begin(), blpImage.data.end());
-        pending.width = blpImage.width;
+        pendingNormalMapCount_.fetch_add(1, std::memory_order_relaxed);
-        pending.height = blpImage.height;
+        auto* self = this;
-        pendingNormalMaps_.push_back(std::move(pending));
+        std::thread([self, ck = std::move(ck), px = std::move(px), w, h]() mutable {
            auto result = generateNormalHeightMapCPU(std::move(ck), std::move(px), w, h);
            {
                std::lock_guard<std::mutex> lock(self->normalMapResultsMutex_);
                self->completedNormalMaps_.push_back(std::move(result));
            }
            self->pendingNormalMapCount_.fetch_sub(1, std::memory_order_relaxed);
        }).detach();
        e.normalMapPending = true;
    }
@ -709,30 +748,39 @@ VkTexture* CharacterRenderer::loadTexture(const std::string& path) {
 }
 void CharacterRenderer::processPendingNormalMaps(int budget) {
-    if (pendingNormalMaps_.empty() || !vkCtx_) return;
+    if (!vkCtx_) return;
-    int processed = 0;
+    // Collect completed results from background threads
-    while (!pendingNormalMaps_.empty() && processed < budget) {
+    std::deque<NormalMapResult> ready;
-        auto pending = std::move(pendingNormalMaps_.front());
+    {
-        pendingNormalMaps_.pop_front();
+        std::lock_guard<std::mutex> lock(normalMapResultsMutex_);
        if (completedNormalMaps_.empty()) return;
        int count = std::min(budget, static_cast<int>(completedNormalMaps_.size()));
        for (int i = 0; i < count; i++) {
            ready.push_back(std::move(completedNormalMaps_.front()));
            completedNormalMaps_.pop_front();
        }
    }
-        auto it = textureCache.find(pending.cacheKey);
+    // GPU upload only (~1-2ms each) — CPU work already done on background thread
    for (auto& result : ready) {
        auto it = textureCache.find(result.cacheKey);
        if (it == textureCache.end()) continue;  // texture was evicted
        float nhVariance = 0.0f;
        vkCtx_->beginUploadBatch();
-        auto nhMap = generateNormalHeightMap(pending.pixels.data(),
+        auto tex = std::make_unique<VkTexture>();
-            pending.width, pending.height, nhVariance);
+        bool ok = tex->upload(*vkCtx_, result.pixels.data(), result.width, result.height,
-        vkCtx_->endUploadBatch();
+                              VK_FORMAT_R8G8B8A8_UNORM, true);
-
+        if (ok) {
-        if (nhMap) {
+            tex->createSampler(vkCtx_->getDevice(), VK_FILTER_LINEAR, VK_FILTER_LINEAR,
-            it->second.heightMapVariance = nhVariance;
+                               VK_SAMPLER_ADDRESS_MODE_REPEAT);
-            it->second.approxBytes += approxTextureBytesWithMips(pending.width, pending.height);
+            it->second.heightMapVariance = result.variance;
-            textureCacheBytes_ += approxTextureBytesWithMips(pending.width, pending.height);
+            it->second.approxBytes += approxTextureBytesWithMips(result.width, result.height);
-            it->second.normalHeightMap = std::move(nhMap);
+            textureCacheBytes_ += approxTextureBytesWithMips(result.width, result.height);
            it->second.normalHeightMap = std::move(tex);
        }
        vkCtx_->endUploadBatch();
        it->second.normalMapPending = false;
        processed++;
    }
 }
@ -1876,6 +1924,61 @@ glm::mat4 CharacterRenderer::getBoneTransform(const pipeline::M2Bone& bone, floa
 // --- Rendering ---
 void CharacterRenderer::prepareRender(uint32_t frameIndex) {
    if (instances.empty() || !opaquePipeline_) return;
    // Pre-allocate bone SSBOs + descriptor sets on main thread (pool ops not thread-safe)
    for (auto& [id, instance] : instances) {
        int numBones = std::min(static_cast<int>(instance.boneMatrices.size()), MAX_BONES);
        if (numBones <= 0) continue;
        if (!instance.boneBuffer[frameIndex]) {
            VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
            bci.size = MAX_BONES * sizeof(glm::mat4);
            bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
            VmaAllocationCreateInfo aci{};
            aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
            aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
            VmaAllocationInfo allocInfo{};
            vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci,
                            &instance.boneBuffer[frameIndex], &instance.boneAlloc[frameIndex], &allocInfo);
            instance.boneMapped[frameIndex] = allocInfo.pMappedData;
            VkDescriptorSetAllocateInfo ai{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
            ai.descriptorPool = boneDescPool_;
            ai.descriptorSetCount = 1;
            ai.pSetLayouts = &boneSetLayout_;
            VkResult dsRes = vkAllocateDescriptorSets(vkCtx_->getDevice(), &ai, &instance.boneSet[frameIndex]);
            if (dsRes != VK_SUCCESS) {
                LOG_ERROR("CharacterRenderer::prepareRender: bone descriptor alloc failed (instance=",
                          id, ", frame=", frameIndex, ", vk=", static_cast<int>(dsRes), ")");
                if (instance.boneBuffer[frameIndex]) {
                    vmaDestroyBuffer(vkCtx_->getAllocator(),
                                     instance.boneBuffer[frameIndex], instance.boneAlloc[frameIndex]);
                    instance.boneBuffer[frameIndex] = VK_NULL_HANDLE;
                    instance.boneAlloc[frameIndex] = VK_NULL_HANDLE;
                    instance.boneMapped[frameIndex] = nullptr;
                }
                continue;
            }
            if (instance.boneSet[frameIndex]) {
                VkDescriptorBufferInfo bufInfo{};
                bufInfo.buffer = instance.boneBuffer[frameIndex];
                bufInfo.offset = 0;
                bufInfo.range = bci.size;
                VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
                write.dstSet = instance.boneSet[frameIndex];
                write.dstBinding = 0;
                write.descriptorCount = 1;
                write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                write.pBufferInfo = &bufInfo;
                vkUpdateDescriptorSets(vkCtx_->getDevice(), 1, &write, 0, nullptr);
            }
        }
    }
 }
 void CharacterRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, [[maybe_unused]] const Camera& camera) {
    if (instances.empty() || !opaquePipeline_) {
        return;
--- a/src/rendering/loading_screen.cpp
+++ b/src/rendering/loading_screen.cpp
@ -240,6 +240,66 @@ bool LoadingScreen::loadImage(const std::string& path) {
    return true;
 }
 void LoadingScreen::renderOverlay() {
    // Draw loading screen content as ImGui overlay within an existing ImGui frame.
    // Caller is responsible for ImGui NewFrame/Render and Vulkan frame management.
    ImGuiIO& io = ImGui::GetIO();
    float screenW = io.DisplaySize.x;
    float screenH = io.DisplaySize.y;
    ImGui::SetNextWindowPos(ImVec2(0, 0));
    ImGui::SetNextWindowSize(ImVec2(screenW, screenH));
    ImGui::Begin("##LoadingScreenOverlay", nullptr,
        ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoResize |
        ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoScrollbar |
        ImGuiWindowFlags_NoInputs | ImGuiWindowFlags_NoBackground |
        ImGuiWindowFlags_NoBringToFrontOnFocus);
    if (bgDescriptorSet) {
        ImGui::GetWindowDrawList()->AddImage(
            reinterpret_cast<ImTextureID>(bgDescriptorSet),
            ImVec2(0, 0), ImVec2(screenW, screenH));
    }
    // Progress bar
    {
        const float barWidthFrac = 0.6f;
        const float barHeight = 6.0f;
        const float barY = screenH * 0.06f;
        float barX = screenW * (0.5f - barWidthFrac * 0.5f);
        float barW = screenW * barWidthFrac;
        ImDrawList* drawList = ImGui::GetWindowDrawList();
        drawList->AddRectFilled(ImVec2(barX, barY), ImVec2(barX + barW, barY + barHeight),
            IM_COL32(25, 25, 25, 200), 2.0f);
        if (loadProgress > 0.001f) {
            drawList->AddRectFilled(ImVec2(barX, barY), ImVec2(barX + barW * loadProgress, barY + barHeight),
                IM_COL32(199, 156, 33, 255), 2.0f);
        }
        drawList->AddRect(ImVec2(barX - 1, barY - 1), ImVec2(barX + barW + 1, barY + barHeight + 1),
            IM_COL32(140, 110, 25, 255), 2.0f);
    }
    // Percentage text
    {
        char pctBuf[32];
        snprintf(pctBuf, sizeof(pctBuf), "%d%%", static_cast<int>(loadProgress * 100.0f));
        float textY = screenH * 0.06f - 20.0f;
        ImVec2 pctSize = ImGui::CalcTextSize(pctBuf);
        ImGui::SetCursorPos(ImVec2((screenW - pctSize.x) * 0.5f, textY));
        ImGui::TextColored(ImVec4(0.0f, 0.0f, 0.0f, 1.0f), "%s", pctBuf);
    }
    // Status text
    {
        float statusY = screenH * 0.06f + 14.0f;
        ImVec2 statusSize = ImGui::CalcTextSize(statusText.c_str());
        ImGui::SetCursorPos(ImVec2((screenW - statusSize.x) * 0.5f, statusY));
        ImGui::TextColored(ImVec4(0.0f, 0.0f, 0.0f, 1.0f), "%s", statusText.c_str());
    }
    ImGui::End();
 }
 void LoadingScreen::render() {
    // If a frame is already in progress (e.g. called from a UI callback),
    // end it before starting our own
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@ -282,6 +282,14 @@ glm::vec3 closestPointOnTriangle(const glm::vec3& p,
 } // namespace
 // Thread-local scratch buffers for collision queries (allows concurrent getFloorHeight calls).
 // Being thread_local, each thread that runs a query works on its own copies, so the
 // queries below do not share these containers across threads.
 // NOTE(review): the buffers are file-scope, so two queries nested on the SAME thread
 // would overwrite each other's contents — confirm no query calls another mid-iteration.

 // Output of gatherCandidates(): indices into `instances` for the current query box.
 static thread_local std::vector<size_t> tl_m2_candidateScratch;
 // Instance ids already seen during one gatherCandidates() call; used to skip duplicates
 // when an instance is registered in more than one spatial grid cell.
 static thread_local std::unordered_set<uint32_t> tl_m2_candidateIdScratch;
 // Triangle indices returned by the per-model collision range lookups in
 // getFloorHeight() and checkCollision().
 static thread_local std::vector<uint32_t> tl_m2_collisionTriScratch;
 // Forward declaration (defined after animation helpers)
 static void computeBoneMatrices(const M2ModelGPU& model, M2Instance& instance);
 void M2Instance::updateModelMatrix() {
    modelMatrix = glm::mat4(1.0f);
    modelMatrix = glm::translate(modelMatrix, position);
@ -1028,10 +1036,9 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
            (lowerName.find("trunk") != std::string::npos) ||
            (lowerName.find("stump") != std::string::npos) ||
            (lowerName.find("log") != std::string::npos);
-        // Only large trees (canopy > 20 model units wide) get trunk collision.
+        // Trees with visible trunks get collision. Threshold: canopy wider than 6
-        // Small/mid trees are walkthrough to avoid getting stuck between them.
+        // model units AND taller than 4 units (filters out small bushes/saplings).
-        // Only large trees get trunk collision; all smaller trees are walkthrough.
+        bool treeWithTrunk = treeLike && !hardTreePart && !foliageName && horiz > 6.0f && vert > 4.0f;
        bool treeWithTrunk = treeLike && !hardTreePart && !foliageName && horiz > 40.0f;
        bool softTree = treeLike && !hardTreePart && !treeWithTrunk;
        bool forceSolidCurb = gpuModel.collisionSteppedLowPlatform || knownStormwindPlanter || likelyCurbName || gpuModel.collisionPlanter;
        bool narrowVerticalName =
@ -1602,6 +1609,12 @@ bool M2Renderer::loadModel(const pipeline::M2Model& model, uint32_t modelId) {
        }
    }
    // Pre-compute available LOD levels to avoid per-instance batch iteration
    gpuModel.availableLODs = 0;
    for (const auto& b : gpuModel.batches) {
        if (b.submeshLevel < 8) gpuModel.availableLODs |= (1u << b.submeshLevel);
    }
    models[modelId] = std::move(gpuModel);
    LOG_DEBUG("Loaded M2 model: ", model.name, " (", models[modelId].vertexCount, " vertices, ",
@ -1667,6 +1680,21 @@ uint32_t M2Renderer::createInstance(uint32_t modelId, const glm::vec3& position,
        instance.animDuration = static_cast<float>(mdl.sequences[0].duration);
        instance.animTime = static_cast<float>(rand() % std::max(1u, mdl.sequences[0].duration));
        instance.variationTimer = 3000.0f + static_cast<float>(rand() % 8000);
        // Seed bone matrices from an existing instance of the same model so the
        // new instance renders immediately instead of being invisible until the
        // next update() computes bones (prevents pop-in flash).
        for (const auto& existing : instances) {
            if (existing.modelId == modelId && !existing.boneMatrices.empty()) {
                instance.boneMatrices = existing.boneMatrices;
                instance.bonesDirty[0] = instance.bonesDirty[1] = true;
                break;
            }
        }
        // If no sibling exists yet, compute bones immediately
        if (instance.boneMatrices.empty()) {
            computeBoneMatrices(mdlRef, instance);
        }
    }
    // Register in dedup map before pushing (uses original position, not ground-adjusted)
@ -1758,6 +1786,18 @@ uint32_t M2Renderer::createInstanceWithMatrix(uint32_t modelId, const glm::mat4&
        instance.animDuration = static_cast<float>(mdl2.sequences[0].duration);
        instance.animTime = static_cast<float>(rand() % std::max(1u, mdl2.sequences[0].duration));
        instance.variationTimer = 3000.0f + static_cast<float>(rand() % 8000);
        // Seed bone matrices from an existing sibling so the instance renders immediately
        for (const auto& existing : instances) {
            if (existing.modelId == modelId && !existing.boneMatrices.empty()) {
                instance.boneMatrices = existing.boneMatrices;
                instance.bonesDirty[0] = instance.bonesDirty[1] = true;
                break;
            }
        }
        if (instance.boneMatrices.empty()) {
            computeBoneMatrices(mdl2, instance);
        }
    } else {
        instance.animTime = static_cast<float>(rand()) / RAND_MAX * 10000.0f;
    }
@ -1911,6 +1951,7 @@ static void computeBoneMatrices(const M2ModelGPU& model, M2Instance& instance) {
            instance.boneMatrices[i] = local;
        }
    }
    instance.bonesDirty[0] = instance.bonesDirty[1] = true;
 }
 void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::mat4& viewProjection) {
@ -2172,6 +2213,53 @@ void M2Renderer::update(float deltaTime, const glm::vec3& cameraPos, const glm::
 }
 void M2Renderer::prepareRender(uint32_t frameIndex, const Camera& camera) {
    if (!initialized_ || instances.empty()) return;
    (void)camera;  // reserved for future frustum-based culling
    // Pre-allocate bone SSBOs + descriptor sets on main thread (pool ops not thread-safe).
    // Only iterate animated instances — static doodads don't need bone buffers.
    for (size_t idx : animatedInstanceIndices_) {
        if (idx >= instances.size()) continue;
        auto& instance = instances[idx];
        if (instance.boneMatrices.empty()) continue;
        if (!instance.boneBuffer[frameIndex]) {
            VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
            bci.size = 128 * sizeof(glm::mat4);
            bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
            VmaAllocationCreateInfo aci{};
            aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
            aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
            VmaAllocationInfo allocInfo{};
            vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci,
                            &instance.boneBuffer[frameIndex], &instance.boneAlloc[frameIndex], &allocInfo);
            instance.boneMapped[frameIndex] = allocInfo.pMappedData;
            // Force dirty so current boneMatrices get copied into this
            // newly-allocated buffer during render (prevents garbage/zero
            // bones when the other frame index already cleared bonesDirty).
            instance.bonesDirty[frameIndex] = true;
            instance.boneSet[frameIndex] = allocateBoneSet();
            if (instance.boneSet[frameIndex]) {
                VkDescriptorBufferInfo bufInfo{};
                bufInfo.buffer = instance.boneBuffer[frameIndex];
                bufInfo.offset = 0;
                bufInfo.range = bci.size;
                VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
                write.dstSet = instance.boneSet[frameIndex];
                write.dstBinding = 0;
                write.descriptorCount = 1;
                write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                write.pBufferInfo = &bufInfo;
                vkUpdateDescriptorSets(vkCtx_->getDevice(), 1, &write, 0, nullptr);
            }
        }
    }
 }
 void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera) {
    if (instances.empty() || !opaquePipeline_) {
        return;
@ -2254,8 +2342,8 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
    }
    // Sort by modelId to minimize vertex/index buffer rebinds
-    std::stable_sort(sortedVisible_.begin(), sortedVisible_.end(),
+    std::sort(sortedVisible_.begin(), sortedVisible_.end(),
-                     [](const VisibleEntry& a, const VisibleEntry& b) { return a.modelId < b.modelId; });
+              [](const VisibleEntry& a, const VisibleEntry& b) { return a.modelId < b.modelId; });
    uint32_t currentModelId = UINT32_MAX;
    const M2ModelGPU* currentModel = nullptr;
@ -2330,44 +2418,26 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
            }
        }
-        // Upload bone matrices to SSBO if model has skeletal animation
+        // Upload bone matrices to SSBO if model has skeletal animation.
-        bool useBones = model.hasAnimation && !model.disableAnimation && !instance.boneMatrices.empty();
+        // Skip animated instances entirely until bones are computed + buffers allocated
        // to prevent bind-pose/T-pose flash on first appearance.
        bool modelNeedsAnimation = model.hasAnimation && !model.disableAnimation;
        if (modelNeedsAnimation && instance.boneMatrices.empty()) {
            continue;  // Bones not yet computed — skip to avoid bind-pose flash
        }
        bool needsBones = modelNeedsAnimation && !instance.boneMatrices.empty();
        if (needsBones && (!instance.boneBuffer[frameIndex] || !instance.boneSet[frameIndex])) {
            continue;  // Bone buffers not yet allocated — skip to avoid bind-pose flash
        }
        bool useBones = needsBones;
        if (useBones) {
-            // Lazy-allocate bone SSBO on first use
+            // Upload bone matrices only when recomputed (per-frame-index tracking
-            if (!instance.boneBuffer[frameIndex]) {
+            // ensures both double-buffered SSBOs get the latest bone data)
-                VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
+            if (instance.bonesDirty[frameIndex] && instance.boneMapped[frameIndex]) {
                bci.size = 128 * sizeof(glm::mat4); // max 128 bones
                bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
                VmaAllocationCreateInfo aci{};
                aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
                aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
                VmaAllocationInfo allocInfo{};
                vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci,
                                &instance.boneBuffer[frameIndex], &instance.boneAlloc[frameIndex], &allocInfo);
                instance.boneMapped[frameIndex] = allocInfo.pMappedData;
                // Allocate descriptor set for bone SSBO
                instance.boneSet[frameIndex] = allocateBoneSet();
                if (instance.boneSet[frameIndex]) {
                    VkDescriptorBufferInfo bufInfo{};
                    bufInfo.buffer = instance.boneBuffer[frameIndex];
                    bufInfo.offset = 0;
                    bufInfo.range = bci.size;
                    VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
                    write.dstSet = instance.boneSet[frameIndex];
                    write.dstBinding = 0;
                    write.descriptorCount = 1;
                    write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                    write.pBufferInfo = &bufInfo;
                    vkUpdateDescriptorSets(vkCtx_->getDevice(), 1, &write, 0, nullptr);
                }
            }
            // Upload bone matrices
            if (instance.boneMapped[frameIndex]) {
                int numBones = std::min(static_cast<int>(instance.boneMatrices.size()), 128);
                memcpy(instance.boneMapped[frameIndex], instance.boneMatrices.data(),
                       numBones * sizeof(glm::mat4));
                instance.bonesDirty[frameIndex] = false;
            }
            // Bind bone descriptor set (set 2)
@ -2384,12 +2454,8 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
        else if (entry.distSq > 40.0f * 40.0f) desiredLOD = 1;
        uint16_t targetLOD = desiredLOD;
-        if (desiredLOD > 0) {
+        if (desiredLOD > 0 && !(model.availableLODs & (1u << desiredLOD))) {
-            bool hasDesiredLOD = false;
+            targetLOD = 0;
            for (const auto& b : model.batches) {
                if (b.submeshLevel == desiredLOD) { hasDesiredLOD = true; break; }
            }
            if (!hasDesiredLOD) targetLOD = 0;
        }
        const bool foliageLikeModel = model.isFoliageLike;
@ -3597,7 +3663,7 @@ void M2Renderer::rebuildSpatialIndex() {
 void M2Renderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& queryMax,
                                  std::vector<size_t>& outIndices) const {
    outIndices.clear();
-    candidateIdScratch.clear();
+    tl_m2_candidateIdScratch.clear();
    GridCell minCell = toCell(queryMin);
    GridCell maxCell = toCell(queryMax);
@ -3607,7 +3673,7 @@ void M2Renderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& qu
                auto it = spatialGrid.find(GridCell{x, y, z});
                if (it == spatialGrid.end()) continue;
                for (uint32_t id : it->second) {
-                    if (!candidateIdScratch.insert(id).second) continue;
+                    if (!tl_m2_candidateIdScratch.insert(id).second) continue;
                    auto idxIt = instanceIndexById.find(id);
                    if (idxIt != instanceIndexById.end()) {
                        outIndices.push_back(idxIt->second);
@ -3780,9 +3846,9 @@ std::optional<float> M2Renderer::getFloorHeight(float glX, float glY, float glZ,
    glm::vec3 queryMin(glX - 2.0f, glY - 2.0f, glZ - 6.0f);
    glm::vec3 queryMax(glX + 2.0f, glY + 2.0f, glZ + 8.0f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_m2_candidateScratch);
-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_m2_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3804,14 +3870,14 @@ std::optional<float> M2Renderer::getFloorHeight(float glX, float glY, float glZ,
            model.collision.getFloorTrisInRange(
                localPos.x - 1.0f, localPos.y - 1.0f,
                localPos.x + 1.0f, localPos.y + 1.0f,
-                collisionTriScratch_);
+                tl_m2_collisionTriScratch);
            glm::vec3 rayOrigin(localPos.x, localPos.y, localPos.z + 5.0f);
            glm::vec3 rayDir(0.0f, 0.0f, -1.0f);
            float bestHitZ = -std::numeric_limits<float>::max();
            bool hitAny = false;
-            for (uint32_t ti : collisionTriScratch_) {
+            for (uint32_t ti : tl_m2_collisionTriScratch) {
                if (ti >= model.collision.triCount) continue;
                if (model.collision.triBounds[ti].maxZ < localPos.z - 10.0f ||
                    model.collision.triBounds[ti].minZ > localPos.z + 5.0f) continue;
@ -3926,10 +3992,10 @@ bool M2Renderer::checkCollision(const glm::vec3& from, const glm::vec3& to,
    glm::vec3 queryMin = glm::min(from, to) - glm::vec3(7.0f, 7.0f, 5.0f);
    glm::vec3 queryMax = glm::max(from, to) + glm::vec3(7.0f, 7.0f, 5.0f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_m2_candidateScratch);
    // Check against all M2 instances in local space (rotation-aware).
-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_m2_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3962,14 +4028,14 @@ bool M2Renderer::checkCollision(const glm::vec3& from, const glm::vec3& to,
                std::min(localFrom.y, localPos.y) - localRadius - 1.0f,
                std::max(localFrom.x, localPos.x) + localRadius + 1.0f,
                std::max(localFrom.y, localPos.y) + localRadius + 1.0f,
-                collisionTriScratch_);
+                tl_m2_collisionTriScratch);
            constexpr float PLAYER_HEIGHT = 2.0f;
            constexpr float MAX_TOTAL_PUSH = 0.02f; // Cap total push per instance
            bool pushed = false;
            float totalPushX = 0.0f, totalPushY = 0.0f;
-            for (uint32_t ti : collisionTriScratch_) {
+            for (uint32_t ti : tl_m2_collisionTriScratch) {
                if (ti >= model.collision.triCount) continue;
                if (localPos.z + PLAYER_HEIGHT < model.collision.triBounds[ti].minZ ||
                    localPos.z > model.collision.triBounds[ti].maxZ) continue;
@ -4167,9 +4233,9 @@ float M2Renderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3&
    glm::vec3 rayEnd = origin + direction * maxDistance;
    glm::vec3 queryMin = glm::min(origin, rayEnd) - glm::vec3(1.0f);
    glm::vec3 queryMax = glm::max(origin, rayEnd) + glm::vec3(1.0f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_m2_candidateScratch);
-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_m2_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
--- a/src/rendering/performance_hud.cpp
+++ b/src/rendering/performance_hud.cpp
@ -1,5 +1,6 @@
 #include "rendering/performance_hud.hpp"
 #include "rendering/renderer.hpp"
 #include "rendering/vk_context.hpp"
 #include "rendering/terrain_renderer.hpp"
 #include "rendering/terrain_manager.hpp"
 #include "rendering/water_renderer.hpp"
@ -187,6 +188,19 @@ void PerformanceHUD::render(const Renderer* renderer, const Camera* camera) {
                           0, nullptr, 0.0f, 33.33f, ImVec2(200, 40));
        }
        // FSR info
        if (renderer->isFSREnabled()) {
            ImGui::TextColored(ImVec4(0.4f, 1.0f, 0.4f, 1.0f), "FSR 1.0: ON");
            auto* ctx = renderer->getVkContext();
            if (ctx) {
                auto ext = ctx->getSwapchainExtent();
                float sf = renderer->getFSRScaleFactor();
                uint32_t iw = static_cast<uint32_t>(ext.width * sf) & ~1u;
                uint32_t ih = static_cast<uint32_t>(ext.height * sf) & ~1u;
                ImGui::Text("  %ux%u -> %ux%u (%.0f%%)", iw, ih, ext.width, ext.height, sf * 100.0f);
            }
        }
        ImGui::Spacing();
    }
--- a/src/rendering/renderer.cpp
+++ b/src/rendering/renderer.cpp
--- a/src/rendering/terrain_manager.cpp
+++ b/src/rendering/terrain_manager.cpp
@ -199,13 +199,29 @@ void TerrainManager::update(const Camera& camera, float deltaTime) {
        currentTile = newTile;
    }
-    // Stream tiles if we've moved significantly or initial load
+    // Stream tiles when player crosses a tile boundary
    if (newTile.x != lastStreamTile.x || newTile.y != lastStreamTile.y) {
        LOG_DEBUG("Streaming: cam=(", camPos.x, ",", camPos.y, ",", camPos.z,
                 ") tile=[", newTile.x, ",", newTile.y,
                 "] loaded=", loadedTiles.size());
        streamTiles();
        lastStreamTile = newTile;
    } else {
        // Proactive loading: when workers are idle, periodically re-check for
        // unloaded tiles within range. Throttled to avoid hitching right after
        // world load when many tiles finalize simultaneously.
        proactiveStreamTimer_ += deltaTime;
        if (proactiveStreamTimer_ >= 2.0f) {
            proactiveStreamTimer_ = 0.0f;
            bool workersIdle;
            {
                std::lock_guard<std::mutex> lock(queueMutex);
                workersIdle = loadQueue.empty();
            }
            if (workersIdle) {
                streamTiles();
            }
        }
    }
 }
@ -800,7 +816,7 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
            }
            bool allDone = terrainRenderer->loadTerrainIncremental(
                pending->mesh, pending->terrain.textures, x, y,
-                ft.terrainChunkNext, 32);
+                ft.terrainChunkNext, 16);
            if (!allDone) {
                return false; // More chunks remain — yield to time budget
            }
@ -830,11 +846,19 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
    }
    case FinalizationPhase::M2_MODELS: {
-        // Upload multiple M2 models per call (batched GPU uploads)
+        // Upload multiple M2 models per call (batched GPU uploads).
        // When no more tiles are queued for background parsing, increase the
        // per-frame budget so idle workers don't waste time waiting for the
        // main thread to trickle-upload models.
        if (m2Renderer && ft.m2ModelIndex < pending->m2Models.size()) {
            // Set pre-decoded BLP cache so loadTexture() skips main-thread BLP decode
            m2Renderer->setPredecodedBLPCache(&pending->preloadedM2Textures);
-            constexpr size_t kModelsPerStep = 4;
+            bool workersIdle;
            {
                std::lock_guard<std::mutex> lk(queueMutex);
                workersIdle = loadQueue.empty() && readyQueue.empty();
            }
            const size_t kModelsPerStep = workersIdle ? 6 : 4;
            size_t uploaded = 0;
            while (ft.m2ModelIndex < pending->m2Models.size() && uploaded < kModelsPerStep) {
                auto& m2Ready = pending->m2Models[ft.m2ModelIndex];
@ -896,7 +920,12 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
            wmoRenderer->setPredecodedBLPCache(&pending->preloadedWMOTextures);
            wmoRenderer->setDeferNormalMaps(true);
-            constexpr size_t kWmosPerStep = 1;
+            bool wmoWorkersIdle;
            {
                std::lock_guard<std::mutex> lk(queueMutex);
                wmoWorkersIdle = loadQueue.empty() && readyQueue.empty();
            }
            const size_t kWmosPerStep = wmoWorkersIdle ? 2 : 1;
            size_t uploaded = 0;
            while (ft.wmoModelIndex < pending->wmoModels.size() && uploaded < kWmosPerStep) {
                auto& wmoReady = pending->wmoModels[ft.wmoModelIndex];
@ -911,6 +940,8 @@ bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
            wmoRenderer->setDeferNormalMaps(false);
            wmoRenderer->setPredecodedBLPCache(nullptr);
            if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
            // All WMO models loaded — backfill normal/height maps that were skipped during streaming
            wmoRenderer->backfillNormalMaps();
        }
        ft.phase = FinalizationPhase::WMO_INSTANCES;
        return false;
@ -1176,7 +1207,7 @@ void TerrainManager::processReadyTiles() {
    // Async upload batch: record GPU copies into a command buffer, submit with
    // a fence, but DON'T wait.  The fence is polled on subsequent frames.
    // This eliminates the main-thread stall from vkWaitForFences entirely.
-    const int maxSteps = taxiStreamingMode_ ? 8 : 2;
+    const int maxSteps = taxiStreamingMode_ ? 4 : 1;
    int steps = 0;
    if (vkCtx) vkCtx->beginUploadBatch();
--- a/src/rendering/vk_context.cpp
+++ b/src/rendering/vk_context.cpp
@ -252,14 +252,22 @@ bool VkContext::createAllocator() {
 bool VkContext::createSwapchain(int width, int height) {
    vkb::SwapchainBuilder swapchainBuilder{physicalDevice, device, surface};
-    auto swapRet = swapchainBuilder
+    auto& builder = swapchainBuilder
        .set_desired_format({VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR})
-        .set_desired_present_mode(VK_PRESENT_MODE_FIFO_KHR) // VSync
        .set_desired_extent(static_cast<uint32_t>(width), static_cast<uint32_t>(height))
        .set_image_usage_flags(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT)
        .set_desired_min_image_count(2)
-        .set_old_swapchain(swapchain) // For recreation
+        .set_old_swapchain(swapchain);
-        .build();
+
+    // The present mode is chosen only here, from vsync_. Do not also seed FIFO in the
+    // builder chain above: vk-bootstrap's set_desired_present_mode() prepends to the
+    // candidate list while add_fallback_present_mode() appends, so a seeded FIFO would
+    // rank ahead of the MAILBOX / FIFO_RELAXED fallbacks and, being always available,
+    // would prevent them from ever being selected when IMMEDIATE is unsupported.
    if (vsync_) {
        builder.set_desired_present_mode(VK_PRESENT_MODE_FIFO_KHR);
    } else {
        builder.set_desired_present_mode(VK_PRESENT_MODE_IMMEDIATE_KHR);
        builder.add_fallback_present_mode(VK_PRESENT_MODE_MAILBOX_KHR);
        builder.add_fallback_present_mode(VK_PRESENT_MODE_FIFO_RELAXED_KHR);
    }
    auto swapRet = builder.build();
    if (!swapRet) {
        LOG_ERROR("Failed to create Vulkan swapchain: ", swapRet.error().message());
@ -1026,14 +1034,22 @@ bool VkContext::recreateSwapchain(int width, int height) {
    VkSwapchainKHR oldSwapchain = swapchain;
    vkb::SwapchainBuilder swapchainBuilder{physicalDevice, device, surface};
-    auto swapRet = swapchainBuilder
+    auto& builder = swapchainBuilder
        .set_desired_format({VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR})
-        .set_desired_present_mode(VK_PRESENT_MODE_FIFO_KHR)
        .set_desired_extent(static_cast<uint32_t>(width), static_cast<uint32_t>(height))
        .set_image_usage_flags(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT)
        .set_desired_min_image_count(2)
-        .set_old_swapchain(oldSwapchain)
+        .set_old_swapchain(oldSwapchain);
-        .build();
+
+    // The present mode is chosen only here, from vsync_. Do not also seed FIFO in the
+    // builder chain above: vk-bootstrap's set_desired_present_mode() prepends to the
+    // candidate list while add_fallback_present_mode() appends, so a seeded FIFO would
+    // rank ahead of the MAILBOX / FIFO_RELAXED fallbacks and, being always available,
+    // would prevent them from ever being selected when IMMEDIATE is unsupported.
    if (vsync_) {
        builder.set_desired_present_mode(VK_PRESENT_MODE_FIFO_KHR);
    } else {
        builder.set_desired_present_mode(VK_PRESENT_MODE_IMMEDIATE_KHR);
        builder.add_fallback_present_mode(VK_PRESENT_MODE_MAILBOX_KHR);
        builder.add_fallback_present_mode(VK_PRESENT_MODE_FIFO_RELAXED_KHR);
    }
    auto swapRet = builder.build();
    if (oldSwapchain) {
        vkDestroySwapchainKHR(device, oldSwapchain, nullptr);
--- a/src/rendering/wmo_renderer.cpp
+++ b/src/rendering/wmo_renderer.cpp
@ -48,6 +48,11 @@ size_t envSizeOrDefault(const char* name, size_t defValue) {
 }
 } // namespace
 // Thread-local scratch buffers for collision queries (allows concurrent getFloorHeight/checkWallCollision calls).
 // Being thread_local, each thread has its own copy, so a query on one thread never
 // overwrites the scratch data of a query running on another thread.
 // Candidate instance indices produced by gatherCandidates(); cleared at the start of each gather.
 static thread_local std::vector<size_t> tl_candidateScratch;
 // Triangle start indices filled in by getTrianglesInRange()/getWallTrianglesInRange().
 static thread_local std::vector<uint32_t> tl_triScratch;
 // Instance ids already emitted by gatherCandidates(), so an instance found in several grid cells is listed once.
 static thread_local std::unordered_set<uint32_t> tl_candidateIdScratch;
 static void transformAABB(const glm::mat4& modelMatrix,
                          const glm::vec3& localMin,
                          const glm::vec3& localMax,
@ -787,8 +792,8 @@ bool WMORenderer::loadModel(const pipeline::WMOModel& model, uint32_t id) {
            }
            // Build doodad's local transform (WoW coordinates)
-            // WMO doodads use quaternion rotation (X/Y swapped for correct orientation)
+            // WMO doodads use quaternion rotation
-            glm::quat fixedRotation(doodad.rotation.w, doodad.rotation.y, doodad.rotation.x, doodad.rotation.z);
+            glm::quat fixedRotation(doodad.rotation.w, doodad.rotation.x, doodad.rotation.y, doodad.rotation.z);
            glm::mat4 localTransform(1.0f);
            localTransform = glm::translate(localTransform, doodad.position);
@ -1288,7 +1293,7 @@ void WMORenderer::rebuildSpatialIndex() {
 void WMORenderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& queryMax,
                                   std::vector<size_t>& outIndices) const {
    outIndices.clear();
-    candidateIdScratch.clear();
+    tl_candidateIdScratch.clear();
    GridCell minCell = toCell(queryMin);
    GridCell maxCell = toCell(queryMax);
@ -1298,7 +1303,7 @@ void WMORenderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& q
                auto it = spatialGrid.find(GridCell{x, y, z});
                if (it == spatialGrid.end()) continue;
                for (uint32_t id : it->second) {
-                    if (!candidateIdScratch.insert(id).second) continue;
+                    if (!tl_candidateIdScratch.insert(id).second) continue;
                    auto idxIt = instanceIndexById.find(id);
                    if (idxIt != instanceIndexById.end()) {
                        outIndices.push_back(idxIt->second);
@ -1318,15 +1323,10 @@ void WMORenderer::gatherCandidates(const glm::vec3& queryMin, const glm::vec3& q
    }
 }
-void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera) {
+void WMORenderer::prepareRender() {
    ++currentFrameId;
-    if (!opaquePipeline_ || instances.empty()) {
+    // Update material UBOs if settings changed (mapped memory writes — main thread only)
        lastDrawCalls = 0;
        return;
    }
    // Update material UBOs if settings changed
    if (materialSettingsDirty_) {
        materialSettingsDirty_ = false;
        static const int pomSampleTable[] = { 16, 32, 64 };
@ -1335,7 +1335,6 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
            for (auto& group : model.groups) {
                for (auto& mb : group.mergedBatches) {
                    if (!mb.materialUBO) continue;
                    // Read existing UBO data, update normal/POM fields
                    VmaAllocationInfo allocInfo{};
                    vmaGetAllocationInfo(vkCtx_->getAllocator(), mb.materialUBOAlloc, &allocInfo);
                    if (allocInfo.pMappedData) {
@ -1351,6 +1350,13 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
            }
        }
    }
 }
 void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera) {
    if (!opaquePipeline_ || instances.empty()) {
        lastDrawCalls = 0;
        return;
    }
    lastDrawCalls = 0;
@ -1362,43 +1368,45 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
    lastPortalCulledGroups = 0;
    lastDistanceCulledGroups = 0;
-    // ── Phase 1: Parallel visibility culling ──────────────────────────
+    // ── Phase 1: Visibility culling ──────────────────────────
-    std::vector<size_t> visibleInstances;
+    visibleInstances_.clear();
    visibleInstances.reserve(instances.size());
    for (size_t i = 0; i < instances.size(); ++i) {
-        const auto& instance = instances[i];
+        if (loadedModels.count(instances[i].modelId))
-        if (loadedModels.find(instance.modelId) == loadedModels.end())
+            visibleInstances_.push_back(i);
            continue;
        visibleInstances.push_back(i);
    }
    glm::vec3 camPos = camera.getPosition();
    bool doPortalCull = portalCulling;
    bool doFrustumCull = false; // Temporarily disabled: can over-cull world WMOs
    bool doDistanceCull = distanceCulling;
-    auto cullInstance = [&](size_t instIdx) -> InstanceDrawList {
+    auto cullInstance = [&](size_t instIdx, InstanceDrawList& result) {
-        if (instIdx >= instances.size()) return InstanceDrawList{};
+        if (instIdx >= instances.size()) return;
        const auto& instance = instances[instIdx];
        auto mdlIt = loadedModels.find(instance.modelId);
-        if (mdlIt == loadedModels.end()) return InstanceDrawList{};
+        if (mdlIt == loadedModels.end()) return;
        const ModelData& model = mdlIt->second;
        InstanceDrawList result;
        result.instanceIndex = instIdx;
        result.visibleGroups.clear();
        result.portalCulled = 0;
        result.distanceCulled = 0;
-        // Portal-based visibility
+        // Portal-based visibility — use a flat sorted vector instead of unordered_set
-        std::unordered_set<uint32_t> portalVisibleGroups;
+        std::vector<uint32_t> portalVisibleGroups;
        bool usePortalCulling = doPortalCull && !model.portals.empty() && !model.portalRefs.empty();
        if (usePortalCulling) {
            std::unordered_set<uint32_t> pvgSet;
            glm::vec4 localCamPos = instance.invModelMatrix * glm::vec4(camPos, 1.0f);
            getVisibleGroupsViaPortals(model, glm::vec3(localCamPos), frustum,
-                                       instance.modelMatrix, portalVisibleGroups);
+                                       instance.modelMatrix, pvgSet);
            portalVisibleGroups.assign(pvgSet.begin(), pvgSet.end());
            std::sort(portalVisibleGroups.begin(), portalVisibleGroups.end());
        }
        for (size_t gi = 0; gi < model.groups.size(); ++gi) {
            if (usePortalCulling &&
-                portalVisibleGroups.find(static_cast<uint32_t>(gi)) == portalVisibleGroups.end()) {
+                !std::binary_search(portalVisibleGroups.begin(), portalVisibleGroups.end(),
                                    static_cast<uint32_t>(gi))) {
                result.portalCulled++;
                continue;
            }
@ -1414,62 +1422,18 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
                        continue;
                    }
                }
                if (doFrustumCull && !frustum.intersectsAABB(gMin, gMax))
                    continue;
            }
            result.visibleGroups.push_back(static_cast<uint32_t>(gi));
        }
        return result;
    };
-    // Dispatch culling — parallel when enough instances, sequential otherwise.
+    // Resize drawLists to match (reuses previous capacity)
-    std::vector<InstanceDrawList> drawLists;
+    drawLists_.resize(visibleInstances_.size());
    drawLists.reserve(visibleInstances.size());
-    static const size_t minParallelCullInstances = std::max<size_t>(
+    // Sequential culling (parallel dispatch overhead > savings for typical instance counts)
-        4, envSizeOrDefault("WOWEE_WMO_CULL_MT_MIN", 128));
+    for (size_t j = 0; j < visibleInstances_.size(); ++j) {
-    if (visibleInstances.size() >= minParallelCullInstances && numCullThreads_ > 1) {
+        cullInstance(visibleInstances_[j], drawLists_[j]);
        static const size_t minCullWorkPerThread = std::max<size_t>(
            16, envSizeOrDefault("WOWEE_WMO_CULL_WORK_PER_THREAD", 64));
        const size_t maxUsefulThreads = std::max<size_t>(
            1, (visibleInstances.size() + minCullWorkPerThread - 1) / minCullWorkPerThread);
        const size_t numThreads = std::min(static_cast<size_t>(numCullThreads_), maxUsefulThreads);
        if (numThreads <= 1) {
            for (size_t idx : visibleInstances) {
                drawLists.push_back(cullInstance(idx));
            }
        } else {
            const size_t chunkSize = visibleInstances.size() / numThreads;
            const size_t remainder = visibleInstances.size() % numThreads;
            drawLists.resize(visibleInstances.size());
            cullFutures_.clear();
            if (cullFutures_.capacity() < numThreads) {
                cullFutures_.reserve(numThreads);
            }
            size_t start = 0;
            for (size_t t = 0; t < numThreads; ++t) {
                const size_t end = start + chunkSize + (t < remainder ? 1 : 0);
                cullFutures_.push_back(std::async(std::launch::async,
                    [&, start, end]() {
                        for (size_t j = start; j < end; ++j) {
                            drawLists[j] = cullInstance(visibleInstances[j]);
                        }
                    }));
                start = end;
            }
            for (auto& f : cullFutures_) {
                f.get();
            }
        }
    } else {
        for (size_t idx : visibleInstances)
            drawLists.push_back(cullInstance(idx));
    }
    // ── Phase 2: Vulkan draw ────────────────────────────────
@ -1484,7 +1448,7 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
    // Track which pipeline is currently bound: 0=opaque, 1=transparent, 2=glass
    int currentPipelineKind = 0;
-    for (const auto& dl : drawLists) {
+    for (const auto& dl : drawLists_) {
        if (dl.instanceIndex >= instances.size()) continue;
        const auto& instance = instances[dl.instanceIndex];
        auto modelIt = loadedModels.find(instance.modelId);
@ -2412,6 +2376,69 @@ VkTexture* WMORenderer::loadTexture(const std::string& path) {
    return rawPtr;
 }
 // Generates normal/height maps for cached textures that do not have one yet, then
 // attaches the new maps to every merged batch that is still missing one and rewrites
 // binding 2 of that batch's material descriptor set.
 void WMORenderer::backfillNormalMaps() {
    // The maps are only produced when normal mapping or POM is on and assets can be read.
    const bool mapsWanted = normalMappingEnabled_ || pomEnabled_;
    if (!mapsWanted || !assetManager) return;
    // Pass 1: build a map for each cache entry that owns a texture but no normal/height map.
    int generated = 0;
    for (auto& [path, cached] : textureCache) {
        if (!cached.texture || cached.normalHeightMap) continue;
        // Re-load the BLP image (keyed by the cache key) to obtain pixel data.
        pipeline::BLPImage image = assetManager->loadTexture(path);
        const bool usable = image.isValid() && image.width != 0 && image.height != 0;
        if (!usable) continue;
        float heightVariance = 0.0f;
        auto generatedMap = generateNormalHeightMap(image.data.data(), image.width, image.height, heightVariance);
        if (!generatedMap) continue;
        cached.normalHeightMap = std::move(generatedMap);
        cached.heightMapVariance = heightVariance;
        ++generated;
    }
    // Nothing new was produced, so no batch needs to be touched.
    if (generated == 0) return;
    // Pass 2: hand the maps to merged batches and rebind their descriptor sets.
    // NOTE(review): descriptor sets are updated in place here — confirm that no in-flight
    // command buffer can still reference mb.materialSet when this runs.
    VkDevice device = vkCtx_->getDevice();
    int rebound = 0;
    for (auto& [modelId, model] : loadedModels) {
        for (auto& group : model.groups) {
            for (auto& mb : group.mergedBatches) {
                // Skip batches that already have a map or have no texture to match on.
                if (mb.normalHeightMap || !mb.texture) continue;
                // Locate the cache entry owning this batch's texture; only the first match counts.
                for (const auto& [cacheKey, cacheEntry] : textureCache) {
                    if (cacheEntry.texture.get() != mb.texture) continue;
                    if (!cacheEntry.normalHeightMap) break;
                    mb.normalHeightMap = cacheEntry.normalHeightMap.get();
                    mb.heightMapVariance = cacheEntry.heightMapVariance;
                    if (mb.materialSet) {
                        // Point binding 2 of the material set at the real normal/height map.
                        VkDescriptorImageInfo nhImgInfo = mb.normalHeightMap->descriptorInfo();
                        VkWriteDescriptorSet write{};
                        write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                        write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
                        write.descriptorCount = 1;
                        write.dstSet = mb.materialSet;
                        write.dstBinding = 2;
                        write.pImageInfo = &nhImgInfo;
                        vkUpdateDescriptorSets(device, 1, &write, 0, nullptr);
                        ++rebound;
                    }
                    break;
                }
            }
        }
    }
    // Mark material settings dirty so the UBO refresh guarded by this flag runs again.
    materialSettingsDirty_ = true;
    LOG_INFO("Backfilled ", generated, " normal/height maps (", rebound, " descriptor sets rebound) for deferred WMO textures");
 }
 // Ray-AABB intersection (slab method)
 // Returns true if the ray intersects the axis-aligned bounding box
 static bool rayIntersectsAABB(const glm::vec3& origin, const glm::vec3& dir,
@ -2808,9 +2835,9 @@ std::optional<float> WMORenderer::getFloorHeight(float glX, float glY, float glZ
        group.getTrianglesInRange(
            localOrigin.x - 1.0f, localOrigin.y - 1.0f,
            localOrigin.x + 1.0f, localOrigin.y + 1.0f,
-            triScratch_);
+            tl_triScratch);
-        for (uint32_t triStart : triScratch_) {
+        for (uint32_t triStart : tl_triScratch) {
            const glm::vec3& v0 = verts[indices[triStart]];
            const glm::vec3& v1 = verts[indices[triStart + 1]];
            const glm::vec3& v2 = verts[indices[triStart + 2]];
@ -2884,9 +2911,9 @@ std::optional<float> WMORenderer::getFloorHeight(float glX, float glY, float glZ
    // early-returned because overlapping WMO instances need full coverage).
    glm::vec3 queryMin(glX - 2.0f, glY - 2.0f, glZ - 8.0f);
    glm::vec3 queryMax(glX + 2.0f, glY + 2.0f, glZ + 10.0f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_candidateScratch);
-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3059,9 +3086,9 @@ bool WMORenderer::checkWallCollision(const glm::vec3& from, const glm::vec3& to,
    glm::vec3 queryMin = glm::min(from, to) - glm::vec3(8.0f, 8.0f, 5.0f);
    glm::vec3 queryMax = glm::max(from, to) + glm::vec3(8.0f, 8.0f, 5.0f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_candidateScratch);
-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3127,9 +3154,9 @@ bool WMORenderer::checkWallCollision(const glm::vec3& from, const glm::vec3& to,
            float rangeMinY = std::min(localFrom.y, localTo.y) - PLAYER_RADIUS - 1.5f;
            float rangeMaxX = std::max(localFrom.x, localTo.x) + PLAYER_RADIUS + 1.5f;
            float rangeMaxY = std::max(localFrom.y, localTo.y) + PLAYER_RADIUS + 1.5f;
-            group.getTrianglesInRange(rangeMinX, rangeMinY, rangeMaxX, rangeMaxY, triScratch_);
+            group.getTrianglesInRange(rangeMinX, rangeMinY, rangeMaxX, rangeMaxY, tl_triScratch);
-            for (uint32_t triStart : triScratch_) {
+            for (uint32_t triStart : tl_triScratch) {
                // Use pre-computed Z bounds for fast vertical reject
                const auto& tb = group.triBounds[triStart / 3];
@ -3145,18 +3172,13 @@ bool WMORenderer::checkWallCollision(const glm::vec3& from, const glm::vec3& to,
                if (triHeight < 1.0f && tb.maxZ <= localFeetZ + 1.2f) continue;
                // Use MOPY flags to filter wall collision.
-                // Collidable triangles (flag 0x01) block the player — including
+                // Collide with triangles that have the collision flag (0x08) or no flags at all.
-                // invisible collision walls (0x01 without 0x20) used in tunnels.
+                // Skip detail/decorative (0x04) and render-only (0x20 without 0x08) surfaces.
                // Skip detail/decorative geometry (0x04) and render-only surfaces.
                uint32_t triIdx = triStart / 3;
                if (!group.triMopyFlags.empty() && triIdx < group.triMopyFlags.size()) {
                    uint8_t mopy = group.triMopyFlags[triIdx];
                    if (mopy != 0) {
-                        bool collidable = (mopy & 0x01) != 0;
+                        if ((mopy & 0x04) || !(mopy & 0x08)) continue;
                        bool detail = (mopy & 0x04) != 0;
                        if (!collidable || detail) {
                            continue;
                        }
                    }
                }
@ -3217,8 +3239,8 @@ bool WMORenderer::checkWallCollision(const glm::vec3& from, const glm::vec3& to,
                    if (absNz >= 0.35f) continue;
                    const float SKIN = 0.005f;        // small separation so we don't re-collide immediately
-                    // Stronger push when inside WMO for more responsive indoor collision
+                    // Push must cover full penetration to prevent gradual clip-through
-                    const float MAX_PUSH = insideWMO ? 0.35f : 0.15f;
+                    const float MAX_PUSH = PLAYER_RADIUS;
                    float penetration = (PLAYER_RADIUS - horizDist);
                    float pushDist = glm::clamp(penetration + SKIN, 0.0f, MAX_PUSH);
                    glm::vec2 pushDir2;
@ -3302,9 +3324,9 @@ void WMORenderer::updateActiveGroup(float glX, float glY, float glZ) {
    glm::vec3 queryMin(glX - 0.5f, glY - 0.5f, glZ - 0.5f);
    glm::vec3 queryMax(glX + 0.5f, glY + 0.5f, glZ + 0.5f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_candidateScratch);
-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_candidateScratch) {
        const auto& instance = instances[idx];
        if (glX < instance.worldBoundsMin.x || glX > instance.worldBoundsMax.x ||
            glY < instance.worldBoundsMin.y || glY > instance.worldBoundsMax.y ||
@ -3348,9 +3370,9 @@ bool WMORenderer::isInsideWMO(float glX, float glY, float glZ, uint32_t* outMode
    QueryTimer timer(&queryTimeMs, &queryCallCount);
    glm::vec3 queryMin(glX - 0.5f, glY - 0.5f, glZ - 0.5f);
    glm::vec3 queryMax(glX + 0.5f, glY + 0.5f, glZ + 0.5f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_candidateScratch);
-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3397,9 +3419,9 @@ bool WMORenderer::isInsideWMO(float glX, float glY, float glZ, uint32_t* outMode
 bool WMORenderer::isInsideInteriorWMO(float glX, float glY, float glZ) const {
    glm::vec3 queryMin(glX - 0.5f, glY - 0.5f, glZ - 0.5f);
    glm::vec3 queryMax(glX + 0.5f, glY + 0.5f, glZ + 0.5f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_candidateScratch);
-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3453,9 +3475,9 @@ float WMORenderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3
    glm::vec3 rayEnd = origin + direction * maxDistance;
    glm::vec3 queryMin = glm::min(origin, rayEnd) - glm::vec3(1.0f);
    glm::vec3 queryMax = glm::max(origin, rayEnd) + glm::vec3(1.0f);
-    gatherCandidates(queryMin, queryMax, candidateScratch);
+    gatherCandidates(queryMin, queryMax, tl_candidateScratch);
-    for (size_t idx : candidateScratch) {
+    for (size_t idx : tl_candidateScratch) {
        const auto& instance = instances[idx];
        if (collisionFocusEnabled &&
            pointAABBDistanceSq(collisionFocusPos, instance.worldBoundsMin, instance.worldBoundsMax) > collisionFocusRadiusSq) {
@ -3509,9 +3531,9 @@ float WMORenderer::raycastBoundingBoxes(const glm::vec3& origin, const glm::vec3
            float rMinY = std::min(localOrigin.y, localEnd.y) - 1.0f;
            float rMaxX = std::max(localOrigin.x, localEnd.x) + 1.0f;
            float rMaxY = std::max(localOrigin.y, localEnd.y) + 1.0f;
-            group.getWallTrianglesInRange(rMinX, rMinY, rMaxX, rMaxY, triScratch_);
+            group.getWallTrianglesInRange(rMinX, rMinY, rMaxX, rMaxY, tl_triScratch);
-            for (uint32_t triStart : triScratch_) {
+            for (uint32_t triStart : tl_triScratch) {
                const glm::vec3& v0 = verts[indices[triStart]];
                const glm::vec3& v1 = verts[indices[triStart + 1]];
                const glm::vec3& v2 = verts[indices[triStart + 2]];
--- a/src/ui/game_screen.cpp
+++ b/src/ui/game_screen.cpp
@ -317,6 +317,20 @@ void GameScreen::render(game::GameHandler& gameHandler) {
        }
    }
    // Apply saved FSR setting once when renderer is available
    if (!fsrSettingsApplied_ && pendingFSR) {
        auto* renderer = core::Application::getInstance().getRenderer();
        if (renderer) {
            static const float fsrScales[] = { 0.77f, 0.67f, 0.59f, 0.50f };
            renderer->setFSRQuality(fsrScales[pendingFSRQuality]);
            renderer->setFSRSharpness(pendingFSRSharpness);
            renderer->setFSREnabled(true);
            fsrSettingsApplied_ = true;
        }
    } else {
        fsrSettingsApplied_ = true;
    }
    // Apply auto-loot setting to GameHandler every frame (cheap bool sync)
    gameHandler.setAutoLoot(pendingAutoLoot);
@ -2687,6 +2701,12 @@ void GameScreen::sendChatMessage(game::GameHandler& gameHandler) {
                chatInputBuffer[0] = '\0';
                return;
            }
            // /unstuckhearth command — teleport to hearthstone bind point
            if (cmdLower == "unstuckhearth") {
                gameHandler.unstuckHearth();
                chatInputBuffer[0] = '\0';
                return;
            }
            // /transport board — board test transport
            if (cmdLower == "transport board") {
@ -6250,7 +6270,7 @@ void GameScreen::renderSettingsWindow() {
                if (pendingShadows) {
                    ImGui::SameLine();
                    ImGui::SetNextItemWidth(150.0f);
-                    if (ImGui::SliderFloat("Distance##shadow", &pendingShadowDistance, 40.0f, 200.0f, "%.0f")) {
+                    if (ImGui::SliderFloat("Distance##shadow", &pendingShadowDistance, 40.0f, 500.0f, "%.0f")) {
                        if (renderer) renderer->setShadowDistance(pendingShadowDistance);
                        saveSettings();
                    }
@ -6261,7 +6281,13 @@ void GameScreen::renderSettingsWindow() {
                }
                {
                    const char* aaLabels[] = { "Off", "2x MSAA", "4x MSAA", "8x MSAA" };
-                    if (ImGui::Combo("Anti-Aliasing", &pendingAntiAliasing, aaLabels, 4)) {
+                    bool fsr2Active = renderer && renderer->isFSR2Enabled();
                    if (fsr2Active) {
                        ImGui::BeginDisabled();
                        int disabled = 0;
                        ImGui::Combo("Anti-Aliasing (FSR2)", &disabled, "Off (FSR2 active)\0", 1);
                        ImGui::EndDisabled();
                    } else if (ImGui::Combo("Anti-Aliasing", &pendingAntiAliasing, aaLabels, 4)) {
                        static const VkSampleCountFlagBits aaSamples[] = {
                            VK_SAMPLE_COUNT_1_BIT, VK_SAMPLE_COUNT_2_BIT,
                            VK_SAMPLE_COUNT_4_BIT, VK_SAMPLE_COUNT_8_BIT
@ -6270,6 +6296,33 @@ void GameScreen::renderSettingsWindow() {
                        saveSettings();
                    }
                }
                // Upscaling (FSR) controls
                {
                    // The combo index doubles as the mode id:
                    // 0 = off, 1 = FSR 1.0 (spatial), 2 = FSR 2.2 (temporal).
                    const char* upscalerNames[] = { "Off", "FSR 1.0 (Spatial)", "FSR 2.2 (Temporal)" };

                    // The displayed mode is derived each frame: the saved FSR 1.0
                    // toggle picks 0/1, and a renderer reporting FSR2 overrides it.
                    const bool savedSpatial = pendingFSR;
                    const bool temporalActive = renderer && renderer->isFSR2Enabled();
                    int upscalerMode = temporalActive ? 2 : (savedSpatial ? 1 : 0);

                    const bool modeChanged = ImGui::Combo("Upscaling", &upscalerMode, upscalerNames, 3);
                    if (modeChanged) {
                        const bool wantSpatial = (upscalerMode == 1);
                        const bool wantTemporal = (upscalerMode == 2);
                        // pendingFSR tracks only the FSR 1.0 choice.
                        pendingFSR = wantSpatial;
                        if (renderer) {
                            renderer->setFSREnabled(wantSpatial);
                            renderer->setFSR2Enabled(wantTemporal);
                        }
                        saveSettings();
                    }

                    // Quality and sharpness controls are shown for either FSR mode.
                    if (upscalerMode >= 1) {
                        // Render-scale factor matching each preset label below.
                        static const float scaleByPreset[] = { 0.77f, 0.67f, 0.59f, 0.50f };
                        const char* presetNames[] = { "Ultra Quality (77%)", "Quality (67%)", "Balanced (59%)", "Performance (50%)" };

                        const bool presetChanged = ImGui::Combo("FSR Quality", &pendingFSRQuality, presetNames, 4);
                        if (presetChanged) {
                            if (renderer) {
                                renderer->setFSRQuality(scaleByPreset[pendingFSRQuality]);
                            }
                            saveSettings();
                        }

                        const bool sharpnessChanged = ImGui::SliderFloat("FSR Sharpness", &pendingFSRSharpness, 0.0f, 2.0f, "%.1f");
                        if (sharpnessChanged) {
                            if (renderer) {
                                renderer->setFSRSharpness(pendingFSRSharpness);
                            }
                            saveSettings();
                        }
                    }
                }
                if (ImGui::SliderInt("Ground Clutter Density", &pendingGroundClutterDensity, 0, 150, "%d%%")) {
                    if (renderer) {
                        if (auto* tm = renderer->getTerrainManager()) {
@ -6348,7 +6401,7 @@ void GameScreen::renderSettingsWindow() {
                    pendingFullscreen = kDefaultFullscreen;
                    pendingVsync = kDefaultVsync;
                    pendingShadows = kDefaultShadows;
-                    pendingShadowDistance = 72.0f;
+                    pendingShadowDistance = 300.0f;
                    pendingGroundClutterDensity = kDefaultGroundClutterDensity;
                    pendingAntiAliasing = 0;
                    pendingNormalMapping = true;
@ -7384,6 +7437,9 @@ void GameScreen::saveSettings() {
    out << "normal_map_strength=" << pendingNormalMapStrength << "\n";
    out << "pom=" << (pendingPOM ? 1 : 0) << "\n";
    out << "pom_quality=" << pendingPOMQuality << "\n";
    // FSR upscaling: FSR 1.0 toggle (0/1), quality preset index, sharpness.
    // NOTE(review): "fsr" reflects pendingFSR, which the settings UI sets only
    // for the FSR 1.0 mode; the FSR 2.2 selection does not appear to be
    // persisted here — confirm whether that is intentional.
    out << "fsr=" << (pendingFSR ? 1 : 0) << "\n";
    out << "fsr_quality=" << pendingFSRQuality << "\n";
    out << "fsr_sharpness=" << pendingFSRSharpness << "\n";
    // Controls
    out << "mouse_sensitivity=" << pendingMouseSensitivity << "\n";
@ -7463,13 +7519,16 @@ void GameScreen::loadSettings() {
            else if (key == "auto_loot") pendingAutoLoot = (std::stoi(val) != 0);
            else if (key == "ground_clutter_density") pendingGroundClutterDensity = std::clamp(std::stoi(val), 0, 150);
            else if (key == "shadows") pendingShadows = (std::stoi(val) != 0);
-            else if (key == "shadow_distance") pendingShadowDistance = std::clamp(std::stof(val), 40.0f, 200.0f);
+            else if (key == "shadow_distance") pendingShadowDistance = std::clamp(std::stof(val), 40.0f, 500.0f);
            else if (key == "water_refraction") pendingWaterRefraction = (std::stoi(val) != 0);
            else if (key == "antialiasing") pendingAntiAliasing = std::clamp(std::stoi(val), 0, 3);
            else if (key == "normal_mapping") pendingNormalMapping = (std::stoi(val) != 0);
            else if (key == "normal_map_strength") pendingNormalMapStrength = std::clamp(std::stof(val), 0.0f, 2.0f);
            else if (key == "pom") pendingPOM = (std::stoi(val) != 0);
            else if (key == "pom_quality") pendingPOMQuality = std::clamp(std::stoi(val), 0, 2);
            else if (key == "fsr") pendingFSR = (std::stoi(val) != 0);
            else if (key == "fsr_quality") pendingFSRQuality = std::clamp(std::stoi(val), 0, 3);
            else if (key == "fsr_sharpness") pendingFSRSharpness = std::clamp(std::stof(val), 0.0f, 2.0f);
            // Controls
            else if (key == "mouse_sensitivity") pendingMouseSensitivity = std::clamp(std::stof(val), 0.05f, 1.0f);
            else if (key == "invert_mouse") pendingInvertMouse = (std::stoi(val) != 0);
Author	SHA1	Message	Date
Kelsi	e94eb7f2d1	FSR2 temporal upscaling fixes: unjittered reprojection, sharpen Y-flip, MSAA guard, descriptor double-buffering Some checks are pending Build / Build (arm64) (push) Waiting to run Details Build / Build (x86-64) (push) Waiting to run Details Build / Build (macOS arm64) (push) Waiting to run Details Build / Build (windows-arm64) (push) Waiting to run Details Build / Build (windows-x86-64) (push) Waiting to run Details Security / CodeQL (C/C++) (push) Waiting to run Details Security / Semgrep (push) Waiting to run Details Security / Sanitizer Build (ASan/UBSan) (push) Waiting to run Details - Motion vectors: single unjittered reprojection matrix (80 bytes) instead of two jittered matrices (160 bytes), eliminating numerical instability from jitter amplification through large world coordinates - Sharpen pass: fix Y-flip for correct UV sampling, double-buffer descriptor sets to avoid race with in-flight command buffers - MSAA: auto-disable when FSR2 enabled, grey out AA setting in UI - Accumulation: variance-based neighborhood clamping in YCoCg space, correct history layout transitions - Frame index: wrap at 256 for stable Halton sequence	2026-03-08 01:22:15 -08:00
Kelsi	52317d1edd	Implement FSR 2.2 temporal upscaling Full FSR 2.2 pipeline with depth-based motion vector reprojection, temporal accumulation with YCoCg neighborhood clamping, and RCAS contrast-adaptive sharpening. Architecture (designed for FSR 3.x frame generation readiness): - Camera: Halton(2,3) sub-pixel jitter with unjittered projection stored separately for motion vector computation - Motion vectors: compute shader reconstructs world position from depth + inverse VP, reprojects with previous frame's VP - Temporal accumulation: compute shader blends 5-10% current frame with 90-95% clamped history, adaptive blend for disocclusion - History: ping-pong R16G16B16A16 buffers at display resolution - Sharpening: RCAS fragment pass with contrast-adaptive weights Integration: - FSR2 replaces both FSR1 and MSAA when enabled - Scene renders to internal resolution framebuffer (no MSAA) - Compute passes run between scene and swapchain render passes - Camera cut detection resets history on teleport - Quality presets shared with FSR1 (0.50-0.77 scale factors) - UI: "Upscaling" combo with Off/FSR 1.0/FSR 2.2 options	2026-03-07 23:13:01 -08:00
Kelsi	0ffeabd4ed	Revert "Further reduce tile streaming aggressiveness" This reverts commit `f681a8b361`.	2026-03-07 23:02:25 -08:00
Kelsi	f681a8b361	Further reduce tile streaming aggressiveness - Load radius: 4→3 (normal), 6→5 (taxi) - Terrain chunks per step: 16→8 - M2 models per step: 6→2 (removed idle boost) - WMO models per step: 2→1 (removed idle boost) - WMO doodads per step: 4→2 - All budgets now constant (no idle-vs-busy branching)	2026-03-07 22:55:02 -08:00
Kelsi	7f573fc06b	Reduce tile finalization aggressiveness to prevent spawn hitching - Reduce max finalization steps per frame: 2→1 (normal), 8→4 (taxi) - Reduce terrain chunk upload batch: 32→16 chunks per step - Reduce idle M2 model upload budget: 16→6 per step - Reduce idle WMO model upload budget: 4→2 per step Tiles still stream in quickly but spread GPU upload work across more frames, eliminating the frame spikes right after spawning.	2026-03-07 22:51:59 -08:00
Kelsi	ac3c90dd75	Fix M2 animated instance flashing (deer/bird/critter pop-in) Root cause: bonesDirty was a single bool shared across both double-buffered frame indices. When bones were copied to frame 0's SSBO and bonesDirty cleared, frame 1's newly-allocated SSBO would contain garbage/zeros and never get populated — causing animated M2 instances to flash invisible on alternating frames. Fix: Make bonesDirty per-frame-index (bool[2]) so each buffer independently tracks whether it needs bone data uploaded. When bones are recomputed, both indices are marked dirty. When uploaded during render, only the current frame index is cleared. New buffer allocations in prepareRender force their frame index dirty.	2026-03-07 22:47:07 -08:00
Kelsi	6cf08fbaa6	Throttle proactive tile streaming to reduce post-load hitching Add 2-second cooldown timer before re-checking for unloaded tiles when workers are idle, preventing excessive streamTiles() calls that caused frame hitches right after world load.	2026-03-07 22:40:07 -08:00
Kelsi	c13dbf2198	Proactive tile streaming, faster finalization, tree trunk collision - Re-check for unloaded tiles when workers are idle (no tile boundary needed) - Increase M2 upload budget 4→16 and WMO 1→4 per frame when not under pressure - Lower tree collision threshold from 40 to 6 units so large trees block movement	2026-03-07 22:35:18 -08:00
Kelsi	4cb03c38fe	Parallel animation updates, thread-safe collision, M2 pop-in fix, shadow stabilization - Overlap M2 and character animation updates via std::async (~2-5ms saved) - Thread-local collision scratch buffers for concurrent floor queries - Parallel terrain/WMO/M2 floor queries in camera controller - Seed new M2 instance bones from existing siblings to eliminate pop-in flash - Fix shadow flicker: snap center along stable light axes instead of in view space - Increase shadow distance default to 300 units (slider max 500)	2026-03-07 22:29:06 -08:00
Kelsi	a4966e486f	Fix WMO wall collision, normal mapping, POM backfill, and M2/WMO rendering performance - Fix MOPY flag check (0x08 not 0x01) for proper wall collision detection - Cap MAX_PUSH to PLAYER_RADIUS to prevent gradual clip-through - Fix WMO doodad quaternion component ordering (X/Y swap) - Linear normal map strength blend in shader for smooth slider control - Enable shadow sampling for interior WMO groups (covered outdoor areas) - Backfill deferred normal/height maps after streaming with descriptor rebind - M2: prepareRender only iterates animated instances, bone dirty flag - M2: remove worker thread VMA allocation, skip unready bone instances - WMO: persistent visibility vectors, sequential culling - Add FSR EASU/RCAS shaders	2026-03-07 22:03:28 -08:00
Kelsi	16c6c2b6a0	Raise diagnostic log thresholds to reduce log noise SLOW update stages: 3ms → 50ms, renderer update: 5ms → 50ms, loadModel/processAsync/spawnCreature: 3ms → 100ms, terrain/camera: 3-5ms → 50ms. Remove per-frame spawn breakdown.	2026-03-07 18:43:13 -08:00
Kelsi	02cf0e4df3	Background normal map generation, queue-draining load screen warmup - Normal map CPU work (luminance→blur→Sobel) moved to background threads, main thread only does GPU upload (~1-2ms vs 15-22ms per texture) - Load screen warmup now waits until ALL spawn/equipment/gameobject queues are drained before transitioning (prevents naked character, NPC pop-in) - Exit condition: min 2s + 5 consecutive empty iterations, hard cap 15s - Equipment queue processes 8 items per warmup iteration instead of 1 - Added LoadingScreen::renderOverlay() for future world-behind-loading use	2026-03-07 18:40:24 -08:00
Kelsi	63efac9fa6	Unlimited creature model uploads during load screen, remove duplicate code Loading screen now calls processCreatureSpawnQueue(unlimited=true) which removes the 1-upload-per-frame cap and 2ms time budget, allowing all pending creature models to upload to GPU in bulk. Also increases concurrent async background loads from 4 to 16 during load screen. Replaces 40-line inline duplicate of processAsyncCreatureResults with the shared function.	2026-03-07 17:31:47 -08:00