diff --git a/CMakeLists.txt b/CMakeLists.txt
index f61b4024..88daaa4a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -605,6 +605,7 @@ set(WOWEE_SOURCES
     src/rendering/wmo_renderer.cpp
     src/rendering/m2_renderer.cpp
     src/rendering/m2_model_classifier.cpp
+    src/rendering/render_graph.cpp
     src/rendering/quest_marker_renderer.cpp
     src/rendering/minimap.cpp
     src/rendering/world_map.cpp
diff --git a/assets/shaders/m2.vert.glsl b/assets/shaders/m2.vert.glsl
index 6f4545c8..a5913ca2 100644
--- a/assets/shaders/m2.vert.glsl
+++ b/assets/shaders/m2.vert.glsl
@@ -13,19 +13,29 @@ layout(set = 0, binding = 0) uniform PerFrame {
     vec4 shadowParams;
 };
 
+// Phase 2.1: Per-draw push constants (batch-level data only)
 layout(push_constant) uniform Push {
-    mat4 model;
-    vec2 uvOffset;
-    int texCoordSet;
-    int useBones;
-    int isFoliage;
-    float fadeAlpha;
+    int texCoordSet;         // UV set index (0 or 1)
+    int isFoliage;           // Foliage wind animation flag
+    int instanceDataOffset;  // Base index into InstanceSSBO for this draw group
 } push;
 
 layout(set = 2, binding = 0) readonly buffer BoneSSBO {
     mat4 bones[];
 };
 
+// Phase 2.1: Per-instance data, indexed in main() by push.instanceDataOffset + gl_InstanceIndex (GPU instancing)
+struct InstanceData {
+    mat4 model;       // transform applied to the (optionally skinned) vertex to get worldPos
+    vec2 uvOffset;    // added to the selected texture coordinate set
+    float fadeAlpha;  // forwarded unchanged to the fragment stage as vFadeAlpha
+    int useBones;     // non-zero enables bone skinning for this instance
+    int boneBase;     // base index added to each per-vertex bone index when reading bones[]
+};
+layout(set = 3, binding = 0) readonly buffer InstanceSSBO {
+    InstanceData instanceData[];  // one entry per drawn instance
+};
+
 layout(location = 0) in vec3 aPos;
 layout(location = 1) in vec3 aNormal;
 layout(location = 2) in vec2 aTexCoord;
@@ -41,15 +51,23 @@ layout(location = 4) out float ModelHeight;
 layout(location = 5) out float vFadeAlpha;
 
 void main() {
+    // Phase 2.1: Fetch per-instance data from SSBO
+    int instIdx = push.instanceDataOffset + gl_InstanceIndex;
+    mat4 model = instanceData[instIdx].model;
+    vec2 uvOff = instanceData[instIdx].uvOffset;
+    float fade = instanceData[instIdx].fadeAlpha;
+    int uBones = instanceData[instIdx].useBones;
+    int bBase  = instanceData[instIdx].boneBase;
+
     vec4 pos = vec4(aPos, 1.0);
     vec4 norm = vec4(aNormal, 0.0);
 
-    if (push.useBones != 0) {
+    if (uBones != 0) {
         ivec4 bi = ivec4(aBoneIndicesF);
-        mat4 skinMat = bones[bi.x] * aBoneWeights.x
-                     + bones[bi.y] * aBoneWeights.y
-                     + bones[bi.z] * aBoneWeights.z
-                     + bones[bi.w] * aBoneWeights.w;
+        mat4 skinMat = bones[bBase + bi.x] * aBoneWeights.x
+                     + bones[bBase + bi.y] * aBoneWeights.y
+                     + bones[bBase + bi.z] * aBoneWeights.z
+                     + bones[bBase + bi.w] * aBoneWeights.w;
         pos = skinMat * pos;
         norm = skinMat * norm;
     }
@@ -57,7 +75,7 @@ void main() {
     // Wind animation for foliage
     if (push.isFoliage != 0) {
         float windTime = fogParams.z;
-        vec3 worldRef = push.model[3].xyz;
+        vec3 worldRef = model[3].xyz;
         float heightFactor = clamp(pos.z / 20.0, 0.0, 1.0);
         heightFactor *= heightFactor; // quadratic — base stays grounded
 
@@ -80,15 +98,15 @@ void main() {
         pos.y += trunkSwayY + branchSwayY + leafFlutterY;
     }
 
-    vec4 worldPos = push.model * pos;
+    vec4 worldPos = model * pos;
     FragPos = worldPos.xyz;
-    Normal = mat3(push.model) * norm.xyz;
+    Normal = mat3(model) * norm.xyz;
 
-    TexCoord = (push.texCoordSet == 1 ? aTexCoord2 : aTexCoord) + push.uvOffset;
+    TexCoord = (push.texCoordSet == 1 ? aTexCoord2 : aTexCoord) + uvOff;
 
-    InstanceOrigin = push.model[3].xyz;
+    InstanceOrigin = model[3].xyz;
     ModelHeight = pos.z;
-    vFadeAlpha = push.fadeAlpha;
+    vFadeAlpha = fade;
 
     gl_Position = projection * view * worldPos;
 }
diff --git a/assets/shaders/m2.vert.spv b/assets/shaders/m2.vert.spv
index 8397440f..11364e67 100644
Binary files a/assets/shaders/m2.vert.spv and b/assets/shaders/m2.vert.spv differ
diff --git a/assets/shaders/m2_cull.comp.glsl b/assets/shaders/m2_cull.comp.glsl
new file mode 100644
index 00000000..831a521e
--- /dev/null
+++ b/assets/shaders/m2_cull.comp.glsl
@@ -0,0 +1,76 @@
+#version 450
+
+// Phase 2.3: GPU Frustum Culling for M2 doodads
+// Each compute thread tests one M2 instance against 6 frustum planes.
+// Input:  per-instance bounding sphere + flags.
+// Output: uint visibility array (1 = visible, 0 = culled).
+
+layout(local_size_x = 64) in;
+
+// Per-instance cull data (uploaded from CPU each frame); C++ mirror is CullInstanceGPU
+struct CullInstance {
+    vec4  sphere;              // xyz = world position, w = padded radius
+    float effectiveMaxDistSq;  // adaptive distance cull threshold (squared distance)
+    uint  flags;               // bit 0 = valid, bit 1 = smoke, bit 2 = invisibleTrap
+    float _pad0;               // unused padding (never read by main())
+    float _pad1;               // unused padding (never read by main())
+};
+
+layout(std140, set = 0, binding = 0) uniform CullUniforms {
+    vec4  frustumPlanes[6]; // xyz = normal, w = distance
+    vec4  cameraPos;        // xyz = camera position, w = maxPossibleDistSq
+    uint  instanceCount;    // number of cullInstances[] entries to test; invocations with id >= this exit early
+    uint  _pad0;            // unused padding
+    uint  _pad1;            // unused padding
+    uint  _pad2;            // unused padding
+};
+
+layout(std430, set = 0, binding = 1) readonly buffer CullInput {
+    CullInstance cullInstances[];
+};
+
+layout(std430, set = 0, binding = 2) writeonly buffer CullOutput {
+    uint visibility[];
+};
+
+void main() {
+    uint id = gl_GlobalInvocationID.x;  // one invocation per instance
+    if (id >= instanceCount) return;    // out-of-range invocations write nothing
+
+    CullInstance inst = cullInstances[id];
+
+    // Flag check: must be valid (bit 0) and neither smoke nor invisible trap (mask 6u = bits 1|2)
+    uint f = inst.flags;
+    if ((f & 1u) == 0u || (f & 6u) != 0u) {
+        visibility[id] = 0u;
+        return;
+    }
+
+    // Early distance rejection (loose upper bound): cameraPos.w carries maxPossibleDistSq
+    vec3 toCam = inst.sphere.xyz - cameraPos.xyz;
+    float distSq = dot(toCam, toCam);
+    if (distSq > cameraPos.w) {
+        visibility[id] = 0u;
+        return;
+    }
+
+    // Accurate per-instance distance cull against this instance's own squared threshold
+    if (distSq > inst.effectiveMaxDistSq) {
+        visibility[id] = 0u;
+        return;
+    }
+
+    // Frustum cull: sphere vs 6 planes; a radius <= 0 skips this test and the instance stays visible
+    float radius = inst.sphere.w;
+    if (radius > 0.0) {
+        for (int i = 0; i < 6; i++) {
+            float d = dot(frustumPlanes[i].xyz, inst.sphere.xyz) + frustumPlanes[i].w;
+            if (d < -radius) {  // centre is more than one radius behind plane i (assumes normalized plane normals)
+                visibility[id] = 0u;
+                return;
+            }
+        }
+    }
+
+    visibility[id] = 1u;  // survived the flag, distance and frustum tests
+}
diff --git a/assets/shaders/m2_cull.comp.spv b/assets/shaders/m2_cull.comp.spv
new file mode 100644
index 00000000..ef1a08fd
Binary files /dev/null and b/assets/shaders/m2_cull.comp.spv differ
diff --git a/include/rendering/camera.hpp b/include/rendering/camera.hpp
index ee58c8f2..ed4732f2 100644
--- a/include/rendering/camera.hpp
+++ b/include/rendering/camera.hpp
@@ -51,7 +51,7 @@ private:
     float pitch = 0.0f;
     float fov = 45.0f;
     float aspectRatio = 16.0f / 9.0f;
-    float nearPlane = 0.05f;
+    float nearPlane = 0.5f;
     float farPlane = 30000.0f;   // Improves depth precision vs extremely large far clip
 
     glm::mat4 viewMatrix = glm::mat4(1.0f);
diff --git a/include/rendering/m2_renderer.hpp b/include/rendering/m2_renderer.hpp
index dbeeeae8..0acd9972 100644
--- a/include/rendering/m2_renderer.hpp
+++ b/include/rendering/m2_renderer.hpp
@@ -219,12 +219,15 @@ struct M2Instance {
     uint8_t frameSkipCounter = 0;
     bool bonesDirty[2] = {false, false};  // Per-frame-index: set when bones recomputed, cleared after upload
 
-    // Per-instance bone SSBO (double-buffered)
+    // Per-instance bone SSBO (double-buffered) — legacy; see mega bone SSBO in M2Renderer
     ::VkBuffer boneBuffer[2] = {};
     VmaAllocation boneAlloc[2] = {};
     void* boneMapped[2] = {};
     VkDescriptorSet boneSet[2] = {};
 
+    // Mega bone SSBO offset — base bone index for this instance (set per-frame in prepareRender)
+    uint32_t megaBoneOffset = 0;
+
     void updateModelMatrix();
 };
 
@@ -292,6 +295,8 @@ public:
      */
     /** Pre-allocate GPU resources (bone SSBOs, descriptors) on main thread before parallel render. */
     void prepareRender(uint32_t frameIndex, const Camera& camera);
+    /** Phase 2.3: Dispatch GPU frustum culling compute shader on primary cmd before render pass. */
+    void dispatchCullCompute(VkCommandBuffer cmd, uint32_t frameIndex, const Camera& camera);
     void render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera);
 
     /**
@@ -425,6 +430,65 @@ private:
     VmaAllocation dummyBoneAlloc_ = VK_NULL_HANDLE;
     VkDescriptorSet dummyBoneSet_ = VK_NULL_HANDLE;
 
+    // Mega bone SSBO — consolidates all per-instance bone matrices into a single buffer per frame.
+    // Replaces per-instance bone SSBOs for fewer descriptor binds and enables GPU instancing.
+    static constexpr uint32_t MEGA_BONE_MAX_INSTANCES = 2048;
+    static constexpr uint32_t MAX_BONES_PER_INSTANCE = 128;
+    ::VkBuffer megaBoneBuffer_[2] = {};
+    VmaAllocation megaBoneAlloc_[2] = {};
+    void* megaBoneMapped_[2] = {};
+    VkDescriptorSet megaBoneSet_[2] = {};
+
+    // Phase 2.1: GPU instance data SSBO — per-instance transforms, fade, bones for instanced draws.
+    // Shader reads instanceData[push.instanceDataOffset + gl_InstanceIndex].
+    struct M2InstanceGPU {
+        glm::mat4 model;           // 64 bytes @ offset 0
+        glm::vec2 uvOffset;        //  8 bytes @ offset 64
+        float fadeAlpha;           //  4 bytes @ offset 72
+        int32_t useBones;          //  4 bytes @ offset 76
+        int32_t boneBase;          //  4 bytes @ offset 80
+        int32_t _pad[3] = {};      // 12 bytes @ offset 84 — align to 96 (std430)
+    };
+    static constexpr uint32_t MAX_INSTANCE_DATA = 16384;
+    VkDescriptorSetLayout instanceSetLayout_ = VK_NULL_HANDLE;
+    VkDescriptorPool instanceDescPool_ = VK_NULL_HANDLE;
+    ::VkBuffer instanceBuffer_[2] = {};
+    VmaAllocation instanceAlloc_[2] = {};
+    void* instanceMapped_[2] = {};
+    VkDescriptorSet instanceSet_[2] = {};
+    uint32_t instanceDataCount_ = 0; // reset each frame in render()
+
+    // Phase 2.3: GPU Frustum Culling via Compute Shader
+    // Compute shader tests each M2 instance against frustum planes + distance, writes visibility[].
+    // CPU reads back visibility to build sortedVisible_ without per-instance frustum/distance tests.
+    struct CullInstanceGPU {        // matches CullInstance in m2_cull.comp.glsl (32 bytes, std430)
+        glm::vec4 sphere;           // xyz = world position, w = padded radius
+        float effectiveMaxDistSq;   // adaptive distance cull threshold
+        uint32_t flags;             // bit 0 = valid, bit 1 = smoke, bit 2 = invisibleTrap
+        float _pad[2] = {};
+    };
+    struct CullUniformsGPU {        // matches CullUniforms in m2_cull.comp.glsl (128 bytes, std140)
+        glm::vec4 frustumPlanes[6]; // xyz = normal, w = distance
+        glm::vec4 cameraPos;        // xyz = camera position, w = maxPossibleDistSq
+        uint32_t instanceCount;
+        uint32_t _pad[3] = {};
+    };
+    static constexpr uint32_t MAX_CULL_INSTANCES = 16384;
+    VkPipeline cullPipeline_ = VK_NULL_HANDLE;
+    VkPipelineLayout cullPipelineLayout_ = VK_NULL_HANDLE;
+    VkDescriptorSetLayout cullSetLayout_ = VK_NULL_HANDLE;
+    VkDescriptorPool cullDescPool_ = VK_NULL_HANDLE;
+    VkDescriptorSet cullSet_[2] = {};               // double-buffered
+    ::VkBuffer cullUniformBuffer_[2] = {};           // frustum planes + camera (UBO)
+    VmaAllocation cullUniformAlloc_[2] = {};
+    void* cullUniformMapped_[2] = {};
+    ::VkBuffer cullInputBuffer_[2] = {};             // per-instance bounding sphere + flags (SSBO)
+    VmaAllocation cullInputAlloc_[2] = {};
+    void* cullInputMapped_[2] = {};
+    ::VkBuffer cullOutputBuffer_[2] = {};            // uint visibility[] (SSBO, host-readable)
+    VmaAllocation cullOutputAlloc_[2] = {};
+    void* cullOutputMapped_[2] = {};
+
     // Dynamic ribbon vertex buffer (CPU-written triangle strip)
     static constexpr size_t MAX_RIBBON_VERTS = 2048;  // 9 floats each
     ::VkBuffer ribbonVB_ = VK_NULL_HANDLE;
diff --git a/include/rendering/render_graph.hpp b/include/rendering/render_graph.hpp
new file mode 100644
index 00000000..39ea34bd
--- /dev/null
+++ b/include/rendering/render_graph.hpp
@@ -0,0 +1,117 @@
+#pragma once
+
+#include <vulkan/vulkan.h>
+#include <string>
+#include <vector>
+#include <functional>
+#include <cstdint>
+
+namespace wowee {
+namespace rendering {
+
+// Phase 2.5: Lightweight Render Graph / Frame Graph
+// Converts hardcoded pass sequence (shadow → reflection → compute cull →
+// main → post-process → ImGui → present) into declarative graph nodes.
+// Graph auto-inserts VkImageMemoryBarrier between passes.
+
+// Resource handle — identifies a virtual resource (image or buffer) within the graph.
+struct RGResource {
+    uint32_t id = UINT32_MAX;
+    bool valid() const { return id != UINT32_MAX; }
+};
+
+// Image barrier descriptor for automatic synchronization between passes (all fields default to null/zero).
+struct RGImageBarrier {
+    VkImage image = VK_NULL_HANDLE;                        // image the barrier applies to
+    VkImageLayout oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;   // layout before the transition
+    VkImageLayout newLayout = VK_IMAGE_LAYOUT_UNDEFINED;   // layout after the transition
+    VkAccessFlags srcAccess = 0;                           // accesses that must complete first
+    VkAccessFlags dstAccess = 0;                           // accesses that wait on the barrier
+    VkPipelineStageFlags srcStage = 0;                     // producing pipeline stage(s)
+    VkPipelineStageFlags dstStage = 0;                     // consuming pipeline stage(s)
+    VkImageAspectFlags aspectMask = 0;                     // image aspect(s) covered (color/depth/...)
+};
+
+// Buffer barrier descriptor for automatic synchronization between passes (all fields default to null/zero).
+struct RGBufferBarrier {
+    VkBuffer buffer = VK_NULL_HANDLE;      // buffer the barrier applies to
+    VkDeviceSize offset = 0;               // start of the affected range, in bytes
+    VkDeviceSize size = 0;                 // length of the affected range, in bytes (0 = not yet set)
+    VkAccessFlags srcAccess = 0;           // accesses that must complete first
+    VkAccessFlags dstAccess = 0;           // accesses that wait on the barrier
+    VkPipelineStageFlags srcStage = 0;     // producing pipeline stage(s)
+    VkPipelineStageFlags dstStage = 0;     // consuming pipeline stage(s)
+};
+
+// Render pass node — wraps an execution callback with declared inputs/outputs.
+struct RGPass {
+    std::string name;
+    std::vector<RGResource> inputs;
+    std::vector<RGResource> outputs;
+    std::function<void(VkCommandBuffer cmd)> execute;
+    bool enabled = true; // Can be dynamically disabled per-frame
+
+    // Barriers to insert before this pass executes
+    std::vector<RGImageBarrier> imageBarriers;
+    std::vector<RGBufferBarrier> bufferBarriers;
+};
+
+class RenderGraph {
+public:
+    RenderGraph() = default;
+    ~RenderGraph() = default;
+
+    // Reset graph for a new frame (clears passes, keeps resource registry).
+    void reset();
+
+    // Register a virtual resource (returns handle for input/output declarations).
+    RGResource registerResource(const std::string& name);
+
+    // Look up a previously registered resource by name.
+    RGResource findResource(const std::string& name) const;
+
+    // Add a render pass node.
+    // inputs: resources this pass reads from
+    // outputs: resources this pass writes to
+    // execute: callback invoked with the frame's command buffer
+    void addPass(const std::string& name,
+                 const std::vector<RGResource>& inputs,
+                 const std::vector<RGResource>& outputs,
+                 std::function<void(VkCommandBuffer cmd)> execute);
+
+    // Enable/disable a pass by name (for dynamic toggling, e.g. shadows off).
+    void setPassEnabled(const std::string& name, bool enabled);
+
+    // Compile: topological sort by dependency order, insert barriers.
+    // Must be called after all addPass() calls and before execute().
+    void compile();
+
+    // Execute all enabled passes in compiled order on the given command buffer.
+    void execute(VkCommandBuffer cmd);
+
+    // Query: compiled execution order as indices into getPasses() (use passes[i].name for a debug HUD).
+    const std::vector<uint32_t>& getExecutionOrder() const { return executionOrder_; }
+    const std::vector<RGPass>& getPasses() const { return passes_; }
+
+private:
+    // Topological sort helper (Kahn's algorithm).
+    void topologicalSort();
+
+    // Resource registry: name → id
+    struct ResourceEntry {
+        std::string name;
+        uint32_t id;
+    };
+    std::vector<ResourceEntry> resources_;
+    uint32_t nextResourceId_ = 0;
+
+    // Pass storage
+    std::vector<RGPass> passes_;
+
+    // Compiled execution order (indices into passes_)
+    std::vector<uint32_t> executionOrder_;
+    bool compiled_ = false;
+};
+
+} // namespace rendering
+} // namespace wowee
diff --git a/include/rendering/renderer.hpp b/include/rendering/renderer.hpp
index 54372da9..a4d075e9 100644
--- a/include/rendering/renderer.hpp
+++ b/include/rendering/renderer.hpp
@@ -56,6 +56,7 @@ class AnimationController;
 class LevelUpEffect;
 class ChargeEffect;
 class SwimEffects;
+class RenderGraph;
 
 class Renderer {
 public:
@@ -433,6 +434,10 @@ private:
 
     bool ghostMode_ = false;  // set each frame from gameHandler->isPlayerGhost()
 
+    // Phase 2.5: Render Graph — declarative pass ordering with automatic barriers
+    std::unique_ptr<RenderGraph> renderGraph_;
+    void buildFrameGraph(game::GameHandler* gameHandler);
+
     // CPU timing stats (last frame/update).
     double lastUpdateMs = 0.0;
     double lastRenderMs = 0.0;
diff --git a/include/rendering/terrain_manager.hpp b/include/rendering/terrain_manager.hpp
index 50c09680..59c9c4e2 100644
--- a/include/rendering/terrain_manager.hpp
+++ b/include/rendering/terrain_manager.hpp
@@ -346,8 +346,8 @@ private:
 
     // Streaming parameters
     bool streamingEnabled = true;
-    int loadRadius = 4;      // Load tiles within this radius (9x9 grid = 81 tiles)
-    int unloadRadius = 7;    // Unload tiles beyond this radius
+    int loadRadius = 6;      // Load tiles within this radius (13x13 grid = 169 tiles)
+    int unloadRadius = 9;    // Unload tiles beyond this radius
     float updateInterval = 0.033f;  // Check streaming every 33ms (~30 fps)
     float timeSinceLastUpdate = 0.0f;
     float proactiveStreamTimer_ = 0.0f;
diff --git a/include/rendering/terrain_renderer.hpp b/include/rendering/terrain_renderer.hpp
index 5bc13252..24fa1955 100644
--- a/include/rendering/terrain_renderer.hpp
+++ b/include/rendering/terrain_renderer.hpp
@@ -60,6 +60,11 @@ struct TerrainChunkGPU {
     float boundingSphereRadius = 0.0f;
     glm::vec3 boundingSphereCenter = glm::vec3(0.0f);
 
+    // Phase 2.2: Offsets into mega buffers for indirect drawing (-1 = not in mega buffer)
+    int32_t megaBaseVertex = -1;
+    uint32_t megaFirstIndex = 0;
+    uint32_t vertexCount = 0;
+
     bool isValid() const { return vertexBuffer != VK_NULL_HANDLE && indexBuffer != VK_NULL_HANDLE; }
 };
 
@@ -200,6 +205,25 @@ private:
     bool fogEnabled = true;
     int renderedChunks = 0;
     int culledChunks = 0;
+
+    // Phase 2.2: Mega vertex/index buffers for indirect drawing
+    // All terrain chunks share a single VB + IB, eliminating per-chunk rebinds.
+    // Indirect draw commands are built CPU-side each frame for visible chunks.
+    VkBuffer megaVB_ = VK_NULL_HANDLE;
+    VmaAllocation megaVBAlloc_ = VK_NULL_HANDLE;
+    void* megaVBMapped_ = nullptr;
+    VkBuffer megaIB_ = VK_NULL_HANDLE;
+    VmaAllocation megaIBAlloc_ = VK_NULL_HANDLE;
+    void* megaIBMapped_ = nullptr;
+    uint32_t megaVBUsed_ = 0;  // vertices used
+    uint32_t megaIBUsed_ = 0;  // indices used
+    static constexpr uint32_t MEGA_VB_MAX_VERTS   = 1536 * 1024; // ~1.57M verts × 44B ≈ 66 MiB (44B/vertex per original note — confirm against vertex layout)
+    static constexpr uint32_t MEGA_IB_MAX_INDICES  = 6 * 1024 * 1024; // 6M indices × 4B = 24 MiB
+
+    VkBuffer indirectBuffer_ = VK_NULL_HANDLE;
+    VmaAllocation indirectAlloc_ = VK_NULL_HANDLE;
+    void* indirectMapped_ = nullptr;
+    static constexpr uint32_t MAX_INDIRECT_DRAWS = 8192;
 };
 
 } // namespace rendering
diff --git a/include/rendering/vk_pipeline.hpp b/include/rendering/vk_pipeline.hpp
index e95337f8..e53229e3 100644
--- a/include/rendering/vk_pipeline.hpp
+++ b/include/rendering/vk_pipeline.hpp
@@ -75,6 +75,10 @@ public:
     // Dynamic state
     PipelineBuilder& setDynamicStates(const std::vector<VkDynamicState>& states);
 
+    // Pipeline derivatives — hint driver to share compiled state between similar pipelines
+    PipelineBuilder& setFlags(VkPipelineCreateFlags flags);
+    PipelineBuilder& setBasePipeline(VkPipeline basePipeline);
+
     // Build the pipeline (pass a VkPipelineCache for faster creation)
     VkPipeline build(VkDevice device, VkPipelineCache cache = VK_NULL_HANDLE) const;
 
@@ -106,6 +110,8 @@ private:
     VkRenderPass renderPass_ = VK_NULL_HANDLE;
     uint32_t subpass_ = 0;
     std::vector<VkDynamicState> dynamicStates_;
+    VkPipelineCreateFlags flags_ = 0;
+    VkPipeline basePipelineHandle_ = VK_NULL_HANDLE;
 };
 
 // Helper to create a pipeline layout from descriptor set layouts and push constant ranges
diff --git a/src/core/world_loader.cpp b/src/core/world_loader.cpp
index 9e90e747..4e967b18 100644
--- a/src/core/world_loader.cpp
+++ b/src/core/world_loader.cpp
@@ -734,9 +734,9 @@ void WorldLoader::loadOnlineWorldTerrain(uint32_t mapId, float x, float y, float
             // Use a small radius for the initial load (just immediate tiles),
             // then restore the full radius after entering the game.
             // This matches WoW's behavior: load quickly, stream the rest in-game.
-            const int savedLoadRadius = 4;
-            terrainMgr->setLoadRadius(3);   // 7x7=49 tiles — prevents hitches on spawn
-            terrainMgr->setUnloadRadius(7);
+            const int savedLoadRadius = 6;
+            terrainMgr->setLoadRadius(4);   // 9x9=81 tiles — prevents hitches on spawn
+            terrainMgr->setUnloadRadius(9);
 
             // Trigger tile streaming for surrounding area
             terrainMgr->update(*camera, 1.0f);
diff --git a/src/main.cpp b/src/main.cpp
index a4481a9f..7ac84715 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -111,13 +111,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) {
         _NSGetExecutablePath(nullptr, &bufSize);
         std::string exePath(bufSize, '\0');
         _NSGetExecutablePath(exePath.data(), &bufSize);
-        chdir(dirname(exePath.data()));
+        if (chdir(dirname(exePath.data())) != 0) {}
     }
 #elif defined(__linux__)
     {
         char buf[4096];
         ssize_t len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
-        if (len > 0) { buf[len] = '\0'; chdir(dirname(buf)); }
+        if (len > 0) { buf[len] = '\0'; if (chdir(dirname(buf)) != 0) {} }
     }
 #endif
 
diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp
index d87a6844..8fccc598 100644
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@@ -349,6 +349,20 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout
         vkCreateDescriptorSetLayout(device, &ci, nullptr, &boneSetLayout_);
     }
 
+    // Phase 2.1: Instance data set layout (set 3): binding 0 = STORAGE_BUFFER (per-instance data)
+    {
+        VkDescriptorSetLayoutBinding binding{};
+        binding.binding = 0;
+        binding.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+        binding.descriptorCount = 1;
+        binding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
+
+        VkDescriptorSetLayoutCreateInfo ci{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO};
+        ci.bindingCount = 1;
+        ci.pBindings = &binding;
+        vkCreateDescriptorSetLayout(device, &ci, nullptr, &instanceSetLayout_);
+    }
+
     // Particle texture set layout (set 1 for particles): binding 0 = sampler2D
     {
         VkDescriptorSetLayoutBinding binding{};
@@ -423,19 +437,244 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout
         }
     }
 
+    // Mega bone SSBO — consolidates all animated instance bones into one buffer per frame.
+    // Slot 0 = identity matrix (for non-animated instances), slots 1..N = animated instances.
+    {
+        const VkDeviceSize megaSize = MEGA_BONE_MAX_INSTANCES * MAX_BONES_PER_INSTANCE * sizeof(glm::mat4);
+        glm::mat4 identity(1.0f);
+        for (int i = 0; i < 2; i++) {
+            VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
+            bci.size = megaSize;
+            bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+            VmaAllocationCreateInfo aci{};
+            aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+            aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+            VmaAllocationInfo allocInfo{};
+            vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
+                            &megaBoneBuffer_[i], &megaBoneAlloc_[i], &allocInfo);
+            megaBoneMapped_[i] = allocInfo.pMappedData;
+
+            // Slot 0: identity matrix (for non-animated instances)
+            if (megaBoneMapped_[i]) {
+                memcpy(megaBoneMapped_[i], &identity, sizeof(identity));
+            }
+
+            megaBoneSet_[i] = allocateBoneSet();
+            if (megaBoneSet_[i]) {
+                VkDescriptorBufferInfo bufInfo{};
+                bufInfo.buffer = megaBoneBuffer_[i];
+                bufInfo.offset = 0;
+                bufInfo.range = megaSize;
+                VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
+                write.dstSet = megaBoneSet_[i];
+                write.dstBinding = 0;
+                write.descriptorCount = 1;
+                write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+                write.pBufferInfo = &bufInfo;
+                vkUpdateDescriptorSets(device, 1, &write, 0, nullptr);
+            }
+        }
+    }
+
+    // Phase 2.1: Instance data SSBO — per-frame buffer holding per-instance transforms, fade, bones.
+    // Shader reads instanceData[push.instanceDataOffset + gl_InstanceIndex].
+    {
+        static_assert(sizeof(M2InstanceGPU) == 96, "M2InstanceGPU must be 96 bytes (std430)");
+        const VkDeviceSize instBufSize = MAX_INSTANCE_DATA * sizeof(M2InstanceGPU);
+
+        // Descriptor pool for 2 sets (double-buffered)
+        VkDescriptorPoolSize poolSize{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2};
+        VkDescriptorPoolCreateInfo poolCi{VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO};
+        poolCi.maxSets = 2;
+        poolCi.poolSizeCount = 1;
+        poolCi.pPoolSizes = &poolSize;
+        vkCreateDescriptorPool(device, &poolCi, nullptr, &instanceDescPool_);
+
+        for (int i = 0; i < 2; i++) {
+            VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
+            bci.size = instBufSize;
+            bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+            VmaAllocationCreateInfo aci{};
+            aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+            aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+            VmaAllocationInfo allocInfo{};
+            vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
+                            &instanceBuffer_[i], &instanceAlloc_[i], &allocInfo);
+            instanceMapped_[i] = allocInfo.pMappedData;
+
+            VkDescriptorSetAllocateInfo setAi{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
+            setAi.descriptorPool = instanceDescPool_;
+            setAi.descriptorSetCount = 1;
+            setAi.pSetLayouts = &instanceSetLayout_;
+            vkAllocateDescriptorSets(device, &setAi, &instanceSet_[i]);
+
+            VkDescriptorBufferInfo bufInfo{};
+            bufInfo.buffer = instanceBuffer_[i];
+            bufInfo.offset = 0;
+            bufInfo.range = instBufSize;
+            VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
+            write.dstSet = instanceSet_[i];
+            write.dstBinding = 0;
+            write.descriptorCount = 1;
+            write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+            write.pBufferInfo = &bufInfo;
+            vkUpdateDescriptorSets(device, 1, &write, 0, nullptr);
+        }
+    }
+
+    // Phase 2.3: GPU frustum culling — compute pipeline, buffers, descriptors.
+    // Compute shader tests each instance bounding sphere against 6 frustum planes + distance.
+    // Output: uint visibility[] read back by CPU to skip culled instances in sortedVisible_ build.
+    {
+        static_assert(sizeof(CullInstanceGPU) == 32, "CullInstanceGPU must be 32 bytes (std430)");
+        static_assert(sizeof(CullUniformsGPU) == 128, "CullUniformsGPU must be 128 bytes (std140)");
+
+        // Descriptor set layout: binding 0 = UBO (frustum+camera), 1 = SSBO (input), 2 = SSBO (output)
+        VkDescriptorSetLayoutBinding bindings[3] = {};
+        bindings[0].binding = 0;
+        bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+        bindings[0].descriptorCount = 1;
+        bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+        bindings[1].binding = 1;
+        bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+        bindings[1].descriptorCount = 1;
+        bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+        bindings[2].binding = 2;
+        bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+        bindings[2].descriptorCount = 1;
+        bindings[2].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+        VkDescriptorSetLayoutCreateInfo layoutCi{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO};
+        layoutCi.bindingCount = 3;
+        layoutCi.pBindings = bindings;
+        vkCreateDescriptorSetLayout(device, &layoutCi, nullptr, &cullSetLayout_);
+
+        // Pipeline layout (no push constants — everything via UBO)
+        VkPipelineLayoutCreateInfo plCi{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO};
+        plCi.setLayoutCount = 1;
+        plCi.pSetLayouts = &cullSetLayout_;
+        vkCreatePipelineLayout(device, &plCi, nullptr, &cullPipelineLayout_);
+
+        // Load compute shader
+        rendering::VkShaderModule cullComp;
+        if (!cullComp.loadFromFile(device, "assets/shaders/m2_cull.comp.spv")) {
+            LOG_ERROR("M2Renderer: failed to load m2_cull.comp.spv — GPU culling disabled");
+        } else {
+            VkComputePipelineCreateInfo cpCi{VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO};
+            cpCi.stage = cullComp.stageInfo(VK_SHADER_STAGE_COMPUTE_BIT);
+            cpCi.layout = cullPipelineLayout_;
+            if (vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, &cpCi, nullptr, &cullPipeline_) != VK_SUCCESS) {
+                LOG_ERROR("M2Renderer: failed to create cull compute pipeline");
+                cullPipeline_ = VK_NULL_HANDLE;
+            }
+            cullComp.destroy();
+        }
+
+        // Descriptor pool: 2 sets × 3 descriptors each (1 UBO + 2 SSBO)
+        VkDescriptorPoolSize poolSizes[2] = {};
+        poolSizes[0] = {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 2};
+        poolSizes[1] = {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 4};  // 2 input + 2 output
+        VkDescriptorPoolCreateInfo poolCi{VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO};
+        poolCi.maxSets = 2;
+        poolCi.poolSizeCount = 2;
+        poolCi.pPoolSizes = poolSizes;
+        vkCreateDescriptorPool(device, &poolCi, nullptr, &cullDescPool_);
+
+        const VkDeviceSize uniformSize = sizeof(CullUniformsGPU);
+        const VkDeviceSize inputSize   = MAX_CULL_INSTANCES * sizeof(CullInstanceGPU);
+        const VkDeviceSize outputSize  = MAX_CULL_INSTANCES * sizeof(uint32_t);
+
+        for (int i = 0; i < 2; i++) {
+            // Uniform buffer (frustum planes + camera)
+            {
+                VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
+                bci.size = uniformSize;
+                bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+                VmaAllocationCreateInfo aci{};
+                aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+                aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+                VmaAllocationInfo ai{};
+                vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
+                                &cullUniformBuffer_[i], &cullUniformAlloc_[i], &ai);
+                cullUniformMapped_[i] = ai.pMappedData;
+            }
+            // Input SSBO (per-instance cull data)
+            {
+                VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
+                bci.size = inputSize;
+                bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+                VmaAllocationCreateInfo aci{};
+                aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+                aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+                VmaAllocationInfo ai{};
+                vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
+                                &cullInputBuffer_[i], &cullInputAlloc_[i], &ai);
+                cullInputMapped_[i] = ai.pMappedData;
+            }
+            // Output SSBO (visibility flags — GPU writes, CPU reads)
+            {
+                VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
+                bci.size = outputSize;
+                bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+                VmaAllocationCreateInfo aci{};
+                aci.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
+                aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+                VmaAllocationInfo ai{};
+                vmaCreateBuffer(ctx->getAllocator(), &bci, &aci,
+                                &cullOutputBuffer_[i], &cullOutputAlloc_[i], &ai);
+                cullOutputMapped_[i] = ai.pMappedData;
+            }
+
+            // Allocate and write descriptor set
+            VkDescriptorSetAllocateInfo setAi{VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO};
+            setAi.descriptorPool = cullDescPool_;
+            setAi.descriptorSetCount = 1;
+            setAi.pSetLayouts = &cullSetLayout_;
+            vkAllocateDescriptorSets(device, &setAi, &cullSet_[i]);
+
+            VkDescriptorBufferInfo uboInfo{cullUniformBuffer_[i], 0, uniformSize};
+            VkDescriptorBufferInfo inputInfo{cullInputBuffer_[i], 0, inputSize};
+            VkDescriptorBufferInfo outputInfo{cullOutputBuffer_[i], 0, outputSize};
+
+            VkWriteDescriptorSet writes[3] = {};
+            writes[0] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
+            writes[0].dstSet = cullSet_[i];
+            writes[0].dstBinding = 0;
+            writes[0].descriptorCount = 1;
+            writes[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+            writes[0].pBufferInfo = &uboInfo;
+
+            writes[1] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
+            writes[1].dstSet = cullSet_[i];
+            writes[1].dstBinding = 1;
+            writes[1].descriptorCount = 1;
+            writes[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+            writes[1].pBufferInfo = &inputInfo;
+
+            writes[2] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
+            writes[2].dstSet = cullSet_[i];
+            writes[2].dstBinding = 2;
+            writes[2].descriptorCount = 1;
+            writes[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+            writes[2].pBufferInfo = &outputInfo;
+
+            vkUpdateDescriptorSets(device, 3, writes, 0, nullptr);
+        }
+    }
+
     // --- Pipeline layouts ---
 
-    // Main M2 pipeline layout: set 0 = perFrame, set 1 = material, set 2 = bones
-    // Push constant: mat4 model + vec2 uvOffset + int texCoordSet + int useBones = 80 bytes
+    // Main M2 pipeline layout: set 0 = perFrame, set 1 = material, set 2 = bones, set 3 = instances
+    // Push constant: int texCoordSet + int isFoliage + int instanceDataOffset (12 bytes)
     {
-        VkDescriptorSetLayout setLayouts[] = {perFrameLayout, materialSetLayout_, boneSetLayout_};
+        VkDescriptorSetLayout setLayouts[] = {perFrameLayout, materialSetLayout_, boneSetLayout_, instanceSetLayout_};
         VkPushConstantRange pushRange{};
         pushRange.stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
         pushRange.offset = 0;
-        pushRange.size = 88; // mat4(64) + vec2(8) + int(4) + int(4) + int(4) + float(4)
+        pushRange.size = 12; // int texCoordSet + int isFoliage + int instanceDataOffset
 
         VkPipelineLayoutCreateInfo ci{VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO};
-        ci.setLayoutCount = 3;
+        ci.setLayoutCount = 4;
         ci.pSetLayouts = setLayouts;
         ci.pushConstantRangeCount = 1;
         ci.pPushConstantRanges = &pushRange;
@@ -513,7 +752,9 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout
         {4, 0, VK_FORMAT_R32G32B32A32_SFLOAT, 14 * sizeof(float)}, // boneIndices (float)
     };
 
-    auto buildM2Pipeline = [&](VkPipelineColorBlendAttachmentState blendState, bool depthWrite) -> VkPipeline {
+    // Pipeline derivatives — opaque is the base, others derive from it for shared state optimization
+    auto buildM2Pipeline = [&](VkPipelineColorBlendAttachmentState blendState, bool depthWrite,
+                               VkPipelineCreateFlags flags = 0, VkPipeline basePipeline = VK_NULL_HANDLE) -> VkPipeline {
         return PipelineBuilder()
             .setShaders(m2Vert.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
                         m2Frag.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
@@ -526,13 +767,19 @@ bool M2Renderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayout
             .setLayout(pipelineLayout_)
             .setRenderPass(mainPass)
             .setDynamicStates({VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR})
+            .setFlags(flags)
+            .setBasePipeline(basePipeline)
             .build(device, vkCtx_->getPipelineCache());
     };
 
-    opaquePipeline_ = buildM2Pipeline(PipelineBuilder::blendDisabled(), true);
-    alphaTestPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), true);
-    alphaPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), false);
-    additivePipeline_ = buildM2Pipeline(PipelineBuilder::blendAdditive(), false);
+    opaquePipeline_ = buildM2Pipeline(PipelineBuilder::blendDisabled(), true,
+                                      VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT);
+    alphaTestPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), true,
+                                         VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_);
+    alphaPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), false,
+                                     VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_);
+    additivePipeline_ = buildM2Pipeline(PipelineBuilder::blendAdditive(), false,
+                                        VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_);
 
     // --- Build particle pipelines ---
     if (particleVert.isValid() && particleFrag.isValid()) {
@@ -805,10 +1052,38 @@ void M2Renderer::shutdown() {
     if (dummyBoneBuffer_) { vmaDestroyBuffer(alloc, dummyBoneBuffer_, dummyBoneAlloc_); dummyBoneBuffer_ = VK_NULL_HANDLE; }
     // dummyBoneSet_ is freed implicitly when boneDescPool_ is destroyed
     dummyBoneSet_ = VK_NULL_HANDLE;
+    // Mega bone SSBO cleanup (sets freed implicitly with boneDescPool_)
+    for (int i = 0; i < 2; i++) {
+        if (megaBoneBuffer_[i]) { vmaDestroyBuffer(alloc, megaBoneBuffer_[i], megaBoneAlloc_[i]); megaBoneBuffer_[i] = VK_NULL_HANDLE; }
+        megaBoneMapped_[i] = nullptr;
+        megaBoneSet_[i] = VK_NULL_HANDLE;
+    }
     if (materialDescPool_) { vkDestroyDescriptorPool(device, materialDescPool_, nullptr); materialDescPool_ = VK_NULL_HANDLE; }
     if (boneDescPool_) { vkDestroyDescriptorPool(device, boneDescPool_, nullptr); boneDescPool_ = VK_NULL_HANDLE; }
+    // Phase 2.1: Instance data SSBO cleanup (sets freed with instanceDescPool_)
+    for (int i = 0; i < 2; i++) {
+        if (instanceBuffer_[i]) { vmaDestroyBuffer(alloc, instanceBuffer_[i], instanceAlloc_[i]); instanceBuffer_[i] = VK_NULL_HANDLE; }
+        instanceMapped_[i] = nullptr;
+        instanceSet_[i] = VK_NULL_HANDLE;
+    }
+    if (instanceDescPool_) { vkDestroyDescriptorPool(device, instanceDescPool_, nullptr); instanceDescPool_ = VK_NULL_HANDLE; }
+
+    // Phase 2.3: GPU frustum culling compute pipeline + buffers cleanup
+    if (cullPipeline_) { vkDestroyPipeline(device, cullPipeline_, nullptr); cullPipeline_ = VK_NULL_HANDLE; }
+    if (cullPipelineLayout_) { vkDestroyPipelineLayout(device, cullPipelineLayout_, nullptr); cullPipelineLayout_ = VK_NULL_HANDLE; }
+    for (int i = 0; i < 2; i++) {
+        if (cullUniformBuffer_[i]) { vmaDestroyBuffer(alloc, cullUniformBuffer_[i], cullUniformAlloc_[i]); cullUniformBuffer_[i] = VK_NULL_HANDLE; }
+        if (cullInputBuffer_[i])   { vmaDestroyBuffer(alloc, cullInputBuffer_[i], cullInputAlloc_[i]); cullInputBuffer_[i] = VK_NULL_HANDLE; }
+        if (cullOutputBuffer_[i])  { vmaDestroyBuffer(alloc, cullOutputBuffer_[i], cullOutputAlloc_[i]); cullOutputBuffer_[i] = VK_NULL_HANDLE; }
+        cullUniformMapped_[i] = cullInputMapped_[i] = cullOutputMapped_[i] = nullptr;
+        cullSet_[i] = VK_NULL_HANDLE;
+    }
+    if (cullDescPool_) { vkDestroyDescriptorPool(device, cullDescPool_, nullptr); cullDescPool_ = VK_NULL_HANDLE; }
+    if (cullSetLayout_) { vkDestroyDescriptorSetLayout(device, cullSetLayout_, nullptr); cullSetLayout_ = VK_NULL_HANDLE; }
+
     if (materialSetLayout_) { vkDestroyDescriptorSetLayout(device, materialSetLayout_, nullptr); materialSetLayout_ = VK_NULL_HANDLE; }
     if (boneSetLayout_) { vkDestroyDescriptorSetLayout(device, boneSetLayout_, nullptr); boneSetLayout_ = VK_NULL_HANDLE; }
+    if (instanceSetLayout_) { vkDestroyDescriptorSetLayout(device, instanceSetLayout_, nullptr); instanceSetLayout_ = VK_NULL_HANDLE; }
     if (particleTexLayout_) { vkDestroyDescriptorSetLayout(device, particleTexLayout_, nullptr); particleTexLayout_ = VK_NULL_HANDLE; }
 
     // Destroy shadow resources
@@ -2212,47 +2487,117 @@ void M2Renderer::prepareRender(uint32_t frameIndex, const Camera& camera) {
     if (!initialized_ || instances.empty()) return;
     (void)camera;  // reserved for future frustum-based culling
 
-    // Pre-allocate bone SSBOs + descriptor sets on main thread (pool ops not thread-safe).
-    // Only iterate animated instances — static doodads don't need bone buffers.
+    // --- Mega bone SSBO: assign slots and upload all animated instance bones ---
+    // Slot 0 = identity (non-animated), slots 1..N = animated instances.
+    uint32_t nextSlot = 1;
     for (size_t idx : animatedInstanceIndices_) {
         if (idx >= instances.size()) continue;
         auto& instance = instances[idx];
 
-        if (instance.boneMatrices.empty()) continue;
+        if (instance.boneMatrices.empty()) {
+            instance.megaBoneOffset = 0;  // Use identity slot
+            continue;
+        }
 
-        if (!instance.boneBuffer[frameIndex]) {
-            VkBufferCreateInfo bci{VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
-            bci.size = 128 * sizeof(glm::mat4);
-            bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
-            VmaAllocationCreateInfo aci{};
-            aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
-            aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
-            VmaAllocationInfo allocInfo{};
-            vmaCreateBuffer(vkCtx_->getAllocator(), &bci, &aci,
-                            &instance.boneBuffer[frameIndex], &instance.boneAlloc[frameIndex], &allocInfo);
-            instance.boneMapped[frameIndex] = allocInfo.pMappedData;
+        if (nextSlot >= MEGA_BONE_MAX_INSTANCES) {
+            instance.megaBoneOffset = 0;  // Overflow — use identity
+            continue;
+        }
 
-            // Force dirty so current boneMatrices get copied into this
-            // newly-allocated buffer during render (prevents garbage/zero
-            // bones when the other frame index already cleared bonesDirty).
-            instance.bonesDirty[frameIndex] = true;
+        instance.megaBoneOffset = nextSlot * MAX_BONES_PER_INSTANCE;
 
-            instance.boneSet[frameIndex] = allocateBoneSet();
-            if (instance.boneSet[frameIndex]) {
-                VkDescriptorBufferInfo bufInfo{};
-                bufInfo.buffer = instance.boneBuffer[frameIndex];
-                bufInfo.offset = 0;
-                bufInfo.range = bci.size;
-                VkWriteDescriptorSet write{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET};
-                write.dstSet = instance.boneSet[frameIndex];
-                write.dstBinding = 0;
-                write.descriptorCount = 1;
-                write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
-                write.pBufferInfo = &bufInfo;
-                vkUpdateDescriptorSets(vkCtx_->getDevice(), 1, &write, 0, nullptr);
+        // Upload bone matrices to mega buffer
+        if (megaBoneMapped_[frameIndex]) {
+            int numBones = std::min(static_cast<int>(instance.boneMatrices.size()),
+                                    static_cast<int>(MAX_BONES_PER_INSTANCE));
+            auto* dst = static_cast<glm::mat4*>(megaBoneMapped_[frameIndex]) + instance.megaBoneOffset;
+            memcpy(dst, instance.boneMatrices.data(), numBones * sizeof(glm::mat4));
+        }
+
+        nextSlot++;
+    }
+}
+
+// Phase 2.3: Record the GPU frustum-culling compute dispatch for frame slot `frameIndex` into `cmd`.
+// Meant to be called on the primary command buffer BEFORE the render pass begins. It updates smoothedRenderDist_,
+// fills the mapped cull UBO and per-instance input SSBO, dispatches the shader, and records a compute->host barrier.
+// NOTE(review): render() reads the visibility output via a CPU mapping, so results exist only after `cmd` has executed on the GPU — confirm the frame sync.
+void M2Renderer::dispatchCullCompute(VkCommandBuffer cmd, uint32_t frameIndex, const Camera& camera) {
+    if (!cullPipeline_ || instances.empty()) return;  // nothing to record without the cull pipeline or any instances
+    // Only the first MAX_CULL_INSTANCES instances are uploaded and tested.
+    const uint32_t numInstances = std::min(static_cast<uint32_t>(instances.size()), MAX_CULL_INSTANCES);
+
+    // --- Adaptive render distance, smoothed across calls (same formula as the old CPU cull) ---
+    const float targetRenderDist = (instances.size() > 2000) ? 300.0f
+                                 : (instances.size() > 1000) ? 500.0f
+                                 : 1000.0f;
+    const float shrinkRate = 0.005f;  // slow decrease toward a smaller target
+    const float growRate   = 0.05f;  // faster increase toward a larger target
+    float blendRate = (targetRenderDist < smoothedRenderDist_) ? shrinkRate : growRate;
+    smoothedRenderDist_ = glm::mix(smoothedRenderDist_, targetRenderDist, blendRate);
+    const float maxRenderDistance = smoothedRenderDist_;
+    const float maxRenderDistanceSq = maxRenderDistance * maxRenderDistance;
+    const float maxPossibleDistSq = maxRenderDistanceSq * 4.0f; // 4x on the squared distance = 2x distance safety margin
+
+    // --- Upload frustum planes + camera (UBO, binding 0) ---
+    const glm::mat4 vp = camera.getProjectionMatrix() * camera.getViewMatrix();
+    Frustum frustum;
+    frustum.extractFromMatrix(vp);
+    const glm::vec3 camPos = camera.getPosition();
+
+    if (cullUniformMapped_[frameIndex]) {
+        auto* ubo = static_cast<CullUniformsGPU*>(cullUniformMapped_[frameIndex]);
+        for (int i = 0; i < 6; i++) {
+            const auto& p = frustum.getPlane(static_cast<Frustum::Side>(i));
+            ubo->frustumPlanes[i] = glm::vec4(p.normal, p.distance);  // xyz = plane normal, w = plane distance
+        }
+        ubo->cameraPos = glm::vec4(camPos, maxPossibleDistSq);  // xyz = camera position, w = maxPossibleDistSq
+        ubo->instanceCount = numInstances;
+    }
+
+    // --- Upload per-instance cull data (SSBO, binding 1) ---
+    if (cullInputMapped_[frameIndex]) {
+        auto* input = static_cast<CullInstanceGPU*>(cullInputMapped_[frameIndex]);
+        for (uint32_t i = 0; i < numInstances; i++) {
+            const auto& inst = instances[i];
+            float worldRadius = inst.cachedBoundRadius * inst.scale;
+            float cullRadius = worldRadius;
+            if (inst.cachedDisableAnimation) {
+                cullRadius = std::max(cullRadius, 3.0f);  // radius floor of 3.0 when animation is disabled
             }
+            float effectiveMaxDistSq = maxRenderDistanceSq * std::max(1.0f, cullRadius / 12.0f);  // grows for radii above 12
+            if (inst.cachedDisableAnimation)  effectiveMaxDistSq *= 2.6f;
+            if (inst.cachedIsGroundDetail)     effectiveMaxDistSq *= 0.9f;  // NOTE(review): the replaced CPU cull used 0.75 here — confirm intentional
+
+            float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f);  // padded by 50% or by 3.0, whichever is larger
+            // Flag bits packed for the cull shader: 1 = valid model, 2 = smoke, 4 = invisible trap.
+            uint32_t flags = 0;
+            if (inst.cachedIsValid)          flags |= 1u;
+            if (inst.cachedIsSmoke)           flags |= 2u;
+            if (inst.cachedIsInvisibleTrap)   flags |= 4u;
+
+            input[i].sphere = glm::vec4(inst.position, paddedRadius);  // xyz = instance position, w = padded radius
+            input[i].effectiveMaxDistSq = effectiveMaxDistSq;
+            input[i].flags = flags;
         }
     }
+
+    // --- Dispatch compute shader ---
+    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, cullPipeline_);
+    vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            cullPipelineLayout_, 0, 1, &cullSet_[frameIndex], 0, nullptr);
+    // One workgroup per 64 instances, rounded up; presumably matches the cull shader's local size — verify.
+    const uint32_t groupCount = (numInstances + 63) / 64;
+    vkCmdDispatch(cmd, groupCount, 1, 1);
+
+    // --- Memory barrier: compute shader writes → host reads of the output buffer ---
+    VkMemoryBarrier barrier{VK_STRUCTURE_TYPE_MEMORY_BARRIER};
+    barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
+    barrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT;
+    vkCmdPipelineBarrier(cmd,
+        VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+        VK_PIPELINE_STAGE_HOST_BIT,
+        0, 1, &barrier, 0, nullptr, 0, nullptr);
 }
 
 void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const Camera& camera) {
@@ -2267,71 +2612,86 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
         LOG_INFO("M2 render: ", instances.size(), " instances, ", models.size(), " models");
     }
 
-    // Build frustum for culling
-    const glm::mat4 view = camera.getViewMatrix();
-    const glm::mat4 projection = camera.getProjectionMatrix();
-    Frustum frustum;
-    frustum.extractFromMatrix(projection * view);
-
     // Reuse persistent buffers (clear instead of reallocating)
     glowSprites_.clear();
 
     lastDrawCallCount = 0;
 
-    // Adaptive render distance: smoothed to prevent pop-in/pop-out flickering
-    const float targetRenderDist = (instances.size() > 2000) ? 300.0f
-                                 : (instances.size() > 1000) ? 500.0f
-                                 : 1000.0f;
-    // Smooth transitions: shrink slowly (avoid popping out nearby objects)
-    const float shrinkRate = 0.005f;  // very slow decrease
-    const float growRate = 0.05f;     // faster increase
-    float blendRate = (targetRenderDist < smoothedRenderDist_) ? shrinkRate : growRate;
-    smoothedRenderDist_ = glm::mix(smoothedRenderDist_, targetRenderDist, blendRate);
-    const float maxRenderDistance = smoothedRenderDist_;
-    const float maxRenderDistanceSq = maxRenderDistance * maxRenderDistance;
+    // Phase 2.3: GPU cull results are read on the CPU from the mapped output buffer of the current frame slot.
+    // smoothedRenderDist_ is updated by dispatchCullCompute() when GPU culling runs; otherwise the fallback below updates it.
+    const uint32_t frameIndex = vkCtx_->getCurrentFrame();
+    const uint32_t numInstances = std::min(static_cast<uint32_t>(instances.size()), MAX_CULL_INSTANCES);
+    const uint32_t* visibility = static_cast<const uint32_t*>(cullOutputMapped_[frameIndex]);
+    const bool gpuCullAvailable = (cullPipeline_ != VK_NULL_HANDLE && visibility != nullptr);
+
+    // If GPU culling is unavailable (no cull pipeline or no mapped output buffer), fall back to computing the render distance on the CPU
+    float maxRenderDistanceSq;
+    if (!gpuCullAvailable) {
+        const float targetRenderDist = (instances.size() > 2000) ? 300.0f
+                                     : (instances.size() > 1000) ? 500.0f
+                                     : 1000.0f;
+        const float shrinkRate = 0.005f;
+        const float growRate = 0.05f;
+        float blendRate = (targetRenderDist < smoothedRenderDist_) ? shrinkRate : growRate;
+        smoothedRenderDist_ = glm::mix(smoothedRenderDist_, targetRenderDist, blendRate);
+        maxRenderDistanceSq = smoothedRenderDist_ * smoothedRenderDist_;
+    } else {
+        maxRenderDistanceSq = smoothedRenderDist_ * smoothedRenderDist_;
+    }
+
     const float fadeStartFraction = 0.75f;
     const glm::vec3 camPos = camera.getPosition();
 
-    // Build sorted visible instance list: cull then sort by modelId to batch VAO binds
-    // Reuse persistent vector to avoid allocation
+    // Build sorted visible instance list
     sortedVisible_.clear();
-    // Reserve based on expected visible count (roughly 30% of total instances in dense areas)
     const size_t expectedVisible = std::min(instances.size() / 3, size_t(600));
     if (sortedVisible_.capacity() < expectedVisible) {
         sortedVisible_.reserve(expectedVisible);
     }
 
-    // Early distance rejection: max possible render distance (tight but safe upper bound)
-    const float maxPossibleDistSq = maxRenderDistance * maxRenderDistance * 4.0f;  // 2x safety margin (reduced from 4x)
+    // Phase 2.3: GPU frustum culling — build frustum only for CPU fallback path
+    Frustum frustum;
+    if (!gpuCullAvailable) {
+        const glm::mat4 vp = camera.getProjectionMatrix() * camera.getViewMatrix();
+        frustum.extractFromMatrix(vp);
+    }
+    const float maxPossibleDistSq = maxRenderDistanceSq * 4.0f;
 
-    for (uint32_t i = 0; i < static_cast<uint32_t>(instances.size()); ++i) {
+    for (uint32_t i = 0; i < numInstances; ++i) {
         const auto& instance = instances[i];
 
-        // Use cached model flags — no hash lookup needed
-        if (!instance.cachedIsValid || instance.cachedIsSmoke || instance.cachedIsInvisibleTrap) continue;
+        if (gpuCullAvailable) {
+            // Phase 2.3: GPU already tested flags + distance + frustum
+            if (!visibility[i]) continue;
+        } else {
+            // CPU fallback: same tests as the pre-Phase-2.3 CPU cull, except the ground-detail distance factor is now 0.9 (was 0.75)
+            if (!instance.cachedIsValid || instance.cachedIsSmoke || instance.cachedIsInvisibleTrap) continue;
 
+            glm::vec3 toCam = instance.position - camPos;
+            float distSqTest = glm::dot(toCam, toCam);
+            if (distSqTest > maxPossibleDistSq) continue;
+
+            float worldRadius = instance.cachedBoundRadius * instance.scale;
+            float cullRadius = worldRadius;
+            if (instance.cachedDisableAnimation) cullRadius = std::max(cullRadius, 3.0f);
+            float effDistSq = maxRenderDistanceSq * std::max(1.0f, cullRadius / 12.0f);
+            if (instance.cachedDisableAnimation) effDistSq *= 2.6f;
+            if (instance.cachedIsGroundDetail) effDistSq *= 0.9f;
+            if (distSqTest > effDistSq) continue;
+
+            float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f);
+            if (cullRadius > 0.0f && !frustum.intersectsSphere(instance.position, paddedRadius)) continue;
+        }
+
+        // Compute distSq + effectiveMaxDistSq for sorting and fade alpha (cheap for visible-only)
         glm::vec3 toCam = instance.position - camPos;
         float distSq = glm::dot(toCam, toCam);
-        if (distSq > maxPossibleDistSq) continue;
-
         float worldRadius = instance.cachedBoundRadius * instance.scale;
         float cullRadius = worldRadius;
-        if (instance.cachedDisableAnimation) {
-            cullRadius = std::max(cullRadius, 3.0f);
-        }
+        if (instance.cachedDisableAnimation) cullRadius = std::max(cullRadius, 3.0f);
         float effectiveMaxDistSq = maxRenderDistanceSq * std::max(1.0f, cullRadius / 12.0f);
-        if (instance.cachedDisableAnimation) {
-            effectiveMaxDistSq *= 2.6f;
-        }
-        if (instance.cachedIsGroundDetail) {
-            effectiveMaxDistSq *= 0.75f;
-        }
-
-        if (distSq > effectiveMaxDistSq) continue;
-
-        // Frustum cull with padding
-        float paddedRadius = std::max(cullRadius * 1.5f, cullRadius + 3.0f);
-        if (cullRadius > 0.0f && !frustum.intersectsSphere(instance.position, paddedRadius)) continue;
+        if (instance.cachedDisableAnimation)  effectiveMaxDistSq *= 2.6f;
+        if (instance.cachedIsGroundDetail)     effectiveMaxDistSq *= 0.9f;
 
         sortedVisible_.push_back({i, instance.modelId, distSq, effectiveMaxDistSq});
     }
@@ -2351,17 +2711,12 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
     // State tracking
     VkPipeline currentPipeline = VK_NULL_HANDLE;
     VkDescriptorSet currentMaterialSet = VK_NULL_HANDLE;
-    VkDescriptorSet currentBoneSet = VK_NULL_HANDLE;
-    uint32_t frameIndex = vkCtx_->getCurrentFrame();
 
-    // Push constants struct matching m2.vert.glsl push_constant block
+    // Phase 2.1: Push constants now carry per-batch data only; per-instance data is in instance SSBO.
     struct M2PushConstants {
-        glm::mat4 model;
-        glm::vec2 uvOffset;
-        int texCoordSet;
-        int useBones;
-        int isFoliage;
-        float fadeAlpha;
+        int32_t texCoordSet;        // UV set index (0 or 1)
+        int32_t isFoliage;          // Foliage wind animation flag
+        int32_t instanceDataOffset; // Base index into instance SSBO for this draw group
     };
 
     // Validate per-frame descriptor set before any Vulkan commands
@@ -2377,311 +2732,338 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
     // Start with opaque pipeline
     vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, opaquePipeline_);
     currentPipeline = opaquePipeline_;
-    bool opaquePass = true; // Pass 1 = opaque, pass 2 = transparent (set below for second pass)
 
     // Bind dummy bone set (set 2) so non-animated draws have a valid binding.
-    // Animated instances override this with their real bone set per-instance.
-    if (dummyBoneSet_) {
+    // Phase 2.4: Bind mega bone SSBO instead — all instances index into one buffer via boneBase.
+    if (megaBoneSet_[frameIndex]) {
+        vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
+                                pipelineLayout_, 2, 1, &megaBoneSet_[frameIndex], 0, nullptr);
+    } else if (dummyBoneSet_) {
         vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
                                 pipelineLayout_, 2, 1, &dummyBoneSet_, 0, nullptr);
     }
 
-    for (const auto& entry : sortedVisible_) {
-        if (entry.index >= instances.size()) continue;
-        auto& instance = instances[entry.index];
+    // Phase 2.1: Bind instance data SSBO (set 3) — per-instance transforms, fade, bones
+    if (instanceSet_[frameIndex]) {
+        vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
+                                pipelineLayout_, 3, 1, &instanceSet_[frameIndex], 0, nullptr);
+    }
 
-        // Bind vertex + index buffers once per model group
-        if (entry.modelId != currentModelId) {
-            currentModelId = entry.modelId;
-            currentModelValid = false;
-            auto mdlIt = models.find(currentModelId);
-            if (mdlIt == models.end()) continue;
-            currentModel = &mdlIt->second;
-            if (!currentModel->vertexBuffer || !currentModel->indexBuffer) continue;
-            currentModelValid = true;
-            VkDeviceSize offset = 0;
-            vkCmdBindVertexBuffers(cmd, 0, 1, &currentModel->vertexBuffer, &offset);
-            vkCmdBindIndexBuffer(cmd, currentModel->indexBuffer, 0, VK_INDEX_TYPE_UINT16);
-        }
-        if (!currentModelValid) continue;
+    // Phase 2.1: Reset instance SSBO write cursor for this frame
+    instanceDataCount_ = 0;
+    auto* instSSBO = static_cast<M2InstanceGPU*>(instanceMapped_[frameIndex]);
 
-        const M2ModelGPU& model = *currentModel;
+    // =====================================================================
+    // Phase 2.1: Opaque pass — instanced draws grouped by (modelId, LOD)
+    // =====================================================================
+    // sortedVisible_ is already sorted by modelId so consecutive entries share
+    // the same vertex/index buffer.  Within each model group we sub-group by
+    // targetLOD to guarantee all instances in one vkCmdDrawIndexed use the
+    // same batch set.  Per-instance data (model matrix, fade, bones) is
+    // written to the instance SSBO; the shader reads it via gl_InstanceIndex.
+    {
+        struct PendingInstance {  // One visible instance of the current model group, queued for the opaque instanced draws
+            uint32_t instanceIdx;  // Index into `instances` (copied from VisibleEntry::index)
+            float fadeAlpha;       // Distance fade after the ground-detail / instance-portal multipliers
+            bool useBones;         // True when the model needs animation and this instance has bone matrices
+            uint16_t targetLOD;    // LOD to draw; reset to 0 when the desired LOD is not in model.availableLODs
+        };
+        std::vector<PendingInstance> pending;
+        pending.reserve(128);
 
-        // Distance-based fade alpha for smooth pop-in (squared-distance, no sqrt)
-        float fadeAlpha = 1.0f;
-        float fadeFrac = model.disableAnimation ? 0.55f : fadeStartFraction;
-        float fadeStartDistSq = entry.effectiveMaxDistSq * fadeFrac * fadeFrac;
-        if (entry.distSq > fadeStartDistSq) {
-            fadeAlpha = std::clamp((entry.effectiveMaxDistSq - entry.distSq) /
-                                  (entry.effectiveMaxDistSq - fadeStartDistSq), 0.0f, 1.0f);
-        }
+        size_t visStart = 0;
+        while (visStart < sortedVisible_.size()) {
+            // Find group of consecutive entries with same modelId
+            uint32_t groupModelId = sortedVisible_[visStart].modelId;
+            size_t groupEnd = visStart;
+            while (groupEnd < sortedVisible_.size() && sortedVisible_[groupEnd].modelId == groupModelId)
+                groupEnd++;
 
-        float instanceFadeAlpha = fadeAlpha;
-        if (model.isGroundDetail) {
-            instanceFadeAlpha *= 0.82f;
-        }
-        if (model.isInstancePortal) {
-            // Render mesh at low alpha + emit glow sprite at center
-            instanceFadeAlpha *= 0.12f;
-            if (entry.distSq < 400.0f * 400.0f) {
-                glm::vec3 center = glm::vec3(instance.modelMatrix * glm::vec4(0.0f, 0.0f, 0.0f, 1.0f));
-                GlowSprite gs;
-                gs.worldPos = center;
-                gs.color = glm::vec4(0.35f, 0.5f, 1.0f, 1.1f);
-                gs.size = instance.scale * 5.0f;
-                glowSprites_.push_back(gs);
-                GlowSprite halo = gs;
-                halo.color.a *= 0.3f;
-                halo.size *= 2.2f;
-                glowSprites_.push_back(halo);
-            }
-        }
-
-        // Upload bone matrices to SSBO if model has skeletal animation.
-        // Skip animated instances entirely until bones are computed + buffers allocated
-        // to prevent bind-pose/T-pose flash on first appearance.
-        bool modelNeedsAnimation = model.hasAnimation && !model.disableAnimation;
-        if (modelNeedsAnimation && instance.boneMatrices.empty()) {
-            continue;  // Bones not yet computed — skip to avoid bind-pose flash
-        }
-        bool needsBones = modelNeedsAnimation && !instance.boneMatrices.empty();
-        if (needsBones && (!instance.boneBuffer[frameIndex] || !instance.boneSet[frameIndex])) {
-            continue;  // Bone buffers not yet allocated — skip to avoid bind-pose flash
-        }
-        bool useBones = needsBones;
-        if (useBones) {
-            // Upload bone matrices only when recomputed (per-frame-index tracking
-            // ensures both double-buffered SSBOs get the latest bone data)
-            if (instance.bonesDirty[frameIndex] && instance.boneMapped[frameIndex]) {
-                int numBones = std::min(static_cast<int>(instance.boneMatrices.size()), 128);
-                memcpy(instance.boneMapped[frameIndex], instance.boneMatrices.data(),
-                       numBones * sizeof(glm::mat4));
-                instance.bonesDirty[frameIndex] = false;
-            }
-
-            // Bind bone descriptor set (set 2) — skip if already bound
-            if (instance.boneSet[frameIndex] && instance.boneSet[frameIndex] != currentBoneSet) {
-                vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
-                                        pipelineLayout_, 2, 1, &instance.boneSet[frameIndex], 0, nullptr);
-                currentBoneSet = instance.boneSet[frameIndex];
-            }
-        }
-
-        // LOD selection based on squared distance (avoid sqrt)
-        uint16_t desiredLOD = 0;
-        if (entry.distSq > 150.0f * 150.0f) desiredLOD = 3;
-        else if (entry.distSq > 80.0f * 80.0f) desiredLOD = 2;
-        else if (entry.distSq > 40.0f * 40.0f) desiredLOD = 1;
-
-        uint16_t targetLOD = desiredLOD;
-        if (desiredLOD > 0 && !(model.availableLODs & (1u << desiredLOD))) {
-            targetLOD = 0;
-        }
-
-        const bool foliageLikeModel = model.isFoliageLike;
-        // Particle-dominant spell effects: mesh is emission geometry, render dim
-        const bool particleDominantEffect = model.isSpellEffect &&
-            !model.particleEmitters.empty() && model.batches.size() <= 2;
-        for (const auto& batch : model.batches) {
-            if (batch.indexCount == 0) continue;
-            if (!model.isGroundDetail && batch.submeshLevel != targetLOD) continue;
-            if (batch.batchOpacity < 0.01f) continue;
-
-            // Two-pass gate: pass 1 = opaque/cutout only, pass 2 = transparent/additive only.
-            // Alpha-test (blendMode==1) and spell effects that force-additive are handled
-            // by their effective blend mode below; gate on raw blendMode here.
-            {
-                const bool rawTransparent = (batch.blendMode >= 2) || model.isSpellEffect;
-                if (opaquePass && rawTransparent) continue;   // skip transparent in opaque pass
-                if (!opaquePass && !rawTransparent) continue; // skip opaque in transparent pass
-            }
-
-            const bool koboldFlameCard = batch.colorKeyBlack && model.isKoboldFlame;
-            const bool smallCardLikeBatch =
-                (batch.glowSize <= 1.35f) ||
-                (batch.lanternGlowHint && batch.glowSize <= 6.0f);
-            const bool batchUnlit = (batch.materialFlags & 0x01) != 0;
-            const bool elvenLikeModel = model.isElvenLike;
-            const bool lanternLikeModel = model.isLanternLike;
-            const bool shouldUseGlowSprite =
-                !koboldFlameCard &&
-                (elvenLikeModel || (lanternLikeModel && batch.lanternGlowHint)) &&
-                !model.isSpellEffect &&
-                smallCardLikeBatch &&
-                (batch.lanternGlowHint ||
-                 (batch.blendMode >= 3) ||
-                 (batch.colorKeyBlack && batchUnlit && batch.blendMode >= 1));
-            if (shouldUseGlowSprite) {
-                if (entry.distSq < 180.0f * 180.0f) {
-                    glm::vec3 worldPos = glm::vec3(instance.modelMatrix * glm::vec4(batch.center, 1.0f));
-                    GlowSprite gs;
-                    gs.worldPos = worldPos;
-                    if (batch.glowTint == 1 || elvenLikeModel) {
-                        gs.color = glm::vec4(0.48f, 0.72f, 1.0f, 1.05f);
-                    } else if (batch.glowTint == 2) {
-                        gs.color = glm::vec4(1.0f, 0.28f, 0.22f, 1.10f);
-                    } else {
-                        gs.color = glm::vec4(1.0f, 0.82f, 0.46f, 1.15f);
-                    }
-                    gs.size = batch.glowSize * instance.scale * 1.45f;
-                    glowSprites_.push_back(gs);
-                    GlowSprite halo = gs;
-                    halo.color.a *= 0.42f;
-                    halo.size *= 1.8f;
-                    glowSprites_.push_back(halo);
-                }
-                const bool cardLikeSkipMesh =
-                    (batch.blendMode >= 3) ||
-                    batch.colorKeyBlack ||
-                    ((batch.materialFlags & 0x01) != 0);
-                const bool lanternGlowCardSkip =
-                    lanternLikeModel &&
-                    batch.lanternGlowHint &&
-                    smallCardLikeBatch &&
-                    cardLikeSkipMesh;
-                if (lanternGlowCardSkip || (cardLikeSkipMesh && !lanternLikeModel)) {
-                    continue;
-                }
-            }
-
-            // Compute UV offset for texture animation
-            glm::vec2 uvOffset(0.0f, 0.0f);
-            if (batch.textureAnimIndex != 0xFFFF && model.hasTextureAnimation) {
-                uint16_t lookupIdx = batch.textureAnimIndex;
-                if (lookupIdx < model.textureTransformLookup.size()) {
-                    uint16_t transformIdx = model.textureTransformLookup[lookupIdx];
-                    if (transformIdx < model.textureTransforms.size()) {
-                        const auto& tt = model.textureTransforms[transformIdx];
-                        glm::vec3 trans = interpVec3(tt.translation,
-                            instance.currentSequenceIndex, instance.animTime,
-                            glm::vec3(0.0f), model.globalSequenceDurations);
-                        uvOffset = glm::vec2(trans.x, trans.y);
-                    }
-                }
-            }
-            // Lava M2 models: fallback UV scroll if no texture animation.
-            // Uses kLavaAnimStart (file-scope) for consistent timing across passes.
-            if (model.isLavaModel && uvOffset == glm::vec2(0.0f)) {
-                float t = std::chrono::duration<float>(std::chrono::steady_clock::now() - kLavaAnimStart).count();
-                uvOffset = glm::vec2(t * 0.03f, -t * 0.08f);
-            }
-
-            // Foliage/card-like batches render more stably as cutout (depth-write on)
-            // instead of alpha-blended sorting.
-            const bool foliageCutout =
-                foliageLikeModel &&
-                !model.isSpellEffect &&
-                batch.blendMode <= 3;
-            const bool forceCutout =
-                !model.isSpellEffect &&
-                (model.isGroundDetail ||
-                 foliageCutout ||
-                 batch.blendMode == 1 ||
-                 (batch.blendMode >= 2 && !batch.hasAlpha) ||
-                 batch.colorKeyBlack);
-
-            // Select pipeline based on blend mode
-            uint8_t effectiveBlendMode = batch.blendMode;
-            if (model.isSpellEffect) {
-                // Effect models: force additive blend for opaque/cutout batches
-                // so the mesh renders as a transparent glow, not a solid object
-                if (effectiveBlendMode <= 1) {
-                    effectiveBlendMode = 3;  // additive
-                } else if (effectiveBlendMode == 4 || effectiveBlendMode == 5) {
-                    effectiveBlendMode = 3;
-                }
-            }
-            if (forceCutout) {
-                effectiveBlendMode = 1;
-            }
-
-            VkPipeline desiredPipeline;
-            if (forceCutout) {
-                // Use opaque pipeline + shader discard for stable foliage cards.
-                desiredPipeline = opaquePipeline_;
-            } else {
-                switch (effectiveBlendMode) {
-                    case 0: desiredPipeline = opaquePipeline_; break;
-                    case 1: desiredPipeline = alphaTestPipeline_; break;
-                    case 2: desiredPipeline = alphaPipeline_; break;
-                    default: desiredPipeline = additivePipeline_; break;
-                }
-            }
-            if (desiredPipeline != currentPipeline) {
-                vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, desiredPipeline);
-                currentPipeline = desiredPipeline;
-            }
-
-            // Update material UBO with per-draw dynamic values (interiorDarken, forceCutout overrides)
-            // Note: fadeAlpha is in push constants (per-draw) to avoid shared-UBO race
-            if (batch.materialUBOMapped) {
-                auto* mat = static_cast<M2MaterialUBO*>(batch.materialUBOMapped);
-                mat->interiorDarken = insideInterior ? 1.0f : 0.0f;
-                if (batch.colorKeyBlack) {
-                    mat->colorKeyThreshold = (effectiveBlendMode == 4 || effectiveBlendMode == 5) ? 0.7f : 0.08f;
-                }
-                if (forceCutout) {
-                    mat->alphaTest = model.isGroundDetail ? 3 : (foliageCutout ? 2 : 1);
-                    if (model.isGroundDetail) {
-                        mat->unlit = 0;
-                    }
-                }
-            }
-
-            // Bind material descriptor set (set 1) — skip batch if missing
-            // to avoid inheriting a stale descriptor set from a prior renderer
-            if (!batch.materialSet) continue;
-            if (batch.materialSet != currentMaterialSet) {
-                vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
-                                        pipelineLayout_, 1, 1, &batch.materialSet, 0, nullptr);
-                currentMaterialSet = batch.materialSet;
-            }
-
-            // Push constants
-            M2PushConstants pc;
-            pc.model = instance.modelMatrix;
-            pc.uvOffset = uvOffset;
-            pc.texCoordSet = static_cast<int>(batch.textureUnit);
-            pc.useBones = useBones ? 1 : 0;
-            pc.isFoliage = model.shadowWindFoliage ? 1 : 0;
-            pc.fadeAlpha = instanceFadeAlpha;
-            // Particle-dominant effects: mesh is emission geometry, don't render
-            if (particleDominantEffect && batch.blendMode <= 1) {
+            auto mdlIt = models.find(groupModelId);
+            if (mdlIt == models.end() || !mdlIt->second.vertexBuffer || !mdlIt->second.indexBuffer) {
+                visStart = groupEnd;
                 continue;
             }
-            vkCmdPushConstants(cmd, pipelineLayout_, VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(pc), &pc);
-            vkCmdDrawIndexed(cmd, batch.indexCount, 1, batch.indexStart, 0, 0);
-            lastDrawCallCount++;
+            const M2ModelGPU& model = mdlIt->second;
+
+            bool modelNeedsAnimation = model.hasAnimation && !model.disableAnimation;
+            const bool foliageLikeModel = model.isFoliageLike;
+            const bool particleDominantEffect = model.isSpellEffect &&
+                !model.particleEmitters.empty() && model.batches.size() <= 2;
+
+            // Collect per-instance data for this model group
+            pending.clear();
+            for (size_t vi = visStart; vi < groupEnd; vi++) {
+                const auto& entry = sortedVisible_[vi];
+                if (entry.index >= instances.size()) continue;
+                auto& instance = instances[entry.index];
+
+                // Distance-based fade alpha
+                float fadeFrac = model.disableAnimation ? 0.55f : fadeStartFraction;
+                float fadeStartDistSq = entry.effectiveMaxDistSq * fadeFrac * fadeFrac;
+                float fadeAlpha = 1.0f;
+                if (entry.distSq > fadeStartDistSq) {
+                    fadeAlpha = std::clamp((entry.effectiveMaxDistSq - entry.distSq) /
+                                          (entry.effectiveMaxDistSq - fadeStartDistSq), 0.0f, 1.0f);
+                }
+                float instanceFadeAlpha = fadeAlpha;
+                if (model.isGroundDetail) instanceFadeAlpha *= 0.82f;
+                if (model.isInstancePortal) {
+                    instanceFadeAlpha *= 0.12f;
+                    if (entry.distSq < 400.0f * 400.0f) {
+                        glm::vec3 center = glm::vec3(instance.modelMatrix * glm::vec4(0.0f, 0.0f, 0.0f, 1.0f));
+                        GlowSprite gs;
+                        gs.worldPos = center;
+                        gs.color = glm::vec4(0.35f, 0.5f, 1.0f, 1.1f);
+                        gs.size = instance.scale * 5.0f;
+                        glowSprites_.push_back(gs);
+                        GlowSprite halo = gs;
+                        halo.color.a *= 0.3f;
+                        halo.size *= 2.2f;
+                        glowSprites_.push_back(halo);
+                    }
+                }
+
+                // Bone readiness check
+                if (modelNeedsAnimation && instance.boneMatrices.empty()) continue;
+                bool needsBones = modelNeedsAnimation && !instance.boneMatrices.empty();
+                if (needsBones && instance.megaBoneOffset == 0) continue;
+
+                // LOD selection
+                uint16_t desiredLOD = 0;
+                if (entry.distSq > 150.0f * 150.0f) desiredLOD = 3;
+                else if (entry.distSq > 80.0f * 80.0f) desiredLOD = 2;
+                else if (entry.distSq > 40.0f * 40.0f) desiredLOD = 1;
+                uint16_t targetLOD = desiredLOD;
+                if (desiredLOD > 0 && !(model.availableLODs & (1u << desiredLOD))) targetLOD = 0;
+
+                pending.push_back({entry.index, instanceFadeAlpha, needsBones, targetLOD});
+            }
+
+            if (pending.empty()) { visStart = groupEnd; continue; }
+
+            // Sort by targetLOD so each sub-group occupies a contiguous SSBO range
+            std::sort(pending.begin(), pending.end(),
+                      [](const PendingInstance& a, const PendingInstance& b) { return a.targetLOD < b.targetLOD; });
+
+            // Bind vertex/index buffers once per model group
+            VkDeviceSize vbOffset = 0;
+            vkCmdBindVertexBuffers(cmd, 0, 1, &model.vertexBuffer, &vbOffset);
+            vkCmdBindIndexBuffer(cmd, model.indexBuffer, 0, VK_INDEX_TYPE_UINT16);
+
+            // Write base instance data to SSBO (uvOffset=0 — overridden for tex-anim batches)
+            uint32_t baseSSBOOffset = instanceDataCount_;
+            for (const auto& p : pending) {
+                if (instanceDataCount_ >= MAX_INSTANCE_DATA) break;
+                auto& inst = instances[p.instanceIdx];
+                auto& e = instSSBO[instanceDataCount_];
+                e.model = inst.modelMatrix;
+                e.uvOffset = glm::vec2(0.0f);
+                e.fadeAlpha = p.fadeAlpha;
+                e.useBones = p.useBones ? 1 : 0;
+                e.boneBase = p.useBones ? static_cast<int32_t>(inst.megaBoneOffset) : 0;
+                std::memset(e._pad, 0, sizeof(e._pad));
+                instanceDataCount_++;
+            }
+
+            // Process LOD sub-groups within this model group
+            size_t lodIdx = 0;
+            while (lodIdx < pending.size()) {
+                uint16_t lod = pending[lodIdx].targetLOD;
+                size_t lodEnd = lodIdx + 1;
+                while (lodEnd < pending.size() && pending[lodEnd].targetLOD == lod) lodEnd++;
+                uint32_t groupSize = static_cast<uint32_t>(lodEnd - lodIdx);
+                uint32_t groupSSBOOffset = baseSSBOOffset + static_cast<uint32_t>(lodIdx);
+
+                for (size_t bi = 0; bi < model.batches.size(); bi++) {
+                    const auto& batch = model.batches[bi];
+                    if (batch.indexCount == 0) continue;
+                    if (!model.isGroundDetail && batch.submeshLevel != lod) continue;
+                    if (batch.batchOpacity < 0.01f) continue;
+
+                    // Opaque gate — skip transparent batches
+                    const bool rawTransparent = (batch.blendMode >= 2) || model.isSpellEffect;
+                    if (rawTransparent) continue;
+
+                    // Particle-dominant effects: emission geometry — skip opaque
+                    if (particleDominantEffect && batch.blendMode <= 1) continue;
+
+                    // Glow sprite check (per model+batch, sprites generated per instance)
+                    const bool koboldFlameCard = batch.colorKeyBlack && model.isKoboldFlame;
+                    const bool smallCardLikeBatch =
+                        (batch.glowSize <= 1.35f) ||
+                        (batch.lanternGlowHint && batch.glowSize <= 6.0f);
+                    const bool batchUnlit = (batch.materialFlags & 0x01) != 0;
+                    const bool shouldUseGlowSprite =
+                        !koboldFlameCard &&
+                        (model.isElvenLike || (model.isLanternLike && batch.lanternGlowHint)) &&
+                        !model.isSpellEffect &&
+                        smallCardLikeBatch &&
+                        (batch.lanternGlowHint ||
+                         (batch.blendMode >= 3) ||
+                         (batch.colorKeyBlack && batchUnlit && batch.blendMode >= 1));
+                    if (shouldUseGlowSprite) {
+                        // Generate glow sprites for each instance in the group
+                        for (size_t j = lodIdx; j < lodEnd; j++) {
+                            auto& inst = instances[pending[j].instanceIdx];
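+                            float distSq = sortedVisible_[visStart].distSq; // NOTE(review): distSq of the model group's first entry, reused for every instance in this loop — not the instance's own distance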
+                            if (distSq < 180.0f * 180.0f) {
+                                glm::vec3 worldPos = glm::vec3(inst.modelMatrix * glm::vec4(batch.center, 1.0f));
+                                GlowSprite gs;
+                                gs.worldPos = worldPos;
+                                if (batch.glowTint == 1 || model.isElvenLike)
+                                    gs.color = glm::vec4(0.48f, 0.72f, 1.0f, 1.05f);
+                                else if (batch.glowTint == 2)
+                                    gs.color = glm::vec4(1.0f, 0.28f, 0.22f, 1.10f);
+                                else
+                                    gs.color = glm::vec4(1.0f, 0.82f, 0.46f, 1.15f);
+                                gs.size = batch.glowSize * inst.scale * 1.45f;
+                                glowSprites_.push_back(gs);
+                                GlowSprite halo = gs;
+                                halo.color.a *= 0.42f;
+                                halo.size *= 1.8f;
+                                glowSprites_.push_back(halo);
+                            }
+                        }
+                        const bool cardLikeSkipMesh =
+                            (batch.blendMode >= 3) || batch.colorKeyBlack || batchUnlit;
+                        const bool lanternGlowCardSkip =
+                            model.isLanternLike && batch.lanternGlowHint &&
+                            smallCardLikeBatch && cardLikeSkipMesh;
+                        if (lanternGlowCardSkip || (cardLikeSkipMesh && !model.isLanternLike))
+                            continue;
+                    }
+
+                    // Handle texture animation: if this batch has per-instance uvOffset,
+                    // write a separate SSBO range with the correct offsets.
+                    bool hasBatchTexAnim = (batch.textureAnimIndex != 0xFFFF && model.hasTextureAnimation)
+                                           || model.isLavaModel;
+                    uint32_t drawOffset = groupSSBOOffset;
+                    if (hasBatchTexAnim && instanceDataCount_ + groupSize <= MAX_INSTANCE_DATA) {
+                        drawOffset = instanceDataCount_;
+                        for (size_t j = lodIdx; j < lodEnd; j++) {
+                            auto& inst = instances[pending[j].instanceIdx];
+                            glm::vec2 uvOffset(0.0f);
+                            if (batch.textureAnimIndex != 0xFFFF && model.hasTextureAnimation) {
+                                uint16_t lookupIdx = batch.textureAnimIndex;
+                                if (lookupIdx < model.textureTransformLookup.size()) {
+                                    uint16_t transformIdx = model.textureTransformLookup[lookupIdx];
+                                    if (transformIdx < model.textureTransforms.size()) {
+                                        const auto& tt = model.textureTransforms[transformIdx];
+                                        glm::vec3 trans = interpVec3(tt.translation,
+                                            inst.currentSequenceIndex, inst.animTime,
+                                            glm::vec3(0.0f), model.globalSequenceDurations);
+                                        uvOffset = glm::vec2(trans.x, trans.y);
+                                    }
+                                }
+                            }
+                            if (model.isLavaModel && uvOffset == glm::vec2(0.0f)) {
+                                float t = std::chrono::duration<float>(
+                                    std::chrono::steady_clock::now() - kLavaAnimStart).count();
+                                uvOffset = glm::vec2(t * 0.03f, -t * 0.08f);
+                            }
+                            // Copy base entry and override uvOffset
+                            instSSBO[instanceDataCount_] = instSSBO[groupSSBOOffset + (j - lodIdx)];
+                            instSSBO[instanceDataCount_].uvOffset = uvOffset;
+                            instanceDataCount_++;
+                        }
+                    }
+
+                    // Pipeline selection (per-model/batch, not per-instance)
+                    const bool foliageCutout = foliageLikeModel && !model.isSpellEffect && batch.blendMode <= 3;
+                    const bool forceCutout =
+                        !model.isSpellEffect &&
+                        (model.isGroundDetail || foliageCutout ||
+                         batch.blendMode == 1 ||
+                         (batch.blendMode >= 2 && !batch.hasAlpha) ||
+                         batch.colorKeyBlack);
+
+                    uint8_t effectiveBlendMode = batch.blendMode;
+                    if (model.isSpellEffect) {
+                        if (effectiveBlendMode <= 1) effectiveBlendMode = 3;
+                        else if (effectiveBlendMode == 4 || effectiveBlendMode == 5) effectiveBlendMode = 3;
+                    }
+                    if (forceCutout) effectiveBlendMode = 1;
+
+                    VkPipeline desiredPipeline;
+                    if (forceCutout) {
+                        desiredPipeline = opaquePipeline_;
+                    } else {
+                        switch (effectiveBlendMode) {
+                            case 0: desiredPipeline = opaquePipeline_; break;
+                            case 1: desiredPipeline = alphaTestPipeline_; break;
+                            case 2: desiredPipeline = alphaPipeline_; break;
+                            default: desiredPipeline = additivePipeline_; break;
+                        }
+                    }
+                    if (desiredPipeline != currentPipeline) {
+                        vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, desiredPipeline);
+                        currentPipeline = desiredPipeline;
+                    }
+
+                    // Update material UBO
+                    if (batch.materialUBOMapped) {
+                        auto* mat = static_cast<M2MaterialUBO*>(batch.materialUBOMapped);
+                        mat->interiorDarken = insideInterior ? 1.0f : 0.0f;
+                        if (batch.colorKeyBlack)
+                            mat->colorKeyThreshold = (effectiveBlendMode == 4 || effectiveBlendMode == 5) ? 0.7f : 0.08f;
+                        if (forceCutout) {
+                            mat->alphaTest = model.isGroundDetail ? 3 : (foliageCutout ? 2 : 1);
+                            if (model.isGroundDetail) mat->unlit = 0;
+                        }
+                    }
+
+                    // Bind material descriptor set (set 1)
+                    if (!batch.materialSet) continue;
+                    if (batch.materialSet != currentMaterialSet) {
+                        vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
+                                                pipelineLayout_, 1, 1, &batch.materialSet, 0, nullptr);
+                        currentMaterialSet = batch.materialSet;
+                    }
+
+                    // Push constants + instanced draw
+                    M2PushConstants pc;
+                    pc.texCoordSet = static_cast<int32_t>(batch.textureUnit);
+                    pc.isFoliage = model.shadowWindFoliage ? 1 : 0;
+                    pc.instanceDataOffset = static_cast<int32_t>(drawOffset);
+                    vkCmdPushConstants(cmd, pipelineLayout_, VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(pc), &pc);
+                    vkCmdDrawIndexed(cmd, batch.indexCount, groupSize, batch.indexStart, 0, 0);
+                    lastDrawCallCount++;
+                }
+
+                lodIdx = lodEnd;
+            }
+
+            visStart = groupEnd;
         }
     }
 
-    // Pass 2: transparent/additive batches — sort back-to-front by distance so
-    // overlapping transparent geometry composites in the correct painter's order.
-    opaquePass = false;
+    // =====================================================================
+    // Pass 2: Transparent/additive batches — back-to-front per instance
+    // =====================================================================
+    // Transparent geometry must be drawn individually per instance in back-to-
+    // front order for correct alpha compositing.  Each draw writes one
+    // M2InstanceGPU entry and issues a single-instance indexed draw.
     std::sort(sortedVisible_.begin(), sortedVisible_.end(),
               [](const VisibleEntry& a, const VisibleEntry& b) { return a.distSq > b.distSq; });
 
     currentModelId = UINT32_MAX;
     currentModel = nullptr;
     currentModelValid = false;
-    // Reset state so the first transparent bind always sets explicitly
     currentPipeline = opaquePipeline_;
     currentMaterialSet = VK_NULL_HANDLE;
-    currentBoneSet = VK_NULL_HANDLE;
 
     for (const auto& entry : sortedVisible_) {
         if (entry.index >= instances.size()) continue;
         auto& instance = instances[entry.index];
 
-        // Quick skip: if model has no transparent batches at all, skip it entirely
+        // Quick skip: if model has no transparent batches at all
         if (entry.modelId != currentModelId) {
             auto mdlIt = models.find(entry.modelId);
             if (mdlIt == models.end()) continue;
             if (!mdlIt->second.hasTransparentBatches && !mdlIt->second.isSpellEffect) continue;
         }
 
-        // Reuse the same rendering logic as pass 1 (via fallthrough — the batch gate
-        // `!opaquePass && !rawTransparent → continue` handles opaque skipping)
         if (entry.modelId != currentModelId) {
             currentModelId = entry.modelId;
             currentModelValid = false;
@@ -2690,15 +3072,15 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
             currentModel = &mdlIt->second;
             if (!currentModel->vertexBuffer || !currentModel->indexBuffer) continue;
             currentModelValid = true;
-            VkDeviceSize offset = 0;
-            vkCmdBindVertexBuffers(cmd, 0, 1, &currentModel->vertexBuffer, &offset);
+            VkDeviceSize vbOff = 0;
+            vkCmdBindVertexBuffers(cmd, 0, 1, &currentModel->vertexBuffer, &vbOff);
             vkCmdBindIndexBuffer(cmd, currentModel->indexBuffer, 0, VK_INDEX_TYPE_UINT16);
         }
         if (!currentModelValid) continue;
 
         const M2ModelGPU& model = *currentModel;
 
-        // Distance-based fade alpha (same as pass 1)
+        // Fade alpha
         float fadeAlpha = 1.0f;
         float fadeFrac = model.disableAnimation ? 0.55f : fadeStartFraction;
         float fadeStartDistSq = entry.effectiveMaxDistSq * fadeFrac * fadeFrac;
@@ -2713,13 +3095,7 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
         bool modelNeedsAnimation = model.hasAnimation && !model.disableAnimation;
         if (modelNeedsAnimation && instance.boneMatrices.empty()) continue;
         bool needsBones = modelNeedsAnimation && !instance.boneMatrices.empty();
-        if (needsBones && (!instance.boneBuffer[frameIndex] || !instance.boneSet[frameIndex])) continue;
-        bool useBones = needsBones;
-        if (useBones && instance.boneSet[frameIndex] && instance.boneSet[frameIndex] != currentBoneSet) {
-            vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
-                                    pipelineLayout_, 2, 1, &instance.boneSet[frameIndex], 0, nullptr);
-            currentBoneSet = instance.boneSet[frameIndex];
-        }
+        if (needsBones && instance.megaBoneOffset == 0) continue;
 
         uint16_t desiredLOD = 0;
         if (entry.distSq > 150.0f * 150.0f) desiredLOD = 3;
@@ -2742,7 +3118,7 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
                 if (!rawTransparent) continue;
             }
 
-            // Skip glow sprites (handled after loop)
+            // Skip glow sprites (handled in opaque pass)
             const bool batchUnlit = (batch.materialFlags & 0x01) != 0;
             const bool koboldFlameCard = batch.colorKeyBlack && model.isKoboldFlame;
             const bool smallCardLikeBatch =
@@ -2766,7 +3142,10 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
                     continue;
             }
 
-            glm::vec2 uvOffset(0.0f, 0.0f);
+            if (particleDominantEffect) continue; // emission-only mesh
+
+            // Compute UV offset for this instance + batch
+            glm::vec2 uvOffset(0.0f);
             if (batch.textureAnimIndex != 0xFFFF && model.hasTextureAnimation) {
                 uint16_t lookupIdx = batch.textureAnimIndex;
                 if (lookupIdx < model.textureTransformLookup.size()) {
@@ -2785,6 +3164,19 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
                 uvOffset = glm::vec2(t * 0.03f, -t * 0.08f);
             }
 
+            // Write single instance entry to SSBO
+            if (instanceDataCount_ >= MAX_INSTANCE_DATA) continue;
+            uint32_t drawOffset = instanceDataCount_;
+            auto& e = instSSBO[instanceDataCount_];
+            e.model = instance.modelMatrix;
+            e.uvOffset = uvOffset;
+            e.fadeAlpha = instanceFadeAlpha;
+            e.useBones = needsBones ? 1 : 0;
+            e.boneBase = needsBones ? static_cast<int32_t>(instance.megaBoneOffset) : 0;
+            std::memset(e._pad, 0, sizeof(e._pad));
+            instanceDataCount_++;
+
+            // Pipeline selection
             uint8_t effectiveBlendMode = batch.blendMode;
             if (model.isSpellEffect) {
                 if (effectiveBlendMode <= 1) effectiveBlendMode = 3;
@@ -2815,14 +3207,11 @@ void M2Renderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
                 currentMaterialSet = batch.materialSet;
             }
 
+            // Push constants + single-instance draw
             M2PushConstants pc;
-            pc.model = instance.modelMatrix;
-            pc.uvOffset = uvOffset;
-            pc.texCoordSet = static_cast<int>(batch.textureUnit);
-            pc.useBones = useBones ? 1 : 0;
+            pc.texCoordSet = static_cast<int32_t>(batch.textureUnit);
             pc.isFoliage = model.shadowWindFoliage ? 1 : 0;
-            pc.fadeAlpha = instanceFadeAlpha;
-            if (particleDominantEffect) continue; // emission-only mesh
+            pc.instanceDataOffset = static_cast<int32_t>(drawOffset);
             vkCmdPushConstants(cmd, pipelineLayout_, VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(pc), &pc);
             vkCmdDrawIndexed(cmd, batch.indexCount, 1, batch.indexStart, 0, 0);
             lastDrawCallCount++;
@@ -4842,7 +5231,9 @@ void M2Renderer::recreatePipelines() {
         {4, 0, VK_FORMAT_R32G32B32A32_SFLOAT, 14 * sizeof(float)}, // boneIndices (float)
     };
 
-    auto buildM2Pipeline = [&](VkPipelineColorBlendAttachmentState blendState, bool depthWrite) -> VkPipeline {
+    // Pipeline derivatives — opaque is the base, others derive from it for shared state optimization
+    auto buildM2Pipeline = [&](VkPipelineColorBlendAttachmentState blendState, bool depthWrite,
+                               VkPipelineCreateFlags flags = 0, VkPipeline basePipeline = VK_NULL_HANDLE) -> VkPipeline {
         return PipelineBuilder()
             .setShaders(m2Vert.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
                         m2Frag.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
@@ -4855,13 +5246,19 @@ void M2Renderer::recreatePipelines() {
             .setLayout(pipelineLayout_)
             .setRenderPass(mainPass)
             .setDynamicStates({VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR})
+            .setFlags(flags)
+            .setBasePipeline(basePipeline)
             .build(device, vkCtx_->getPipelineCache());
     };
 
-    opaquePipeline_ = buildM2Pipeline(PipelineBuilder::blendDisabled(), true);
-    alphaTestPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), true);
-    alphaPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), false);
-    additivePipeline_ = buildM2Pipeline(PipelineBuilder::blendAdditive(), false);
+    opaquePipeline_ = buildM2Pipeline(PipelineBuilder::blendDisabled(), true,
+                                      VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT);
+    alphaTestPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), true,
+                                         VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_);
+    alphaPipeline_ = buildM2Pipeline(PipelineBuilder::blendAlpha(), false,
+                                     VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_);
+    additivePipeline_ = buildM2Pipeline(PipelineBuilder::blendAdditive(), false,
+                                        VK_PIPELINE_CREATE_DERIVATIVE_BIT, opaquePipeline_);
 
     // --- Particle pipelines ---
     if (particleVert.isValid() && particleFrag.isValid()) {
diff --git a/src/rendering/render_graph.cpp b/src/rendering/render_graph.cpp
new file mode 100644
index 00000000..d36d20bc
--- /dev/null
+++ b/src/rendering/render_graph.cpp
@@ -0,0 +1,194 @@
+#include "rendering/render_graph.hpp"
+#include "core/logger.hpp"
+#include <algorithm>
+#include <unordered_map>
+#include <queue>
+
+namespace wowee {
+namespace rendering {
+
+void RenderGraph::reset() {
+    // Drops per-frame state only; resources_ is left alone so handles stay usable.
+    compiled_ = false;
+    executionOrder_.clear();
+    passes_.clear();
+}
+
+RGResource RenderGraph::registerResource(const std::string& name) {
+    // Re-registering a known name hands back the handle it already has.
+    const auto known = std::find_if(resources_.begin(), resources_.end(),
+                                    [&name](const auto& res) { return res.name == name; });
+    if (known != resources_.end()) return {known->id};
+    const uint32_t freshId = nextResourceId_++;
+    resources_.push_back({name, freshId});
+    return {freshId};
+}
+
+RGResource RenderGraph::findResource(const std::string& name) const {
+    const auto hit = std::find_if(resources_.begin(), resources_.end(),
+                                  [&name](const auto& res) { return res.name == name; });
+    if (hit != resources_.end()) return {hit->id};
+    return {}; // unknown name: default-constructed (invalid) handle
+}
+
+void RenderGraph::addPass(const std::string& name,  // label; setPassEnabled() looks passes up by it
+                          const std::vector<RGResource>& inputs, const std::vector<RGResource>& outputs,
+                          std::function<void(VkCommandBuffer cmd)> execute) {
+    RGPass pass;
+    pass.name = name;
+    pass.inputs = inputs;               // resources read; topologicalSort() orders this pass after their producer
+    pass.outputs = outputs;             // resources written; makes this pass their producer
+    pass.execute = std::move(execute);  // recording callback invoked by RenderGraph::execute()
+    pass.enabled = true;                // every pass starts enabled; see setPassEnabled()
+    passes_.push_back(std::move(pass));
+    compiled_ = false;  // executionOrder_ predates this pass: force execute() to re-sort before running
+}
+
+void RenderGraph::setPassEnabled(const std::string& name, bool enabled) {
+    // Only the first pass carrying this name is touched; unknown names are ignored.
+    auto match = std::find_if(passes_.begin(), passes_.end(),
+                              [&name](const auto& candidate) { return candidate.name == name; });
+    if (match != passes_.end()) {
+        match->enabled = enabled;
+    }
+}
+
+void RenderGraph::compile() {  // (Re)builds executionOrder_ from the passes added so far.
+    topologicalSort();
+    compiled_ = true;          // lets execute() skip its compile-on-demand fallback
+}
+
+void RenderGraph::topologicalSort() {  // Fills executionOrder_ with pass indices; no pass.enabled check is made here.
+    const uint32_t n = static_cast<uint32_t>(passes_.size());
+    if (n == 0) { executionOrder_.clear(); return; }
+
+    // Dependency rule: if pass A outputs resource R and pass B inputs resource R,
+    // then A must execute before B (edge A -> B).
+    // producer maps resource id -> index of the pass that outputs it (last writer wins).
+    std::unordered_map<uint32_t, uint32_t> producer;
+    for (uint32_t i = 0; i < n; ++i) {
+        for (const auto& out : passes_[i].outputs) {
+            producer[out.id] = i;
+        }
+    }
+
+    // inDegree[i] counts producer edges into pass i; adj[p] lists the passes that consume p's outputs
+    std::vector<uint32_t> inDegree(n, 0);
+    std::vector<std::vector<uint32_t>> adj(n);
+
+    for (uint32_t i = 0; i < n; ++i) {
+        for (const auto& inp : passes_[i].inputs) {
+            auto it = producer.find(inp.id);
+            if (it != producer.end() && it->second != i) {  // skip inputs nobody outputs and self-dependencies
+                adj[it->second].push_back(i);
+                inDegree[i]++;
+            }
+        }
+    }
+
+    // Kahn's algorithm: seed the FIFO with passes that wait on nothing, in insertion order
+    std::queue<uint32_t> queue;
+    for (uint32_t i = 0; i < n; ++i) {
+        if (inDegree[i] == 0) queue.push(i);
+    }
+
+    executionOrder_.clear();
+    executionOrder_.reserve(n);
+
+    while (!queue.empty()) {
+        uint32_t u = queue.front();
+        queue.pop();
+        executionOrder_.push_back(u);
+        for (uint32_t v : adj[u]) {
+            if (--inDegree[v] == 0) queue.push(v);  // v's last outstanding producer has been scheduled
+        }
+    }
+
+    // If not every pass made it into the order there is a cycle: fall back to insertion order
+    if (executionOrder_.size() != n) {
+        LOG_WARNING("RenderGraph: dependency cycle detected, falling back to insertion order");
+        executionOrder_.clear();
+        for (uint32_t i = 0; i < n; ++i) executionOrder_.push_back(i);
+    }
+}
+
+void RenderGraph::execute(VkCommandBuffer cmd) {  // Records every enabled pass into cmd, in executionOrder_.
+    if (!compiled_) {  // compile on demand so a missing compile() call degrades to a warning
+        LOG_WARNING("RenderGraph::execute called without compile()");
+        compile();
+    }
+
+    for (uint32_t idx : executionOrder_) {
+        const auto& pass = passes_[idx];
+        if (!pass.enabled) continue;  // disabled passes emit neither barriers nor commands
+
+        // Insert image barriers declared for this pass (one vkCmdPipelineBarrier for all of them)
+        if (!pass.imageBarriers.empty()) {
+            std::vector<VkImageMemoryBarrier> barriers;
+            barriers.reserve(pass.imageBarriers.size());
+
+            VkPipelineStageFlags srcStages = 0;  // union of every barrier's source stages
+            VkPipelineStageFlags dstStages = 0;  // union of every barrier's destination stages
+
+            for (const auto& b : pass.imageBarriers) {
+                VkImageMemoryBarrier ib{};
+                ib.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+                ib.oldLayout = b.oldLayout;
+                ib.newLayout = b.newLayout;
+                ib.srcAccessMask = b.srcAccess;
+                ib.dstAccessMask = b.dstAccess;
+                ib.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;  // no queue family ownership transfer
+                ib.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+                ib.image = b.image;
+                ib.subresourceRange = {b.aspectMask, 0, 1, 0, 1};  // mip level 0 and array layer 0 only
+                barriers.push_back(ib);
+                srcStages |= b.srcStage;
+                dstStages |= b.dstStage;
+            }
+
+            vkCmdPipelineBarrier(cmd,
+                srcStages, dstStages,
+                0,
+                0, nullptr,
+                0, nullptr,
+                static_cast<uint32_t>(barriers.size()), barriers.data());
+        }
+
+        // Insert buffer barriers declared for this pass (issued as a second, separate barrier command)
+        if (!pass.bufferBarriers.empty()) {
+            std::vector<VkBufferMemoryBarrier> barriers;
+            barriers.reserve(pass.bufferBarriers.size());
+
+            VkPipelineStageFlags srcStages = 0;
+            VkPipelineStageFlags dstStages = 0;
+
+            for (const auto& b : pass.bufferBarriers) {
+                VkBufferMemoryBarrier bb{};
+                bb.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
+                bb.srcAccessMask = b.srcAccess;
+                bb.dstAccessMask = b.dstAccess;
+                bb.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+                bb.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+                bb.buffer = b.buffer;
+                bb.offset = b.offset;
+                bb.size = b.size;
+                barriers.push_back(bb);
+                srcStages |= b.srcStage;
+                dstStages |= b.dstStage;
+            }
+
+            vkCmdPipelineBarrier(cmd,
+                srcStages, dstStages,
+                0,
+                0, nullptr,
+                static_cast<uint32_t>(barriers.size()), barriers.data(),
+                0, nullptr);
+        }
+
+        // Execute the pass; an empty callback is skipped (invoking it would throw std::bad_function_call)
+        if (pass.execute) pass.execute(cmd);
+    }
+}
+
+} // namespace rendering
+} // namespace wowee
diff --git a/src/rendering/renderer.cpp b/src/rendering/renderer.cpp
index 31f4c68c..1daf09cf 100644
--- a/src/rendering/renderer.cpp
+++ b/src/rendering/renderer.cpp
@@ -61,6 +61,7 @@
 #include "rendering/spell_visual_system.hpp"
 #include "rendering/post_process_pipeline.hpp"
 #include "rendering/animation_controller.hpp"
+#include "rendering/render_graph.hpp"
 #include <imgui.h>
 #include <imgui_impl_vulkan.h>
 #include <glm/gtc/matrix_transform.hpp>
@@ -458,7 +459,9 @@ void Renderer::updatePerFrameUBO() {
     }
 
     currentFrameData.lightSpaceMatrix = lightSpaceMatrix;
-    currentFrameData.shadowParams = glm::vec4(shadowsEnabled ? 1.0f : 0.0f, 0.8f, 0.0f, 0.0f);
+    // Scale shadow bias proportionally to ortho extent to avoid acne at close range / gaps at far range
+    float shadowBias = 0.8f * (shadowDistance_ / 300.0f);
+    currentFrameData.shadowParams = glm::vec4(shadowsEnabled ? 1.0f : 0.0f, shadowBias, 0.0f, 0.0f);
 
     // Player water ripple data: pack player XY into shadowParams.zw, ripple strength into fogParams.w
     if (cameraController) {
@@ -563,6 +566,15 @@ bool Renderer::initialize(core::Window* win) {
     postProcessPipeline_ = std::make_unique<PostProcessPipeline>();
     postProcessPipeline_->initialize(vkCtx);
 
+    // Phase 2.5: Create render graph and register virtual resources
+    renderGraph_ = std::make_unique<RenderGraph>();
+    renderGraph_->registerResource("shadow_depth");
+    renderGraph_->registerResource("reflection_texture");
+    renderGraph_->registerResource("cull_visibility");
+    renderGraph_->registerResource("scene_color");
+    renderGraph_->registerResource("scene_depth");
+    renderGraph_->registerResource("final_image");
+
     LOG_INFO("Renderer initialized");
     return true;
 }
@@ -674,6 +686,10 @@ void Renderer::shutdown() {
         postProcessPipeline_->shutdown();
         postProcessPipeline_.reset();
     }
+
+    // Phase 2.5: Destroy render graph
+    renderGraph_.reset();
+
     destroyPerFrameResources();
 
     zoneManager.reset();
@@ -839,36 +855,19 @@ void Renderer::beginFrame() {
     // FSR2 jitter pattern (§4.3 — delegates to PostProcessPipeline)
     if (postProcessPipeline_ && camera) postProcessPipeline_->applyJitter(camera.get());
 
+    // Compute fresh shadow matrix BEFORE UBO update so shaders get current-frame data.
+    lightSpaceMatrix = computeLightSpaceMatrix();
+
     // Update per-frame UBO with current camera/lighting state
     updatePerFrameUBO();
 
-    // --- Off-screen pre-passes (before main render pass) ---
-    // Minimap composite (renders 3x3 tile grid into 768x768 render target)
-    if (minimap && minimap->isEnabled() && camera) {
-        glm::vec3 minimapCenter = camera->getPosition();
-        if (cameraController && cameraController->isThirdPerson())
-            minimapCenter = characterPosition;
-        minimap->compositePass(currentCmd, minimapCenter);
+    // --- Off-screen pre-passes (Phase 2.5: render graph) ---
+    // Build frame graph: registers pre-passes as graph nodes with dependencies.
+    // compile() topologically sorts; execute() runs them with auto barriers.
+    buildFrameGraph(nullptr);
+    if (renderGraph_) {
+        renderGraph_->execute(currentCmd);
     }
-    // World map composite (renders zone tiles into 1024x768 render target)
-    if (worldMap) {
-        worldMap->compositePass(currentCmd);
-    }
-
-    // Character preview composite passes
-    for (auto* preview : activePreviews_) {
-        if (preview && preview->isModelLoaded()) {
-            preview->compositePass(currentCmd, vkCtx->getCurrentFrame());
-        }
-    }
-
-    // Shadow pre-pass (before main render pass)
-    if (shadowsEnabled && shadowDepthImage[0] != VK_NULL_HANDLE) {
-        renderShadowPass();
-    }
-
-    // Water reflection pre-pass (renders scene from mirrored camera into 512x512 texture)
-    renderReflectionPass();
 
     // --- Begin render pass ---
     // Select framebuffer: PP off-screen target or swapchain (§4.3 — PostProcessPipeline)
@@ -3063,17 +3062,10 @@ void Renderer::renderShadowPass() {
 
     // Shadows render every frame — throttling causes visible flicker on player/NPCs
 
-    // Compute and store light space matrix; write to per-frame UBO
-    lightSpaceMatrix = computeLightSpaceMatrix();
+    // lightSpaceMatrix was already computed at frame start (before updatePerFrameUBO).
     // Zero matrix means character position isn't set yet — skip shadow pass entirely.
     if (lightSpaceMatrix == glm::mat4(0.0f)) return;
     uint32_t frame = vkCtx->getCurrentFrame();
-    auto* ubo = reinterpret_cast<GPUPerFrameData*>(perFrameUBOMapped[frame]);
-    if (ubo) {
-        ubo->lightSpaceMatrix = lightSpaceMatrix;
-        ubo->shadowParams.x = shadowsEnabled ? 1.0f : 0.0f;
-        ubo->shadowParams.y = 0.8f;
-    }
 
     // Barrier 1: transition this frame's shadow map into writable depth layout.
     VkImageMemoryBarrier b1{};
@@ -3147,5 +3139,69 @@ void Renderer::renderShadowPass() {
     shadowDepthLayout_[frame] = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
 }
 
+// Phase 2.5: Rebuild the render graph for this frame's off-screen pre-passes.
+// Each pass is a graph node; the resources it lists as inputs/outputs define ordering.
+// Ends with compile(); nothing here executes a pass or declares any barrier.
+void Renderer::buildFrameGraph(game::GameHandler* gameHandler) {
+    (void)gameHandler;  // parameter is currently unused
+    if (!renderGraph_) return;
+
+    renderGraph_->reset();  // drop the passes recorded for the previous build
+
+    auto shadowDepth = renderGraph_->findResource("shadow_depth");
+    auto reflTex = renderGraph_->findResource("reflection_texture");
+    auto cullVis = renderGraph_->findResource("cull_visibility");
+
+    // Minimap composite (no inputs or outputs, so no ordering constraint)
+    renderGraph_->addPass("minimap_composite", {}, {},
+        [this](VkCommandBuffer cmd) {
+            if (minimap && minimap->isEnabled() && camera) {
+                glm::vec3 minimapCenter = camera->getPosition();
+                if (cameraController && cameraController->isThirdPerson())
+                    minimapCenter = characterPosition;  // third person: centre on the character, not the camera
+                minimap->compositePass(cmd, minimapCenter);
+            }
+        });
+
+    // World map composite (standalone)
+    renderGraph_->addPass("worldmap_composite", {}, {},
+        [this](VkCommandBuffer cmd) {
+            if (worldMap) worldMap->compositePass(cmd);
+        });
+
+    // Character preview composites (standalone)
+    renderGraph_->addPass("preview_composite", {}, {},
+        [this](VkCommandBuffer cmd) {
+            uint32_t frame = vkCtx->getCurrentFrame();
+            for (auto* preview : activePreviews_) {
+                if (preview && preview->isModelLoaded())  // skip null entries and previews without a model
+                    preview->compositePass(cmd, frame);
+            }
+        });
+
+    // Shadow pre-pass: outputs shadow_depth
+    renderGraph_->addPass("shadow_pass", {}, {shadowDepth},
+        [this](VkCommandBuffer) {  // graph-supplied command buffer is ignored; renderShadowPass() takes no argument
+            if (shadowsEnabled && shadowDepthImage[0] != VK_NULL_HANDLE)
+                renderShadowPass();
+        });
+    renderGraph_->setPassEnabled("shadow_pass", shadowsEnabled && shadowDepthImage[0] != VK_NULL_HANDLE);  // same test as the callback
+
+    // Reflection pre-pass: outputs reflection_texture (reads scene, so after shadow)
+    renderGraph_->addPass("reflection_pass", {shadowDepth}, {reflTex},
+        [this](VkCommandBuffer) {  // graph-supplied command buffer is ignored here as well
+            renderReflectionPass();
+        });
+
+    // GPU frustum cull compute: outputs cull_visibility (no pass added here consumes it)
+    renderGraph_->addPass("compute_cull", {}, {cullVis},
+        [this](VkCommandBuffer cmd) {
+            if (m2Renderer && camera)
+                m2Renderer->dispatchCullCompute(cmd, vkCtx->getCurrentFrame(), *camera);
+        });
+
+    renderGraph_->compile();  // sort now so the graph is ready for a later execute()
+}
+
 } // namespace rendering
 } // namespace wowee
diff --git a/src/rendering/terrain_renderer.cpp b/src/rendering/terrain_renderer.cpp
index 0de9698a..458714a5 100644
--- a/src/rendering/terrain_renderer.cpp
+++ b/src/rendering/terrain_renderer.cpp
@@ -128,7 +128,7 @@ bool TerrainRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameL
     vertexAttribs[3] = { 3, 0, VK_FORMAT_R32G32_SFLOAT,
         static_cast<uint32_t>(offsetof(pipeline::TerrainVertex, layerUV)) };
 
-    // --- Build fill pipeline ---
+    // --- Build fill pipeline (base for derivatives — shared state optimization) ---
     VkRenderPass mainPass = vkCtx->getImGuiRenderPass();
 
     pipeline = PipelineBuilder()
@@ -143,6 +143,7 @@ bool TerrainRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameL
         .setLayout(pipelineLayout)
         .setRenderPass(mainPass)
         .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR })
+        .setFlags(VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT)
         .build(device, vkCtx->getPipelineCache());
 
     if (!pipeline) {
@@ -152,7 +153,7 @@ bool TerrainRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameL
         return false;
     }
 
-    // --- Build wireframe pipeline ---
+    // --- Build wireframe pipeline (derivative of fill) ---
     wireframePipeline = PipelineBuilder()
         .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
                     fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
@@ -165,6 +166,8 @@ bool TerrainRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameL
         .setLayout(pipelineLayout)
         .setRenderPass(mainPass)
         .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR })
+        .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT)
+        .setBasePipeline(pipeline)
         .build(device, vkCtx->getPipelineCache());
 
     if (!wireframePipeline) {
@@ -190,6 +193,64 @@ bool TerrainRenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameL
         envSizeMBOrDefault("WOWEE_TERRAIN_TEX_CACHE_MB", 4096) * 1024ull * 1024ull;
     LOG_INFO("Terrain texture cache budget: ", textureCacheBudgetBytes_ / (1024 * 1024), " MB");
 
+    // Phase 2.2: Allocate mega vertex/index buffers and indirect draw buffer.
+    // All terrain chunks share these buffers, eliminating per-chunk VB/IB rebinds.
+    {
+        VmaAllocator allocator = vkCtx->getAllocator();
+
+        // Mega vertex buffer (host-visible for direct write during chunk upload)
+        VkBufferCreateInfo vbCI{};
+        vbCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+        vbCI.size = static_cast<VkDeviceSize>(MEGA_VB_MAX_VERTS) * sizeof(pipeline::TerrainVertex);
+        vbCI.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
+        VmaAllocationCreateInfo vbAllocCI{};
+        vbAllocCI.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+        vbAllocCI.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+        VmaAllocationInfo vbInfo{};
+        if (vmaCreateBuffer(allocator, &vbCI, &vbAllocCI,
+                &megaVB_, &megaVBAlloc_, &vbInfo) == VK_SUCCESS) {
+            megaVBMapped_ = vbInfo.pMappedData;
+        } else {
+            LOG_WARNING("TerrainRenderer: mega VB allocation failed, per-chunk fallback");
+        }
+
+        // Mega index buffer
+        VkBufferCreateInfo ibCI{};
+        ibCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+        ibCI.size = static_cast<VkDeviceSize>(MEGA_IB_MAX_INDICES) * sizeof(uint32_t);
+        ibCI.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
+        VmaAllocationCreateInfo ibAllocCI{};
+        ibAllocCI.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+        ibAllocCI.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+        VmaAllocationInfo ibInfo{};
+        if (vmaCreateBuffer(allocator, &ibCI, &ibAllocCI,
+                &megaIB_, &megaIBAlloc_, &ibInfo) == VK_SUCCESS) {
+            megaIBMapped_ = ibInfo.pMappedData;
+        } else {
+            LOG_WARNING("TerrainRenderer: mega IB allocation failed, per-chunk fallback");
+        }
+
+        // Indirect draw command buffer
+        VkBufferCreateInfo indCI{};
+        indCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+        indCI.size = MAX_INDIRECT_DRAWS * sizeof(VkDrawIndexedIndirectCommand);
+        indCI.usage = VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;
+        VmaAllocationCreateInfo indAllocCI{};
+        indAllocCI.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+        indAllocCI.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
+        VmaAllocationInfo indInfo{};
+        if (vmaCreateBuffer(allocator, &indCI, &indAllocCI,
+                &indirectBuffer_, &indirectAlloc_, &indInfo) == VK_SUCCESS) {
+            indirectMapped_ = indInfo.pMappedData;
+        } else {
+            LOG_WARNING("TerrainRenderer: indirect buffer allocation failed");
+        }
+
+        LOG_INFO("Terrain mega buffers: VB=", vbCI.size / (1024*1024), "MB IB=",
+                 ibCI.size / (1024*1024), "MB indirect=",
+                 indCI.size / 1024, "KB");
+    }
+
     LOG_INFO("Terrain renderer initialized (Vulkan)");
     return true;
 }
@@ -232,7 +293,7 @@ void TerrainRenderer::recreatePipelines() {
 
     VkRenderPass mainPass = vkCtx->getImGuiRenderPass();
 
-    // Rebuild fill pipeline
+    // Rebuild fill pipeline (base for derivatives — shared state optimization)
     pipeline = PipelineBuilder()
         .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
                     fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
@@ -245,13 +306,14 @@ void TerrainRenderer::recreatePipelines() {
         .setLayout(pipelineLayout)
         .setRenderPass(mainPass)
         .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR })
+        .setFlags(VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT)
         .build(device, vkCtx->getPipelineCache());
 
     if (!pipeline) {
         LOG_ERROR("TerrainRenderer::recreatePipelines: failed to create fill pipeline");
     }
 
-    // Rebuild wireframe pipeline
+    // Rebuild wireframe pipeline (derivative of fill)
     wireframePipeline = PipelineBuilder()
         .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
                     fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
@@ -264,6 +326,8 @@ void TerrainRenderer::recreatePipelines() {
         .setLayout(pipelineLayout)
         .setRenderPass(mainPass)
         .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR })
+        .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT)
+        .setBasePipeline(pipeline)
         .build(device, vkCtx->getPipelineCache());
 
     if (!wireframePipeline) {
@@ -311,6 +375,13 @@ void TerrainRenderer::shutdown() {
     if (shadowParamsLayout_) { vkDestroyDescriptorSetLayout(device, shadowParamsLayout_, nullptr); shadowParamsLayout_ = VK_NULL_HANDLE; }
     if (shadowParamsUBO_) { vmaDestroyBuffer(allocator, shadowParamsUBO_, shadowParamsAlloc_); shadowParamsUBO_ = VK_NULL_HANDLE; shadowParamsAlloc_ = VK_NULL_HANDLE; }
 
+    // Phase 2.2: Destroy mega buffers and indirect draw buffer
+    if (megaVB_) { vmaDestroyBuffer(allocator, megaVB_, megaVBAlloc_); megaVB_ = VK_NULL_HANDLE; megaVBAlloc_ = VK_NULL_HANDLE; megaVBMapped_ = nullptr; }
+    if (megaIB_) { vmaDestroyBuffer(allocator, megaIB_, megaIBAlloc_); megaIB_ = VK_NULL_HANDLE; megaIBAlloc_ = VK_NULL_HANDLE; megaIBMapped_ = nullptr; }
+    if (indirectBuffer_) { vmaDestroyBuffer(allocator, indirectBuffer_, indirectAlloc_); indirectBuffer_ = VK_NULL_HANDLE; indirectAlloc_ = VK_NULL_HANDLE; indirectMapped_ = nullptr; }
+    megaVBUsed_ = 0;
+    megaIBUsed_ = 0;
+
     vkCtx = nullptr;
 }
 
@@ -537,6 +608,7 @@ TerrainChunkGPU TerrainRenderer::uploadChunk(const pipeline::ChunkMesh& chunk) {
     gpuChunk.worldY = chunk.worldY;
     gpuChunk.worldZ = chunk.worldZ;
     gpuChunk.indexCount = static_cast<uint32_t>(chunk.indices.size());
+    gpuChunk.vertexCount = static_cast<uint32_t>(chunk.vertices.size());
 
     VkDeviceSize vbSize = chunk.vertices.size() * sizeof(pipeline::TerrainVertex);
     AllocatedBuffer vb = uploadBuffer(*vkCtx, chunk.vertices.data(), vbSize,
@@ -550,6 +622,25 @@ TerrainChunkGPU TerrainRenderer::uploadChunk(const pipeline::ChunkMesh& chunk) {
     gpuChunk.indexBuffer = ib.buffer;
     gpuChunk.indexAlloc = ib.allocation;
 
+    // Phase 2.2: Also copy into mega buffers for indirect drawing
+    uint32_t vertCount = static_cast<uint32_t>(chunk.vertices.size());
+    uint32_t idxCount = static_cast<uint32_t>(chunk.indices.size());
+    if (megaVBMapped_ && megaIBMapped_ &&
+        megaVBUsed_ + vertCount <= MEGA_VB_MAX_VERTS &&
+        megaIBUsed_ + idxCount <= MEGA_IB_MAX_INDICES) {
+        // Copy vertices
+        auto* vbDst = static_cast<pipeline::TerrainVertex*>(megaVBMapped_) + megaVBUsed_;
+        std::memcpy(vbDst, chunk.vertices.data(), vertCount * sizeof(pipeline::TerrainVertex));
+        // Copy indices
+        auto* ibDst = static_cast<uint32_t*>(megaIBMapped_) + megaIBUsed_;
+        std::memcpy(ibDst, chunk.indices.data(), idxCount * sizeof(uint32_t));
+
+        gpuChunk.megaBaseVertex = static_cast<int32_t>(megaVBUsed_);
+        gpuChunk.megaFirstIndex = megaIBUsed_;
+        megaVBUsed_ += vertCount;
+        megaIBUsed_ += idxCount;
+    }
+
     return gpuChunk;
 }
 
@@ -789,6 +880,15 @@ void TerrainRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, c
     renderedChunks = 0;
     culledChunks = 0;
 
+    // Phase 2.2: Use mega VB + IB when available.
+    // Bind mega buffers once, then use direct draws with base vertex/index offsets.
+    const bool useMegaBuffers = (megaVB_ && megaIB_);
+    if (useMegaBuffers) {
+        VkDeviceSize megaOffset = 0;
+        vkCmdBindVertexBuffers(cmd, 0, 1, &megaVB_, &megaOffset);
+        vkCmdBindIndexBuffer(cmd, megaIB_, 0, VK_INDEX_TYPE_UINT32);
+    }
+
     for (const auto& chunk : chunks) {
         if (!chunk.isValid() || !chunk.materialSet) continue;
 
@@ -808,11 +908,24 @@ void TerrainRenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, c
         vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout,
                                  1, 1, &chunk.materialSet, 0, nullptr);
 
-        VkDeviceSize offset = 0;
-        vkCmdBindVertexBuffers(cmd, 0, 1, &chunk.vertexBuffer, &offset);
-        vkCmdBindIndexBuffer(cmd, chunk.indexBuffer, 0, VK_INDEX_TYPE_UINT32);
-
-        vkCmdDrawIndexed(cmd, chunk.indexCount, 1, 0, 0, 0);
+        if (useMegaBuffers && chunk.megaBaseVertex >= 0) {
+            // Direct draw from mega buffer — the mega VB/IB are bound on entry to
+            // every iteration (bound before the loop, restored after a fallback).
+            vkCmdDrawIndexed(cmd, chunk.indexCount, 1,
+                             chunk.megaFirstIndex, chunk.megaBaseVertex, 0);
+        } else {
+            // Fallback: mega buffers unavailable or chunk not resident in them —
+            // bind the chunk's own VB/IB and draw from offset 0.
+            VkDeviceSize offset = 0;
+            vkCmdBindVertexBuffers(cmd, 0, 1, &chunk.vertexBuffer, &offset);
+            vkCmdBindIndexBuffer(cmd, chunk.indexBuffer, 0, VK_INDEX_TYPE_UINT32);
+            vkCmdDrawIndexed(cmd, chunk.indexCount, 1, 0, 0, 0);
+            if (useMegaBuffers) {
+                // The binds above replaced the mega VB/IB; restore them so later
+                // mega-resident chunks do not index into this chunk's buffers.
+                vkCmdBindVertexBuffers(cmd, 0, 1, &megaVB_, &offset);
+                vkCmdBindIndexBuffer(cmd, megaIB_, 0, VK_INDEX_TYPE_UINT32);
+            }
+        }
         renderedChunks++;
     }
 
@@ -986,6 +1092,14 @@ void TerrainRenderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& lightSp
     vkCmdPushConstants(cmd, shadowPipelineLayout_, VK_SHADER_STAGE_VERTEX_BIT,
                        0, 128, &push);
 
+    // Phase 2.2: Bind mega buffers once for shadow pass (same as opaque)
+    const bool useMegaShadow = (megaVB_ && megaIB_);
+    if (useMegaShadow) {
+        VkDeviceSize megaOffset = 0;
+        vkCmdBindVertexBuffers(cmd, 0, 1, &megaVB_, &megaOffset);
+        vkCmdBindIndexBuffer(cmd, megaIB_, 0, VK_INDEX_TYPE_UINT32);
+    }
+
     for (const auto& chunk : chunks) {
         if (!chunk.isValid()) continue;
 
@@ -995,10 +1109,22 @@ void TerrainRenderer::renderShadow(VkCommandBuffer cmd, const glm::mat4& lightSp
         float combinedRadius = shadowRadius + chunk.boundingSphereRadius;
         if (distSq > combinedRadius * combinedRadius) continue;
 
-        VkDeviceSize offset = 0;
-        vkCmdBindVertexBuffers(cmd, 0, 1, &chunk.vertexBuffer, &offset);
-        vkCmdBindIndexBuffer(cmd, chunk.indexBuffer, 0, VK_INDEX_TYPE_UINT16);
-        vkCmdDrawIndexed(cmd, chunk.indexCount, 1, 0, 0, 0);
+        if (useMegaShadow && chunk.megaBaseVertex >= 0) {
+            // Mega VB/IB are bound on entry to every iteration (see restore below).
+            vkCmdDrawIndexed(cmd, chunk.indexCount, 1, chunk.megaFirstIndex, chunk.megaBaseVertex, 0);
+        } else {
+            // Fallback: per-chunk VB/IB bind + direct draw.
+            VkDeviceSize offset = 0;
+            vkCmdBindVertexBuffers(cmd, 0, 1, &chunk.vertexBuffer, &offset);
+            vkCmdBindIndexBuffer(cmd, chunk.indexBuffer, 0, VK_INDEX_TYPE_UINT32);
+            vkCmdDrawIndexed(cmd, chunk.indexCount, 1, 0, 0, 0);
+            if (useMegaShadow) {
+                // The binds above replaced the mega VB/IB; restore them so later
+                // mega-resident chunks do not index into this chunk's buffers.
+                vkCmdBindVertexBuffers(cmd, 0, 1, &megaVB_, &offset);
+                vkCmdBindIndexBuffer(cmd, megaIB_, 0, VK_INDEX_TYPE_UINT32);
+            }
+        }
     }
 }
 
diff --git a/src/rendering/vk_context.cpp b/src/rendering/vk_context.cpp
index c2a37415..4a5d6366 100644
--- a/src/rendering/vk_context.cpp
+++ b/src/rendering/vk_context.cpp
@@ -334,7 +334,7 @@ bool VkContext::selectPhysicalDevice() {
 
     VkPhysicalDeviceProperties props;
     vkGetPhysicalDeviceProperties(physicalDevice, &props);
-    uint32_t apiVersion = props.apiVersion;
+    (void)props.apiVersion; // Available if needed for version checks
     gpuVendorId_ = props.vendorID;
     std::strncpy(gpuName_, props.deviceName, sizeof(gpuName_) - 1);
     gpuName_[sizeof(gpuName_) - 1] = '\0';
diff --git a/src/rendering/vk_pipeline.cpp b/src/rendering/vk_pipeline.cpp
index 2a95bd8b..e5c32e6c 100644
--- a/src/rendering/vk_pipeline.cpp
+++ b/src/rendering/vk_pipeline.cpp
@@ -111,6 +111,17 @@ PipelineBuilder& PipelineBuilder::setDynamicStates(const std::vector<VkDynamicSt
     return *this;
 }
 
+// Pipeline derivatives — hint driver to share compiled state between similar pipelines
+PipelineBuilder& PipelineBuilder::setFlags(VkPipelineCreateFlags flags) {
+    flags_ = flags;
+    return *this;
+}
+
+PipelineBuilder& PipelineBuilder::setBasePipeline(VkPipeline basePipeline) {
+    basePipelineHandle_ = basePipeline;
+    return *this;
+}
+
 VkPipeline PipelineBuilder::build(VkDevice device, VkPipelineCache cache) const {
     // Vertex input
     VkPipelineVertexInputStateCreateInfo vertexInput{};
@@ -188,6 +199,15 @@ VkPipeline PipelineBuilder::build(VkDevice device, VkPipelineCache cache) const
     pipelineInfo.pColorBlendState = colorBlendAttachments_.empty() ? nullptr : &colorBlending;
     pipelineInfo.pDynamicState = dynamicStates_.empty() ? nullptr : &dynamicState;
     pipelineInfo.layout = pipelineLayout_;
+    // Vulkan requires a valid basePipelineHandle whenever DERIVATIVE_BIT is set with
+    // basePipelineIndex == -1. If the caller's base pipeline failed to build (null
+    // handle), drop the bit and create a standalone pipeline instead.
+    const bool hasBasePipeline = (basePipelineHandle_ != VK_NULL_HANDLE);
+    pipelineInfo.flags = hasBasePipeline
+        ? flags_
+        : (flags_ & ~static_cast<VkPipelineCreateFlags>(VK_PIPELINE_CREATE_DERIVATIVE_BIT));
+    pipelineInfo.basePipelineHandle = basePipelineHandle_;
+    pipelineInfo.basePipelineIndex = -1;
     pipelineInfo.renderPass = renderPass_;
     pipelineInfo.subpass = subpass_;
 
diff --git a/src/rendering/wmo_renderer.cpp b/src/rendering/wmo_renderer.cpp
index 79c830ee..32996c31 100644
--- a/src/rendering/wmo_renderer.cpp
+++ b/src/rendering/wmo_renderer.cpp
@@ -169,7 +169,7 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou
     vertexAttribs[4] = { 4, 0, VK_FORMAT_R32G32B32A32_SFLOAT,
         static_cast<uint32_t>(offsetof(WMOVertexData, tangent)) };
 
-    // --- Build opaque pipeline ---
+    // --- Build opaque pipeline (base for derivatives — shared state optimization) ---
     VkRenderPass mainPass = vkCtx_->getImGuiRenderPass();
 
     opaquePipeline_ = PipelineBuilder()
@@ -184,6 +184,7 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou
         .setLayout(pipelineLayout_)
         .setRenderPass(mainPass)
         .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR })
+        .setFlags(VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT)
         .build(device, vkCtx_->getPipelineCache());
 
     if (!opaquePipeline_) {
@@ -193,7 +194,7 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou
         return false;
     }
 
-    // --- Build transparent pipeline ---
+    // --- Build transparent pipeline (derivative of opaque) ---
     transparentPipeline_ = PipelineBuilder()
         .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
                     fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
@@ -206,13 +207,15 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou
         .setLayout(pipelineLayout_)
         .setRenderPass(mainPass)
         .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR })
+        .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT)
+        .setBasePipeline(opaquePipeline_)
         .build(device, vkCtx_->getPipelineCache());
 
     if (!transparentPipeline_) {
         core::Logger::getInstance().warning("WMORenderer: transparent pipeline not available");
     }
 
-    // --- Build glass pipeline (alpha blend WITH depth write for windows) ---
+    // --- Build glass pipeline (derivative — alpha blend WITH depth write for windows) ---
     glassPipeline_ = PipelineBuilder()
         .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
                     fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
@@ -225,9 +228,11 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou
         .setLayout(pipelineLayout_)
         .setRenderPass(mainPass)
         .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR })
+        .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT)
+        .setBasePipeline(opaquePipeline_)
         .build(device, vkCtx_->getPipelineCache());
 
-    // --- Build wireframe pipeline ---
+    // --- Build wireframe pipeline (derivative of opaque) ---
     wireframePipeline_ = PipelineBuilder()
         .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
                     fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
@@ -240,6 +245,8 @@ bool WMORenderer::initialize(VkContext* ctx, VkDescriptorSetLayout perFrameLayou
         .setLayout(pipelineLayout_)
         .setRenderPass(mainPass)
         .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR })
+        .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT)
+        .setBasePipeline(opaquePipeline_)
         .build(device, vkCtx_->getPipelineCache());
 
     if (!wireframePipeline_) {
@@ -1434,7 +1441,7 @@ void WMORenderer::render(VkCommandBuffer cmd, VkDescriptorSet perFrameSet, const
                 if (doDistanceCull) {
                     glm::vec3 closestPoint = glm::clamp(camPos, gMin, gMax);
                     float distSq = glm::dot(closestPoint - camPos, closestPoint - camPos);
-                    if (distSq > 250000.0f) {
+                    if (distSq > 1440000.0f) { // 1200 units — matches terrain view distance
                         result.distanceCulled++;
                         continue;
                     }
@@ -3733,6 +3740,7 @@ void WMORenderer::recreatePipelines() {
 
     VkRenderPass mainPass = vkCtx_->getImGuiRenderPass();
 
+    // Pipeline derivatives — opaque is the base, others derive for shared state optimization
     opaquePipeline_ = PipelineBuilder()
         .setShaders(vertShader.stageInfo(VK_SHADER_STAGE_VERTEX_BIT),
                     fragShader.stageInfo(VK_SHADER_STAGE_FRAGMENT_BIT))
@@ -3745,6 +3753,7 @@ void WMORenderer::recreatePipelines() {
         .setLayout(pipelineLayout_)
         .setRenderPass(mainPass)
         .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR })
+        .setFlags(VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT)
         .build(device, vkCtx_->getPipelineCache());
 
     transparentPipeline_ = PipelineBuilder()
@@ -3759,6 +3768,8 @@ void WMORenderer::recreatePipelines() {
         .setLayout(pipelineLayout_)
         .setRenderPass(mainPass)
         .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR })
+        .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT)
+        .setBasePipeline(opaquePipeline_)
         .build(device, vkCtx_->getPipelineCache());
 
     glassPipeline_ = PipelineBuilder()
@@ -3773,6 +3784,8 @@ void WMORenderer::recreatePipelines() {
         .setLayout(pipelineLayout_)
         .setRenderPass(mainPass)
         .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR })
+        .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT)
+        .setBasePipeline(opaquePipeline_)
         .build(device, vkCtx_->getPipelineCache());
 
     wireframePipeline_ = PipelineBuilder()
@@ -3787,6 +3800,8 @@ void WMORenderer::recreatePipelines() {
         .setLayout(pipelineLayout_)
         .setRenderPass(mainPass)
         .setDynamicStates({ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR })
+        .setFlags(VK_PIPELINE_CREATE_DERIVATIVE_BIT)
+        .setBasePipeline(opaquePipeline_)
         .build(device, vkCtx_->getPipelineCache());
 
     vertShader.destroy();
diff --git a/tools/asset_extract/extractor.cpp b/tools/asset_extract/extractor.cpp
index 3c61bef3..d79d4671 100644
--- a/tools/asset_extract/extractor.cpp
+++ b/tools/asset_extract/extractor.cpp
@@ -537,20 +537,6 @@ static std::vector<ArchiveDesc> discoverArchives(const std::string& mpqDir,
     return result;
 }
 
-// Read a text file into a vector of lines (for external listfile loading)
-static std::vector<std::string> readLines(const std::string& path) {
-    std::vector<std::string> lines;
-    std::ifstream f(path);
-    if (!f) return lines;
-    std::string line;
-    while (std::getline(f, line)) {
-        // Trim trailing \r
-        if (!line.empty() && line.back() == '\r') line.pop_back();
-        if (!line.empty()) lines.push_back(std::move(line));
-    }
-    return lines;
-}
-
 // Extract the (listfile) from an MPQ archive into a set of filenames
 static void extractInternalListfile(HANDLE hMpq, std::set<std::string>& out) {
     HANDLE hFile = nullptr;
@@ -595,14 +581,9 @@ bool Extractor::enumerateFiles(const Options& opts,
 
     std::cout << "Found " << archives.size() << " MPQ archives\n";
 
-    // Load external listfile into memory once (avoids repeated file I/O)
-    std::vector<std::string> externalEntries;
-    std::vector<const char*> externalPtrs;
-    if (!opts.listFile.empty()) {
-        externalEntries = readLines(opts.listFile);
-        externalPtrs.reserve(externalEntries.size());
-        for (const auto& e : externalEntries) externalPtrs.push_back(e.c_str());
-        std::cout << "  Loaded external listfile: " << externalEntries.size() << " entries\n";
+    const bool haveExternalListFile = !opts.listFile.empty();
+    if (haveExternalListFile) {
+        std::cout << "  Using external listfile: " << opts.listFile << "\n";
     }
 
     const auto wantedDbcs = buildWantedDbcSet(opts);
@@ -616,12 +597,11 @@ bool Extractor::enumerateFiles(const Options& opts,
             continue;
         }
 
-        // Inject external listfile entries into archive's in-memory name table.
-        // SFileAddListFileEntries is fast — it only hashes the names against the
-        // archive's hash table, no file I/O involved.
-        if (!externalPtrs.empty()) {
-            SFileAddListFileEntries(hMpq, externalPtrs.data(),
-                                   static_cast<DWORD>(externalPtrs.size()));
+        // Inject external listfile into archive's in-memory name table.
+        // SFileAddListFile reads the file and hashes names against the
+        // archive's hash table.
+        if (haveExternalListFile) {
+            SFileAddListFile(hMpq, opts.listFile.c_str());
         }
 
         if (opts.verbose) {