Merge per-chunk water surfaces, restore incremental tile finalization, and pin main thread CPU affinity

Water deduplication: merge per-chunk water surfaces into per-tile surfaces to reduce Vulkan descriptor set usage from ~8900 to ~100-200. Use a hybrid approach — groups with ≤4 chunks stay per-chunk (preserving shore detail), while larger groups merge into 128×128 tile-wide surfaces. Re-add the incremental tile finalization state machine (reverted in 9b90ab0) to spread GPU uploads across frames and prevent stuttering in cities. Pin the main thread to CPU core 0 and exclude the terrain worker threads from core 0 to reduce scheduling jitter on the render/game loop.
2026-05-08 10:03:51 +00:00 · 2026-02-25 03:39:45 -08:00 · 2026-02-25 03:39:45 -08:00 · 86505ad377
commit 86505ad377
parent 7ca9caa212
5 changed files with 629 additions and 314 deletions
--- a/include/rendering/terrain_manager.hpp
+++ b/include/rendering/terrain_manager.hpp
@ -123,6 +123,41 @@ struct PendingTile {
    std::unordered_map<std::string, pipeline::BLPImage> preloadedTextures;
 };

+/**
+ * Phases of incremental tile finalization, listed in the order advanceFinalization() steps through them (one bounded unit of work per call)
+ */
+enum class FinalizationPhase {
+    TERRAIN,        // Upload preloaded textures + terrain mesh, then load terrain water; an already loaded/failed tile or a failed upload jumps straight to DONE
+    M2_MODELS,      // Upload ONE M2 model per call; stays in this phase until every model has been attempted
+    M2_INSTANCES,   // Create all M2 instances in one call, skipping placements whose uniqueId is already in placedDoodadIds
+    WMO_MODELS,     // Upload ONE WMO model per call; entries whose uniqueId is already in placedWmoIds are skipped
+    WMO_INSTANCES,  // Create all WMO instances + load WMO liquids for each created instance
+    WMO_DOODADS,    // Upload ONE WMO doodad M2 per call and create its instance
+    WATER,          // Generate water ambient emitters from the tile's water data (terrain water itself is loaded in TERRAIN)
+    AMBIENT,        // Register ambient emitters, commit tile to loadedTiles, remove it from pendingTiles
+    DONE            // Fully finalized (or abandoned early); advanceFinalization() returns true
+};
+
+/**
+ * In-progress tile finalization state — tracks progress across frames; the accumulated ID lists are moved into the TerrainTile on commit, or used by unloadTile() to roll back a partially finalized tile
+ */
+struct FinalizingTile {
+    std::shared_ptr<PendingTile> pending;  // Prepared tile data being finalized; set by the caller before the first advanceFinalization() call
+    FinalizationPhase phase = FinalizationPhase::TERRAIN;  // Phase the next advanceFinalization() call will run
+
+    // Progress indices within the one-item-per-call phases
+    size_t m2ModelIndex = 0;       // Index into pending->m2Models of the next M2 model to upload
+    size_t wmoModelIndex = 0;      // Index into pending->wmoModels of the next WMO model to upload
+    size_t wmoDoodadIndex = 0;     // Index into pending->wmoDoodads of the next WMO doodad to upload
+
+    // Accumulated results (built up across phases)
+    std::vector<uint32_t> m2InstanceIds;  // M2 instances created so far (terrain doodads and WMO doodads)
+    std::vector<uint32_t> wmoInstanceIds;  // WMO instances created so far
+    std::vector<uint32_t> tileUniqueIds;  // Doodad uniqueIds this tile inserted into placedDoodadIds
+    std::vector<uint32_t> tileWmoUniqueIds;  // WMO uniqueIds this tile inserted into placedWmoIds
+    std::unordered_set<uint32_t> uploadedM2ModelIds;  // Model IDs whose loadModel() succeeded during M2_MODELS
+};
+
 /**
 * Terrain manager for multi-tile terrain streaming
 *
@ -219,8 +254,8 @@ public:
    int getLoadedTileCount() const { return static_cast<int>(loadedTiles.size()); }
    int getPendingTileCount() const { return static_cast<int>(pendingTiles.size()); }
    int getReadyQueueCount() const { return static_cast<int>(readyQueue.size()); }
-    /** Total unfinished tiles (worker threads + ready queue) */
-    int getRemainingTileCount() const { return static_cast<int>(pendingTiles.size() + readyQueue.size()); }
+    /** Total unfinished tiles (worker threads + ready queue + finalizing) */
+    int getRemainingTileCount() const { return static_cast<int>(pendingTiles.size() + readyQueue.size() + finalizingTiles_.size()); }
    TileCoord getCurrentTile() const { return currentTile; }

    /** Process all ready tiles immediately (use during loading screens) */
@ -254,9 +289,10 @@ private:
    std::shared_ptr<PendingTile> prepareTile(int x, int y);

    /**
-     * Main thread: upload prepared tile data to GPU
+     * Advance incremental finalization of a tile (one bounded unit of work).
+     * Returns true when the tile is fully finalized (phase == DONE).
     */
-    void finalizeTile(const std::shared_ptr<PendingTile>& pending);
+    bool advanceFinalization(FinalizingTile& ft);

    /**
     * Background worker thread loop
@ -341,16 +377,8 @@ private:
    // Dedup set for WMO placements across tile boundaries (prevents rendering Stormwind 16x)
    std::unordered_set<uint32_t> placedWmoIds;

-    // Progressive M2 upload queue (spread heavy uploads across frames)
-    struct PendingM2Upload {
-        uint32_t modelId;
-        pipeline::M2Model model;
-        std::string path;
-    };
-    std::queue<PendingM2Upload> m2UploadQueue_;
-    static constexpr int MAX_M2_UPLOADS_PER_FRAME = 5;  // Upload up to 5 models per frame
-
-    void processM2UploadQueue();
+    // Tiles currently being incrementally finalized across frames
+    std::deque<FinalizingTile> finalizingTiles_;

    struct GroundEffectEntry {
        std::array<uint32_t, 4> doodadIds{{0, 0, 0, 0}};
--- a/include/rendering/water_renderer.hpp
+++ b/include/rendering/water_renderer.hpp
@ -160,7 +160,7 @@ private:
    VkDescriptorSetLayout sceneSetLayout = VK_NULL_HANDLE;
    VkDescriptorPool sceneDescPool = VK_NULL_HANDLE;
    VkDescriptorSet sceneSet = VK_NULL_HANDLE;
-    static constexpr uint32_t MAX_WATER_SETS = 2048;
+    static constexpr uint32_t MAX_WATER_SETS = 16384;

    VkSampler sceneColorSampler = VK_NULL_HANDLE;
    VkSampler sceneDepthSampler = VK_NULL_HANDLE;
--- a/src/core/application.cpp
+++ b/src/core/application.cpp
@ -55,6 +55,12 @@
 #include <set>
 #include <filesystem>

+#include <thread>
+#ifdef __linux__
+#include <sched.h>
+#include <pthread.h>
+#endif
+
 namespace wowee {
 namespace core {

@ -230,6 +236,26 @@ bool Application::initialize() {

 void Application::run() {
    LOG_INFO("Starting main loop");
+
+    // Pin main thread to a dedicated CPU core to reduce scheduling jitter
+#ifdef __linux__
+    {
+        int numCores = static_cast<int>(std::thread::hardware_concurrency());
+        if (numCores >= 2) {
+            // Use core 0 for the main thread (typically the highest-clocked core)
+            cpu_set_t cpuset;
+            CPU_ZERO(&cpuset);
+            CPU_SET(0, &cpuset);
+            int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+            if (rc == 0) {
+                LOG_INFO("Main thread pinned to CPU core 0 (", numCores, " cores available)");
+            } else {
+                LOG_WARNING("Failed to pin main thread to CPU core 0 (error ", rc, ")");
+            }
+        }
+    }
+#endif
+
    const bool frameProfileEnabled = envFlagEnabled("WOWEE_FRAME_PROFILE", false);
    if (frameProfileEnabled) {
        LOG_INFO("Frame timing profile enabled (WOWEE_FRAME_PROFILE=1)");
--- a/src/rendering/terrain_manager.cpp
+++ b/src/rendering/terrain_manager.cpp
@ -22,6 +22,11 @@
 #include <functional>
 #include <unordered_set>

+#ifdef __linux__
+#include <sched.h>
+#include <pthread.h>
+#endif
+
 namespace wowee {
 namespace rendering {

@ -226,7 +231,9 @@ bool TerrainManager::loadTile(int x, int y) {
        return false;
    }

-    finalizeTile(pending);
+    FinalizingTile ft;
+    ft.pending = std::move(pending);
+    while (!advanceFinalization(ft)) {}
    return true;
 }

@ -648,176 +655,157 @@ void TerrainManager::logMissingAdtOnce(const std::string& adtPath) {
    }
 }

-void TerrainManager::finalizeTile(const std::shared_ptr<PendingTile>& pending) {
+bool TerrainManager::advanceFinalization(FinalizingTile& ft) {
+    auto& pending = ft.pending;
    int x = pending->coord.x;
    int y = pending->coord.y;
    TileCoord coord = pending->coord;

-    LOG_DEBUG("Finalizing tile [", x, ",", y, "] (GPU upload)");
+    switch (ft.phase) {

-    // Check if tile was already loaded (race condition guard) or failed
-    if (loadedTiles.find(coord) != loadedTiles.end()) {
-        return;
-    }
-    if (failedTiles.find(coord) != failedTiles.end()) {
-        return;
-    }
-
-    // Upload pre-loaded textures to the GL cache so loadTerrain avoids file I/O
-    if (!pending->preloadedTextures.empty()) {
-        terrainRenderer->uploadPreloadedTextures(pending->preloadedTextures);
-    }
-
-    // Upload terrain to GPU
-    if (!terrainRenderer->loadTerrain(pending->mesh, pending->terrain.textures, x, y)) {
-        LOG_ERROR("Failed to upload terrain to GPU for tile [", x, ",", y, "]");
-        failedTiles[coord] = true;
-        return;
-    }
-
-    // Load water
-    if (waterRenderer) {
-        waterRenderer->loadFromTerrain(pending->terrain, true, x, y);
-    }
-
-    // Register water surface ambient sound emitters
-    if (ambientSoundManager) {
-        // Scan ADT water data for water surfaces
-        int waterEmitterCount = 0;
-        for (size_t chunkIdx = 0; chunkIdx < pending->terrain.waterData.size(); chunkIdx++) {
-            const auto& chunkWater = pending->terrain.waterData[chunkIdx];
-            if (!chunkWater.hasWater()) continue;
-
-            // Calculate chunk position in world coordinates
-            int chunkX = chunkIdx % 16;
-            int chunkY = chunkIdx / 16;
-
-            // WoW coordinates: Each ADT tile is 533.33 units, each chunk is 533.33/16 = 33.333 units
-            // Tile origin in GL space
-            float tileOriginX = (32.0f - x) * 533.33333f;
-            float tileOriginY = (32.0f - y) * 533.33333f;
-
-            // Chunk center position
-            float chunkCenterX = tileOriginX + (chunkX + 0.5f) * 33.333333f;
-            float chunkCenterY = tileOriginY + (chunkY + 0.5f) * 33.333333f;
-
-            // Use first layer for height and type detection
-            if (!chunkWater.layers.empty()) {
-                const auto& layer = chunkWater.layers[0];
-                float waterHeight = layer.minHeight;
-
-                // Determine water type and register appropriate emitter
-                // liquidType: 0=water/lake, 1=ocean, 2=magma, 3=slime
-                if (layer.liquidType == 0) {
-                    // Lake/river water - add water surface emitter every 32 chunks to avoid spam
-                    if (chunkIdx % 32 == 0) {
-                        PendingTile::AmbientEmitter emitter;
-                        emitter.position = glm::vec3(chunkCenterX, chunkCenterY, waterHeight);
-                        emitter.type = 4;  // WATER_SURFACE
-                        pending->ambientEmitters.push_back(emitter);
-                        waterEmitterCount++;
-                    }
-                } else if (layer.liquidType == 1) {
-                    // Ocean - add ocean emitter every 64 chunks (oceans are very large)
-                    if (chunkIdx % 64 == 0) {
-                        PendingTile::AmbientEmitter emitter;
-                        emitter.position = glm::vec3(chunkCenterX, chunkCenterY, waterHeight);
-                        emitter.type = 4;  // WATER_SURFACE (could add separate OCEAN type later)
-                        pending->ambientEmitters.push_back(emitter);
-                        waterEmitterCount++;
-                    }
-                }
-                // Skip magma and slime for now (no ambient sounds for those)
+    case FinalizationPhase::TERRAIN: {
+        // Check if tile was already loaded or failed
+        if (loadedTiles.find(coord) != loadedTiles.end() || failedTiles.find(coord) != failedTiles.end()) {
+            {
+                std::lock_guard<std::mutex> lock(queueMutex);
+                pendingTiles.erase(coord);
            }
+            ft.phase = FinalizationPhase::DONE;
+            return true;
        }
-        if (waterEmitterCount > 0) {
+
+        LOG_DEBUG("Finalizing tile [", x, ",", y, "] (incremental)");
+
+        // Upload pre-loaded textures
+        if (!pending->preloadedTextures.empty()) {
+            terrainRenderer->uploadPreloadedTextures(pending->preloadedTextures);
        }
+
+        // Upload terrain mesh to GPU
+        if (!terrainRenderer->loadTerrain(pending->mesh, pending->terrain.textures, x, y)) {
+            LOG_ERROR("Failed to upload terrain to GPU for tile [", x, ",", y, "]");
+            failedTiles[coord] = true;
+            {
+                std::lock_guard<std::mutex> lock(queueMutex);
+                pendingTiles.erase(coord);
+            }
+            ft.phase = FinalizationPhase::DONE;
+            return true;
+        }
+
+        // Load water immediately after terrain (same frame) — water is now
+        // deduplicated to ~1-2 merged surfaces per tile, so this is fast.
+        if (waterRenderer) {
+            waterRenderer->loadFromTerrain(pending->terrain, true, x, y);
+        }
+
+        // Ensure M2 renderer has asset manager
+        if (m2Renderer && assetManager) {
+            m2Renderer->initialize(nullptr, VK_NULL_HANDLE, assetManager);
+        }
+
+        ft.phase = FinalizationPhase::M2_MODELS;
+        return false;
    }

-    std::vector<uint32_t> m2InstanceIds;
-    std::vector<uint32_t> wmoInstanceIds;
-    std::vector<uint32_t> tileUniqueIds;
-    std::vector<uint32_t> tileWmoUniqueIds;
-
-    // Upload M2 models to GPU and create instances
-    if (m2Renderer && assetManager) {
-        // Always pass the latest asset manager. initialize() is idempotent and updates
-        // the pointer even when the renderer was initialized earlier without assets.
-        m2Renderer->initialize(nullptr, VK_NULL_HANDLE, assetManager);
-
-        // Upload M2 models immediately (batching was causing hangs)
-        // The 5ms time budget in processReadyTiles() limits the spike
-        std::unordered_set<uint32_t> uploadedModelIds;
-        for (auto& m2Ready : pending->m2Models) {
+    case FinalizationPhase::M2_MODELS: {
+        // Upload ONE M2 model per call
+        if (m2Renderer && ft.m2ModelIndex < pending->m2Models.size()) {
+            auto& m2Ready = pending->m2Models[ft.m2ModelIndex];
            if (m2Renderer->loadModel(m2Ready.model, m2Ready.modelId)) {
-                uploadedModelIds.insert(m2Ready.modelId);
+                ft.uploadedM2ModelIds.insert(m2Ready.modelId);
+            }
+            ft.m2ModelIndex++;
+            // Stay in this phase until all models uploaded
+            if (ft.m2ModelIndex < pending->m2Models.size()) {
+                return false;
            }
        }
-        if (!uploadedModelIds.empty()) {
-            LOG_DEBUG("  Uploaded ", uploadedModelIds.size(), " M2 models for tile [", x, ",", y, "]");
+        if (!ft.uploadedM2ModelIds.empty()) {
+            LOG_DEBUG("  Uploaded ", ft.uploadedM2ModelIds.size(), " M2 models for tile [", x, ",", y, "]");
        }
-
-        // Create instances (deduplicate by uniqueId across tile boundaries)
-        int loadedDoodads = 0;
-        int skippedDedup = 0;
-        for (const auto& p : pending->m2Placements) {
-            // Skip if this doodad was already placed by a neighboring tile
-            if (p.uniqueId != 0 && placedDoodadIds.count(p.uniqueId)) {
-                skippedDedup++;
-                continue;
-            }
-            uint32_t instId = m2Renderer->createInstance(p.modelId, p.position, p.rotation, p.scale);
-            if (instId) {
-                m2InstanceIds.push_back(instId);
-                if (p.uniqueId != 0) {
-                    placedDoodadIds.insert(p.uniqueId);
-                    tileUniqueIds.push_back(p.uniqueId);
-                }
-                loadedDoodads++;
-            }
-        }
-
-        LOG_DEBUG("  Loaded doodads for tile [", x, ",", y, "]: ",
-                 loadedDoodads, " instances (", uploadedModelIds.size(), " new models, ",
-                 skippedDedup, " dedup skipped)");
+        ft.phase = FinalizationPhase::M2_INSTANCES;
+        return false;
    }

-    // Upload WMO models to GPU and create instances
-    if (wmoRenderer && assetManager) {
-        // WMORenderer may be initialized before assets are ready; always re-pass assets.
-        wmoRenderer->initialize(nullptr, VK_NULL_HANDLE, assetManager);
-
-        int loadedWMOs = 0;
-        int loadedLiquids = 0;
-        int skippedWmoDedup = 0;
-        for (auto& wmoReady : pending->wmoModels) {
-            // Deduplicate by placement uniqueId when available.
-            // Some ADTs use uniqueId=0, which is not safe for dedup.
-            if (wmoReady.uniqueId != 0 && placedWmoIds.count(wmoReady.uniqueId)) {
-                skippedWmoDedup++;
-                continue;
+    case FinalizationPhase::M2_INSTANCES: {
+        // Create all M2 instances (lightweight struct allocation, no GPU work)
+        if (m2Renderer) {
+            int loadedDoodads = 0;
+            int skippedDedup = 0;
+            for (const auto& p : pending->m2Placements) {
+                if (p.uniqueId != 0 && placedDoodadIds.count(p.uniqueId)) {
+                    skippedDedup++;
+                    continue;
+                }
+                uint32_t instId = m2Renderer->createInstance(p.modelId, p.position, p.rotation, p.scale);
+                if (instId) {
+                    ft.m2InstanceIds.push_back(instId);
+                    if (p.uniqueId != 0) {
+                        placedDoodadIds.insert(p.uniqueId);
+                        ft.tileUniqueIds.push_back(p.uniqueId);
+                    }
+                    loadedDoodads++;
+                }
            }
+            LOG_DEBUG("  Loaded doodads for tile [", x, ",", y, "]: ",
+                     loadedDoodads, " instances (", ft.uploadedM2ModelIds.size(), " new models, ",
+                     skippedDedup, " dedup skipped)");
+        }
+        ft.phase = FinalizationPhase::WMO_MODELS;
+        return false;
+    }
+
+    case FinalizationPhase::WMO_MODELS: {
+        // Upload ONE WMO model per call
+        if (wmoRenderer && assetManager) {
+            wmoRenderer->initialize(nullptr, VK_NULL_HANDLE, assetManager);
+
+            if (ft.wmoModelIndex < pending->wmoModels.size()) {
+                auto& wmoReady = pending->wmoModels[ft.wmoModelIndex];
+                // Deduplicate
+                if (wmoReady.uniqueId != 0 && placedWmoIds.count(wmoReady.uniqueId)) {
+                    ft.wmoModelIndex++;
+                    if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
+                } else {
+                    wmoRenderer->loadModel(wmoReady.model, wmoReady.modelId);
+                    ft.wmoModelIndex++;
+                    if (ft.wmoModelIndex < pending->wmoModels.size()) return false;
+                }
+            }
+        }
+        ft.phase = FinalizationPhase::WMO_INSTANCES;
+        return false;
+    }
+
+    case FinalizationPhase::WMO_INSTANCES: {
+        // Create all WMO instances + load WMO liquids
+        if (wmoRenderer) {
+            int loadedWMOs = 0;
+            int loadedLiquids = 0;
+            int skippedWmoDedup = 0;
+            for (auto& wmoReady : pending->wmoModels) {
+                if (wmoReady.uniqueId != 0 && placedWmoIds.count(wmoReady.uniqueId)) {
+                    skippedWmoDedup++;
+                    continue;
+                }

-            if (wmoRenderer->loadModel(wmoReady.model, wmoReady.modelId)) {
                uint32_t wmoInstId = wmoRenderer->createInstance(wmoReady.modelId, wmoReady.position, wmoReady.rotation);
                if (wmoInstId) {
-                    wmoInstanceIds.push_back(wmoInstId);
+                    ft.wmoInstanceIds.push_back(wmoInstId);
                    if (wmoReady.uniqueId != 0) {
                        placedWmoIds.insert(wmoReady.uniqueId);
-                        tileWmoUniqueIds.push_back(wmoReady.uniqueId);
+                        ft.tileWmoUniqueIds.push_back(wmoReady.uniqueId);
                    }
                    loadedWMOs++;

                    // Load WMO liquids (canals, pools, etc.)
                    if (waterRenderer) {
-                        // Compute the same model matrix as WMORenderer uses
                        glm::mat4 modelMatrix = glm::mat4(1.0f);
                        modelMatrix = glm::translate(modelMatrix, wmoReady.position);
                        modelMatrix = glm::rotate(modelMatrix, wmoReady.rotation.z, glm::vec3(0.0f, 0.0f, 1.0f));
                        modelMatrix = glm::rotate(modelMatrix, wmoReady.rotation.y, glm::vec3(0.0f, 1.0f, 0.0f));
                        modelMatrix = glm::rotate(modelMatrix, wmoReady.rotation.x, glm::vec3(1.0f, 0.0f, 0.0f));
-
-                        // Load liquids from each WMO group
                        for (const auto& group : wmoReady.model.groups) {
                            if (group.liquid.hasLiquid()) {
                                waterRenderer->loadFromWMO(group.liquid, modelMatrix, wmoInstId);
@ -827,60 +815,126 @@ void TerrainManager::finalizeTile(const std::shared_ptr<PendingTile>& pending) {
                    }
                }
            }
+            if (loadedWMOs > 0 || skippedWmoDedup > 0) {
+                LOG_DEBUG("  Loaded WMOs for tile [", x, ",", y, "]: ",
+                         loadedWMOs, " instances, ", skippedWmoDedup, " dedup skipped");
+            }
+            if (loadedLiquids > 0) {
+                LOG_DEBUG("  Loaded WMO liquids for tile [", x, ",", y, "]: ", loadedLiquids);
+            }
        }
-        if (loadedWMOs > 0 || skippedWmoDedup > 0) {
-            LOG_DEBUG("  Loaded WMOs for tile [", x, ",", y, "]: ",
-                     loadedWMOs, " instances, ", skippedWmoDedup, " dedup skipped");
-        }
-        if (loadedLiquids > 0) {
-            LOG_DEBUG("  Loaded WMO liquids for tile [", x, ",", y, "]: ", loadedLiquids);
-        }
+        ft.phase = FinalizationPhase::WMO_DOODADS;
+        return false;
+    }

-        // Upload WMO doodad M2 models
-        if (m2Renderer) {
-            for (auto& doodad : pending->wmoDoodads) {
-                m2Renderer->loadModel(doodad.model, doodad.modelId);
-                uint32_t wmoDoodadInstId = m2Renderer->createInstanceWithMatrix(
-                    doodad.modelId, doodad.modelMatrix, doodad.worldPosition);
-                if (wmoDoodadInstId) m2InstanceIds.push_back(wmoDoodadInstId);
+    case FinalizationPhase::WMO_DOODADS: {
+        // Upload ONE WMO doodad M2 per call
+        if (m2Renderer && ft.wmoDoodadIndex < pending->wmoDoodads.size()) {
+            auto& doodad = pending->wmoDoodads[ft.wmoDoodadIndex];
+            m2Renderer->loadModel(doodad.model, doodad.modelId);
+            uint32_t wmoDoodadInstId = m2Renderer->createInstanceWithMatrix(
+                doodad.modelId, doodad.modelMatrix, doodad.worldPosition);
+            if (wmoDoodadInstId) ft.m2InstanceIds.push_back(wmoDoodadInstId);
+            ft.wmoDoodadIndex++;
+            if (ft.wmoDoodadIndex < pending->wmoDoodads.size()) return false;
+        }
+        ft.phase = FinalizationPhase::WATER;
+        return false;
+    }
+
+    case FinalizationPhase::WATER: {
+        // Terrain water was already loaded in TERRAIN phase.
+        // Generate water ambient emitters here.
+        if (ambientSoundManager) {
+            for (size_t chunkIdx = 0; chunkIdx < pending->terrain.waterData.size(); chunkIdx++) {
+                const auto& chunkWater = pending->terrain.waterData[chunkIdx];
+                if (!chunkWater.hasWater()) continue;
+
+                int chunkX = chunkIdx % 16;
+                int chunkY = chunkIdx / 16;
+                float tileOriginX = (32.0f - x) * 533.33333f;
+                float tileOriginY = (32.0f - y) * 533.33333f;
+                float chunkCenterX = tileOriginX + (chunkX + 0.5f) * 33.333333f;
+                float chunkCenterY = tileOriginY + (chunkY + 0.5f) * 33.333333f;
+
+                if (!chunkWater.layers.empty()) {
+                    const auto& layer = chunkWater.layers[0];
+                    float waterHeight = layer.minHeight;
+                    if (layer.liquidType == 0 && chunkIdx % 32 == 0) {
+                        PendingTile::AmbientEmitter emitter;
+                        emitter.position = glm::vec3(chunkCenterX, chunkCenterY, waterHeight);
+                        emitter.type = 4;
+                        pending->ambientEmitters.push_back(emitter);
+                    } else if (layer.liquidType == 1 && chunkIdx % 64 == 0) {
+                        PendingTile::AmbientEmitter emitter;
+                        emitter.position = glm::vec3(chunkCenterX, chunkCenterY, waterHeight);
+                        emitter.type = 4;
+                        pending->ambientEmitters.push_back(emitter);
+                    }
+                }
            }
        }

-        if (loadedWMOs > 0) {
-            LOG_DEBUG("  Loaded WMOs for tile [", x, ",", y, "]: ", loadedWMOs);
-        }
+        ft.phase = FinalizationPhase::AMBIENT;
+        return false;
    }

-    // Register ambient sound emitters with ambient sound manager
-    if (ambientSoundManager && !pending->ambientEmitters.empty()) {
-        for (const auto& emitter : pending->ambientEmitters) {
-            // Cast uint32_t type to AmbientSoundManager::AmbientType enum
-            auto type = static_cast<audio::AmbientSoundManager::AmbientType>(emitter.type);
-            ambientSoundManager->addEmitter(emitter.position, type);
+    case FinalizationPhase::AMBIENT: {
+        // Register ambient sound emitters
+        if (ambientSoundManager && !pending->ambientEmitters.empty()) {
+            for (const auto& emitter : pending->ambientEmitters) {
+                auto type = static_cast<audio::AmbientSoundManager::AmbientType>(emitter.type);
+                ambientSoundManager->addEmitter(emitter.position, type);
+            }
        }
+
+        // Commit tile to loadedTiles
+        auto tile = std::make_unique<TerrainTile>();
+        tile->coord = coord;
+        tile->terrain = std::move(pending->terrain);
+        tile->mesh = std::move(pending->mesh);
+        tile->loaded = true;
+        tile->m2InstanceIds = std::move(ft.m2InstanceIds);
+        tile->wmoInstanceIds = std::move(ft.wmoInstanceIds);
+        tile->wmoUniqueIds = std::move(ft.tileWmoUniqueIds);
+        tile->doodadUniqueIds = std::move(ft.tileUniqueIds);
+        getTileBounds(coord, tile->minX, tile->minY, tile->maxX, tile->maxY);
+        loadedTiles[coord] = std::move(tile);
+        putCachedTile(pending);
+
+        // Now safe to remove from pendingTiles (tile is in loadedTiles)
+        {
+            std::lock_guard<std::mutex> lock(queueMutex);
+            pendingTiles.erase(coord);
+        }
+
+        LOG_DEBUG("  Finalized tile [", x, ",", y, "]");
+
+        ft.phase = FinalizationPhase::DONE;
+        return true;
    }

-    // Create tile entry
-    auto tile = std::make_unique<TerrainTile>();
-    tile->coord = coord;
-    tile->terrain = std::move(pending->terrain);
-    tile->mesh = std::move(pending->mesh);
-    tile->loaded = true;
-    tile->m2InstanceIds = std::move(m2InstanceIds);
-    tile->wmoInstanceIds = std::move(wmoInstanceIds);
-    tile->wmoUniqueIds = std::move(tileWmoUniqueIds);
-    tile->doodadUniqueIds = std::move(tileUniqueIds);
-
-    // Calculate world bounds
-    getTileBounds(coord, tile->minX, tile->minY, tile->maxX, tile->maxY);
-
-    loadedTiles[coord] = std::move(tile);
-    putCachedTile(pending);
-
-    LOG_DEBUG("  Finalized tile [", x, ",", y, "]");
+    case FinalizationPhase::DONE:
+        return true;
+    }
+    return true;
 }

 void TerrainManager::workerLoop() {
+    // Keep worker threads off core 0 (reserved for main thread)
+#ifdef __linux__
+    {
+        int numCores = static_cast<int>(std::thread::hardware_concurrency());
+        if (numCores >= 2) {
+            cpu_set_t cpuset;
+            CPU_ZERO(&cpuset);
+            for (int i = 1; i < numCores; i++) {
+                CPU_SET(i, &cpuset);
+            }
+            pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+        }
+    }
+#endif
    LOG_INFO("Terrain worker thread started");

    while (workerRunning.load()) {
@ -927,80 +981,60 @@ void TerrainManager::processReadyTiles() {
    // Taxi mode gets a slightly larger budget to avoid visible late-pop terrain/models.
    const float timeBudgetMs = taxiStreamingMode_ ? 8.0f : 5.0f;
    auto startTime = std::chrono::high_resolution_clock::now();
-    int processed = 0;

-    while (true) {
-        std::shared_ptr<PendingTile> pending;
-
-        {
-            std::lock_guard<std::mutex> lock(queueMutex);
-            if (readyQueue.empty()) {
-                break;
-            }
-            pending = readyQueue.front();
+    // Move newly ready tiles into the finalizing deque.
+    // Keep them in pendingTiles so streamTiles() won't re-enqueue them.
+    {
+        std::lock_guard<std::mutex> lock(queueMutex);
+        while (!readyQueue.empty()) {
+            auto pending = readyQueue.front();
            readyQueue.pop();
-        }
-
-        if (pending) {
-            TileCoord coord = pending->coord;
-
-            finalizeTile(pending);
-
-            auto now = std::chrono::high_resolution_clock::now();
-
-            {
-                std::lock_guard<std::mutex> lock(queueMutex);
-                pendingTiles.erase(coord);
-            }
-            processed++;
-
-            // Check if we've exceeded time budget
-            float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
-            if (elapsedMs >= timeBudgetMs) {
-                if (processed > 1) {
-                    LOG_DEBUG("Processed ", processed, " tiles in ", elapsedMs, "ms (budget: ", timeBudgetMs, "ms)");
-                }
-                break;
+            if (pending) {
+                FinalizingTile ft;
+                ft.pending = std::move(pending);
+                finalizingTiles_.push_back(std::move(ft));
            }
        }
    }
-}

-void TerrainManager::processM2UploadQueue() {
-    // Upload up to MAX_M2_UPLOADS_PER_FRAME models per frame
-    int uploaded = 0;
-    while (!m2UploadQueue_.empty() && uploaded < MAX_M2_UPLOADS_PER_FRAME) {
-        auto& upload = m2UploadQueue_.front();
-        if (m2Renderer) {
-            m2Renderer->loadModel(upload.model, upload.modelId);
+    // Drive incremental finalization within time budget
+    while (!finalizingTiles_.empty()) {
+        auto& ft = finalizingTiles_.front();
+        bool done = advanceFinalization(ft);
+
+        if (done) {
+            finalizingTiles_.pop_front();
        }
-        m2UploadQueue_.pop();
-        uploaded++;
-    }

-    if (uploaded > 0) {
-        LOG_DEBUG("Uploaded ", uploaded, " M2 models (", m2UploadQueue_.size(), " remaining in queue)");
+        auto now = std::chrono::high_resolution_clock::now();
+        float elapsedMs = std::chrono::duration<float, std::milli>(now - startTime).count();
+        if (elapsedMs >= timeBudgetMs) {
+            break;
+        }
    }
 }

 void TerrainManager::processAllReadyTiles() {
-    while (true) {
-        std::shared_ptr<PendingTile> pending;
-        {
-            std::lock_guard<std::mutex> lock(queueMutex);
-            if (readyQueue.empty()) break;
-            pending = readyQueue.front();
+    // Move all ready tiles into finalizing deque
+    // Keep in pendingTiles until committed (same as processReadyTiles)
+    {
+        std::lock_guard<std::mutex> lock(queueMutex);
+        while (!readyQueue.empty()) {
+            auto pending = readyQueue.front();
            readyQueue.pop();
-        }
-        if (pending) {
-            TileCoord coord = pending->coord;
-            finalizeTile(pending);
-            {
-                std::lock_guard<std::mutex> lock(queueMutex);
-                pendingTiles.erase(coord);
+            if (pending) {
+                FinalizingTile ft;
+                ft.pending = std::move(pending);
+                finalizingTiles_.push_back(std::move(ft));
            }
        }
    }
+    // Finalize all tiles completely (no time budget — used for loading screens)
+    while (!finalizingTiles_.empty()) {
+        auto& ft = finalizingTiles_.front();
+        while (!advanceFinalization(ft)) {}
+        finalizingTiles_.pop_front();
+    }
 }

 std::shared_ptr<PendingTile> TerrainManager::getCachedTile(const TileCoord& coord) {
@ -1099,6 +1133,31 @@ void TerrainManager::unloadTile(int x, int y) {
        pendingTiles.erase(coord);
    }

+    // If this tile is still in finalizingTiles_, undo what its finished phases created, drop the entry and return early.
+    // NOTE(review): no terrain-mesh cleanup is visible in this branch — confirm TERRAIN-phase uploads are released elsewhere.
+    for (auto fit = finalizingTiles_.begin(); fit != finalizingTiles_.end(); ++fit) {
+        if (fit->pending && fit->pending->coord == coord) {
+            // Any phase other than TERRAIN means the TERRAIN phase (which loads water) already ran — remove that water
+            if (fit->phase != FinalizationPhase::TERRAIN && waterRenderer) {
+                waterRenderer->removeTile(x, y);
+            }
+            // Remove the M2/WMO instances recorded so far; per-WMO water is removed before the WMO instances themselves
+            if (m2Renderer && !fit->m2InstanceIds.empty()) {
+                m2Renderer->removeInstances(fit->m2InstanceIds);
+            }
+            if (wmoRenderer && !fit->wmoInstanceIds.empty()) {
+                for (uint32_t id : fit->wmoInstanceIds) {
+                    if (waterRenderer) waterRenderer->removeWMO(id);
+                }
+                wmoRenderer->removeInstances(fit->wmoInstanceIds);
+            }
+            for (uint32_t uid : fit->tileUniqueIds) placedDoodadIds.erase(uid);  // presumably lets a later load place these again — verify
+            for (uint32_t uid : fit->tileWmoUniqueIds) placedWmoIds.erase(uid);
+            finalizingTiles_.erase(fit);  // iterator is invalid from here on; we return immediately
+            return;  // skips the loadedTiles lookup that follows
+        }
+    }
+
    auto it = loadedTiles.find(coord);
    if (it == loadedTiles.end()) {
        return;
@ -1167,6 +1226,7 @@ void TerrainManager::unloadAll() {
        while (!readyQueue.empty()) readyQueue.pop();
    }
    pendingTiles.clear();
+    finalizingTiles_.clear();
    placedDoodadIds.clear();

    LOG_INFO("Unloading all terrain tiles");
--- a/src/rendering/water_renderer.cpp
+++ b/src/rendering/water_renderer.cpp
@ -14,6 +14,7 @@
 #include <cstring>
 #include <limits>
 #include <array>
+#include <unordered_map>

 namespace wowee {
 namespace rendering {
@ -555,7 +556,27 @@ void WaterRenderer::loadFromTerrain(const pipeline::ADTTerrain& terrain, bool ap
        clear();
    }

-    int totalLayers = 0;
+    // ── Pass 1: collect layers into merge groups keyed by {liquidType, roundedHeight} ──
+    struct ChunkLayerInfo {  // one water layer together with the chunk it was read from
+        int chunkX, chunkY;  // chunk position within the tile: chunkIdx % 16 and chunkIdx / 16
+        const pipeline::ADTTerrain::WaterLayer* layer;  // non-owning; points at an element of terrain.waterData[...].layers
+    };
+
+    struct MergeKey {  // identifies a merge group: same liquid type and same height bucket
+        uint16_t liquidType;
+        int32_t roundedHeight;  // round(layer.minHeight * 2), i.e. minHeight in half-unit steps
+        bool operator==(const MergeKey& o) const {  // both fields must match; used as the unordered_map key equality
+            return liquidType == o.liquidType && roundedHeight == o.roundedHeight;
+        }
+    };
+
+    struct MergeKeyHash {  // hash functor for MergeKey
+        size_t operator()(const MergeKey& k) const {  // packs liquidType (bits 32+) and roundedHeight's 32-bit pattern (low bits) into one uint64_t
+            return std::hash<uint64_t>()((uint64_t(k.liquidType) << 32) | uint32_t(k.roundedHeight));
+        }
+    };
+
+    std::unordered_map<MergeKey, std::vector<ChunkLayerInfo>, MergeKeyHash> mergeGroups;

    for (int chunkIdx = 0; chunkIdx < 256; chunkIdx++) {
        const auto& chunkWater = terrain.waterData[chunkIdx];
@ -563,34 +584,146 @@ void WaterRenderer::loadFromTerrain(const pipeline::ADTTerrain& terrain, bool ap

        int chunkX = chunkIdx % 16;
        int chunkY = chunkIdx / 16;
-        const auto& terrainChunk = terrain.getChunk(chunkX, chunkY);

        for (const auto& layer : chunkWater.layers) {
-            WaterSurface surface;
+            MergeKey key;
+            key.liquidType = layer.liquidType;
+            key.roundedHeight = static_cast<int32_t>(std::round(layer.minHeight * 2.0f));
+            mergeGroups[key].push_back({chunkX, chunkY, &layer});
+        }
+    }

-            surface.position = glm::vec3(
-                terrainChunk.position[0],
-                terrainChunk.position[1],
-                layer.minHeight
-            );
-            surface.origin = glm::vec3(
-                surface.position.x - (static_cast<float>(layer.y) * TILE_SIZE),
-                surface.position.y - (static_cast<float>(layer.x) * TILE_SIZE),
-                layer.minHeight
-            );
-            surface.stepX = glm::vec3(0.0f, -TILE_SIZE, 0.0f);
-            surface.stepY = glm::vec3(-TILE_SIZE, 0.0f, 0.0f);
+    // Tile origin = NW corner = chunk(0,0) position
+    const auto& chunk00 = terrain.getChunk(0, 0);

-            surface.minHeight = layer.minHeight;
-            surface.maxHeight = layer.maxHeight;
-            surface.liquidType = layer.liquidType;
+    // Stormwind water lowering check
+    bool isStormwindArea = (tileX >= 28 && tileX <= 50 && tileY >= 28 && tileY <= 52);
+    float tileWorldX = 0, tileWorldY = 0;
+    glm::vec2 moonwellPos2D(0.0f);
+    if (isStormwindArea) {
+        tileWorldX = (32.0f - tileX) * 533.33333f;
+        tileWorldY = (32.0f - tileY) * 533.33333f;
+        moonwellPos2D = glm::vec2(-8755.9f, 1108.9f);
+    }

-            surface.xOffset = layer.x;
-            surface.yOffset = layer.y;
-            surface.width = layer.width;
-            surface.height = layer.height;
+    int totalSurfaces = 0;

-            size_t numVertices = (layer.width + 1) * (layer.height + 1);
+    // Merge threshold: groups with more than this many chunks get merged into
+    // one tile-wide surface.  Small groups (shore, lakes) stay per-chunk so
+    // their original mask / height data is preserved exactly.
+    constexpr size_t MERGE_THRESHOLD = 4;
+
+    // ── Pass 2: create surfaces ──
+    for (auto& [key, chunkLayers] : mergeGroups) {
+
+        // ── Small group → per-chunk surfaces (original code path) ──
+        if (chunkLayers.size() <= MERGE_THRESHOLD) {
+            for (const auto& info : chunkLayers) {
+                const auto& layer = *info.layer;
+                const auto& terrainChunk = terrain.getChunk(info.chunkX, info.chunkY);
+
+                WaterSurface surface;
+                surface.position = glm::vec3(
+                    terrainChunk.position[0],
+                    terrainChunk.position[1],
+                    layer.minHeight
+                );
+                surface.origin = glm::vec3(
+                    surface.position.x - (static_cast<float>(layer.y) * TILE_SIZE),
+                    surface.position.y - (static_cast<float>(layer.x) * TILE_SIZE),
+                    layer.minHeight
+                );
+                surface.stepX = glm::vec3(0.0f, -TILE_SIZE, 0.0f);
+                surface.stepY = glm::vec3(-TILE_SIZE, 0.0f, 0.0f);
+
+                surface.minHeight = layer.minHeight;
+                surface.maxHeight = layer.maxHeight;
+                surface.liquidType = layer.liquidType;
+                surface.xOffset = layer.x;
+                surface.yOffset = layer.y;
+                surface.width = layer.width;
+                surface.height = layer.height;
+
+                size_t numVertices = (layer.width + 1) * (layer.height + 1);
+                bool useFlat = true;
+                if (layer.heights.size() == numVertices) {
+                    bool sane = true;
+                    for (float h : layer.heights) {
+                        if (!std::isfinite(h) || std::abs(h) > 50000.0f) { sane = false; break; }
+                        if (h < layer.minHeight - 8.0f || h > layer.maxHeight + 8.0f) { sane = false; break; }
+                    }
+                    if (sane) { useFlat = false; surface.heights = layer.heights; }
+                }
+                if (useFlat) surface.heights.resize(numVertices, layer.minHeight);
+
+                if (isStormwindArea && layer.minHeight > 94.0f) {
+                    float distToMoonwell = glm::distance(glm::vec2(tileWorldX, tileWorldY), moonwellPos2D);
+                    if (distToMoonwell > 300.0f) {
+                        for (float& h : surface.heights) h -= 1.0f;
+                        surface.minHeight -= 1.0f;
+                        surface.maxHeight -= 1.0f;
+                    }
+                }
+
+                surface.mask = layer.mask;
+                surface.tileX = tileX;
+                surface.tileY = tileY;
+
+                createWaterMesh(surface);
+                if (surface.indexCount > 0 && vkCtx) {
+                    updateMaterialUBO(surface);
+                }
+                surfaces.push_back(std::move(surface));
+                totalSurfaces++;
+            }
+            continue;
+        }
+
+        // ── Large group → merged tile-wide surface ──
+        WaterSurface surface;
+
+        float groupHeight = key.roundedHeight / 2.0f;
+
+        surface.width = 128;
+        surface.height = 128;
+        surface.xOffset = 0;
+        surface.yOffset = 0;
+        surface.liquidType = key.liquidType;
+        surface.tileX = tileX;
+        surface.tileY = tileY;
+
+        // Origin = chunk(0,0) position (NW corner of tile)
+        surface.origin = glm::vec3(chunk00.position[0], chunk00.position[1], groupHeight);
+        surface.position = surface.origin;
+        surface.stepX = glm::vec3(0.0f, -TILE_SIZE, 0.0f);
+        surface.stepY = glm::vec3(-TILE_SIZE, 0.0f, 0.0f);
+
+        surface.minHeight = groupHeight;
+        surface.maxHeight = groupHeight;
+
+        // Initialize height grid (129×129) with group height
+        constexpr int MERGED_W = 128;
+        const int gridW = MERGED_W + 1;  // 129
+        const int gridH = MERGED_W + 1;
+        surface.heights.resize(gridW * gridH, groupHeight);
+
+        // Initialize mask (128×128 sub-tiles, all masked OUT)
+        // Mask uses LSB bit order: tileIndex = row * 128 + col
+        const int maskBytes = (MERGED_W * MERGED_W + 7) / 8;
+        surface.mask.resize(maskBytes, 0);
+
+        // ── Fill the merged grid from each contributing chunk layer ──
+        for (const auto& info : chunkLayers) {
+            const auto& layer = *info.layer;
+
+            // Each chunk covers 8 cells of the merged grid; chunkY selects the x base and chunkX the y base:
+            // gx = chunkY*8 + layer.x + localX, gy = chunkX*8 + layer.y + localY
+            int baseGx = info.chunkY * 8;
+            int baseGy = info.chunkX * 8;
+
+            // Copy heights: layer.heights is used only if its size matches and every value passes the checks below, else layer.minHeight
+            int layerGridW = layer.width + 1;
+            size_t numVertices = static_cast<size_t>(layerGridW) * (layer.height + 1);
             bool useFlat = true;
             if (layer.heights.size() == numVertices) {
                 bool sane = true;
@ -598,39 +731,79 @@ void WaterRenderer::loadFromTerrain(const pipeline::ADTTerrain& terrain, bool ap
                     if (!std::isfinite(h) || std::abs(h) > 50000.0f) { sane = false; break; }
                     if (h < layer.minHeight - 8.0f || h > layer.maxHeight + 8.0f) { sane = false; break; }
                 }
-                if (sane) { useFlat = false; surface.heights = layer.heights; }
+                if (sane) useFlat = false;  // heights are copied cell by cell in the loop below
             }
-            if (useFlat) surface.heights.resize(numVertices, layer.minHeight);

-            // Stormwind water lowering
-            bool isStormwindArea = (tileX >= 28 && tileX <= 50 && tileY >= 28 && tileY <= 52);
-            if (isStormwindArea && layer.minHeight > 94.0f) {
-                float tileWorldX = (32.0f - tileX) * 533.33333f;
-                float tileWorldY = (32.0f - tileY) * 533.33333f;
-                glm::vec3 moonwellPos(-8755.9f, 1108.9f, 96.1f);
-                float distToMoonwell = glm::distance(glm::vec2(tileWorldX, tileWorldY),
-                                                      glm::vec2(moonwellPos.x, moonwellPos.y));
-                if (distToMoonwell > 300.0f) {
-                    for (float& h : surface.heights) h -= 1.0f;
-                    surface.minHeight -= 1.0f;
-                    surface.maxHeight -= 1.0f;
+            for (int ly = 0; ly <= layer.height; ly++) {  // inclusive bounds: iterating vertices, one more than cells per axis
+                for (int lx = 0; lx <= layer.width; lx++) {
+                    int mgx = baseGx + layer.x + lx;
+                    int mgy = baseGy + layer.y + ly;
+                    if (mgx >= gridW || mgy >= gridH) continue;  // clip to the merged vertex grid
+
+                    float h;
+                    if (!useFlat) {
+                        int layerIdx = ly * layerGridW + lx;
+                        h = layer.heights[layerIdx];
+                    } else {
+                        h = layer.minHeight;
+                    }
+
+                    surface.heights[mgy * gridW + mgx] = h;  // a vertex written by more than one layer keeps the last value
+                    if (h < surface.minHeight) surface.minHeight = h;  // widen the surface's height range
+                    if (h > surface.maxHeight) surface.maxHeight = h;
                 }
             }

-            surface.mask = layer.mask;
-            surface.tileX = tileX;
-            surface.tileY = tileY;
+            // Copy mask — set the merged bit for every renderable layer cell; an empty layer mask means every cell renders
+            for (int ly = 0; ly < layer.height; ly++) {
+                for (int lx = 0; lx < layer.width; lx++) {
+                    bool render = true;
+                    if (!layer.mask.empty()) {  // NOTE(review): createWaterMesh uses 8-wide indexing only when mask.size() >= 8; here any non-empty mask does — confirm
+                        int cx = layer.x + lx;
+                        int cy = layer.y + ly;
+                        int origTileIdx = cy * 8 + cx;  // index into the chunk's 8-wide cell grid
+                        int origByte = origTileIdx / 8;
+                        int origBit = origTileIdx % 8;
+                        if (origByte < static_cast<int>(layer.mask.size())) {  // out-of-range bytes leave render == true
+                            uint8_t mb = layer.mask[origByte];
+                            render = (mb & (1 << origBit)) || (mb & (1 << (7 - origBit)));  // accept the bit in either LSB-first or MSB-first order
+                        }
+                    }

-            createWaterMesh(surface);
-            if (surface.indexCount > 0 && vkCtx) {
-                updateMaterialUBO(surface);
+                    if (render) {
+                        int mx = baseGx + layer.x + lx;
+                        int my = baseGy + layer.y + ly;
+                        if (mx >= MERGED_W || my >= MERGED_W) continue;  // clip to the merged cell grid
+
+                        int mergedTileIdx = my * MERGED_W + mx;
+                        int byteIdx = mergedTileIdx / 8;
+                        int bitIdx = mergedTileIdx % 8;
+                        surface.mask[byteIdx] |= static_cast<uint8_t>(1 << bitIdx);  // merged mask is written LSB-first only
+                    }
+                }
             }
-            surfaces.push_back(std::move(surface));
-            totalLayers++;
         }
+
+        // Stormwind water lowering
+        if (isStormwindArea && surface.minHeight > 94.0f) {
+            float distToMoonwell = glm::distance(glm::vec2(tileWorldX, tileWorldY), moonwellPos2D);
+            if (distToMoonwell > 300.0f) {
+                for (float& h : surface.heights) h -= 1.0f;
+                surface.minHeight -= 1.0f;
+                surface.maxHeight -= 1.0f;
+            }
+        }
+
+        createWaterMesh(surface);
+        if (surface.indexCount > 0 && vkCtx) {
+            updateMaterialUBO(surface);
+        }
+        surfaces.push_back(std::move(surface));
+        totalSurfaces++;
    }

-    LOG_DEBUG("Loaded ", totalLayers, " water layers from MH2O data");
+    LOG_DEBUG("Water: Loaded ", totalSurfaces, " surfaces from tile [", tileX, ",", tileY,
+              "] (", mergeGroups.size(), " groups), total surfaces: ", surfaces.size());
 }

 void WaterRenderer::removeTile(int tileX, int tileY) {
@ -646,7 +819,7 @@ void WaterRenderer::removeTile(int tileX, int tileY) {
        }
    }
    if (removed > 0) {
-        LOG_DEBUG("Removed ", removed, " water surfaces for tile [", tileX, ",", tileY, "]");
+        LOG_DEBUG("Water: Removed ", removed, " surfaces for tile [", tileX, ",", tileY, "], remaining: ", surfaces.size());
    }
 }

@ -948,7 +1121,8 @@ void WaterRenderer::createWaterMesh(WaterSurface& surface) {
            bool renderTile = true;
            if (!surface.mask.empty()) {
                int tileIndex;
-                if (surface.wmoId == 0 && surface.mask.size() >= 8) {
+                bool isMergedTerrain = (surface.wmoId == 0 && surface.width > 8);
+                if (surface.wmoId == 0 && surface.width <= 8 && surface.mask.size() >= 8) {
                    int cx = static_cast<int>(surface.xOffset) + x;
                    int cy = static_cast<int>(surface.yOffset) + y;
                    tileIndex = cy * 8 + cx;
@ -959,9 +1133,14 @@ void WaterRenderer::createWaterMesh(WaterSurface& surface) {
                int bitIndex = tileIndex % 8;
                if (byteIndex < static_cast<int>(surface.mask.size())) {
                    uint8_t maskByte = surface.mask[byteIndex];
-                    bool lsbOrder = (maskByte & (1 << bitIndex)) != 0;
-                    bool msbOrder = (maskByte & (1 << (7 - bitIndex))) != 0;
-                    renderTile = lsbOrder || msbOrder;
+                    if (isMergedTerrain) {
+                        // Merged surfaces use LSB-only bit order
+                        renderTile = (maskByte & (1 << bitIndex)) != 0;
+                    } else {
+                        bool lsbOrder = (maskByte & (1 << bitIndex)) != 0;
+                        bool msbOrder = (maskByte & (1 << (7 - bitIndex))) != 0;
+                        renderTile = lsbOrder || msbOrder;
+                    }

                    if (!renderTile) {
                        for (int dy = -1; dy <= 1; dy++) {
@ -970,7 +1149,7 @@ void WaterRenderer::createWaterMesh(WaterSurface& surface) {
                                int nx = x + dx, ny = y + dy;
                                if (nx < 0 || ny < 0 || nx >= gridWidth-1 || ny >= gridHeight-1) continue;
                                int neighborIdx;
-                                if (surface.wmoId == 0 && surface.mask.size() >= 8) {
+                                if (surface.wmoId == 0 && surface.width <= 8 && surface.mask.size() >= 8) {
                                    neighborIdx = (static_cast<int>(surface.yOffset) + ny) * 8 +
                                                  (static_cast<int>(surface.xOffset) + nx);
                                } else {
@ -980,9 +1159,16 @@ void WaterRenderer::createWaterMesh(WaterSurface& surface) {
                                int nBitIdx = neighborIdx % 8;
                                if (nByteIdx < static_cast<int>(surface.mask.size())) {
                                    uint8_t nMask = surface.mask[nByteIdx];
-                                    if ((nMask & (1 << nBitIdx)) || (nMask & (1 << (7 - nBitIdx)))) {
-                                        renderTile = true;
-                                        goto found_neighbor;
+                                    if (isMergedTerrain) {
+                                        if (nMask & (1 << nBitIdx)) {
+                                            renderTile = true;
+                                            goto found_neighbor;
+                                        }
+                                    } else {
+                                        if ((nMask & (1 << nBitIdx)) || (nMask & (1 << (7 - nBitIdx)))) {
+                                            renderTile = true;
+                                            goto found_neighbor;
+                                        }
                                    }
                                }
                            }
@ -1100,7 +1286,7 @@ std::optional<float> WaterRenderer::getWaterHeightAt(float glX, float glY) const

        if (!surface.mask.empty()) {
            int tileIndex;
-            if (surface.wmoId == 0 && surface.mask.size() >= 8) {
+            if (surface.wmoId == 0 && surface.width <= 8 && surface.mask.size() >= 8) {
                tileIndex = (static_cast<int>(surface.yOffset) + iy) * 8 +
                            (static_cast<int>(surface.xOffset) + ix);
            } else {
@ -1110,7 +1296,12 @@ std::optional<float> WaterRenderer::getWaterHeightAt(float glX, float glY) const
            int bitIndex = tileIndex % 8;
            if (byteIndex < static_cast<int>(surface.mask.size())) {
                uint8_t maskByte = surface.mask[byteIndex];
-                bool renderTile = (maskByte & (1 << bitIndex)) || (maskByte & (1 << (7 - bitIndex)));
+                bool renderTile;
+                if (surface.wmoId == 0 && surface.width > 8) {
+                    renderTile = (maskByte & (1 << bitIndex)) != 0;
+                } else {
+                    renderTile = (maskByte & (1 << bitIndex)) || (maskByte & (1 << (7 - bitIndex)));
+                }
                if (!renderTile) continue;
            }
        }
@ -1162,7 +1353,7 @@ std::optional<float> WaterRenderer::getNearestWaterHeightAt(float glX, float glY

        if (!surface.mask.empty()) {
            int tileIndex;
-            if (surface.wmoId == 0 && surface.mask.size() >= 8) {
+            if (surface.wmoId == 0 && surface.width <= 8 && surface.mask.size() >= 8) {
                tileIndex = (static_cast<int>(surface.yOffset) + iy) * 8 +
                            (static_cast<int>(surface.xOffset) + ix);
            } else {
@ -1172,7 +1363,12 @@ std::optional<float> WaterRenderer::getNearestWaterHeightAt(float glX, float glY
            int bitIndex = tileIndex % 8;
            if (byteIndex < static_cast<int>(surface.mask.size())) {
                uint8_t maskByte = surface.mask[byteIndex];
-                bool renderTile = (maskByte & (1 << bitIndex)) || (maskByte & (1 << (7 - bitIndex)));
+                bool renderTile;
+                if (surface.wmoId == 0 && surface.width > 8) {
+                    renderTile = (maskByte & (1 << bitIndex)) != 0;
+                } else {
+                    renderTile = (maskByte & (1 << bitIndex)) || (maskByte & (1 << (7 - bitIndex)));
+                }
                if (!renderTile) continue;
            }
        }
@ -1228,7 +1424,7 @@ std::optional<uint16_t> WaterRenderer::getWaterTypeAt(float glX, float glY) cons

        if (!surface.mask.empty()) {
            int tileIndex;
-            if (surface.wmoId == 0 && surface.mask.size() >= 8) {
+            if (surface.wmoId == 0 && surface.width <= 8 && surface.mask.size() >= 8) {
                tileIndex = (static_cast<int>(surface.yOffset) + iy) * 8 +
                            (static_cast<int>(surface.xOffset) + ix);
            } else {
@ -1238,7 +1434,12 @@ std::optional<uint16_t> WaterRenderer::getWaterTypeAt(float glX, float glY) cons
            int bitIndex = tileIndex % 8;
            if (byteIndex < static_cast<int>(surface.mask.size())) {
                uint8_t maskByte = surface.mask[byteIndex];
-                bool renderTile = (maskByte & (1 << bitIndex)) || (maskByte & (1 << (7 - bitIndex)));
+                bool renderTile;
+                if (surface.wmoId == 0 && surface.width > 8) {
+                    renderTile = (maskByte & (1 << bitIndex)) != 0;
+                } else {
+                    renderTile = (maskByte & (1 << bitIndex)) || (maskByte & (1 << (7 - bitIndex)));
+                }
                if (!renderTile) continue;
            }
        }