From 249c4fa8428da434ff220b173759fd460a443f86 Mon Sep 17 00:00:00 2001
From: Kelsi <kelsihates2fa@gmail.com>
Date: Sat, 7 Feb 2026 14:28:14 -0800
Subject: [PATCH] Parallelize M2 bone matrix computation across worker threads

Split the M2 animation update loop into three phases: sequential animation state update, parallel bone matrix computation via std::async (when 32+ animated instances), and sequential particle update. Each thread processes a disjoint slice of instances so no synchronization is needed.
---
 include/rendering/m2_renderer.hpp |  4 ++
 src/rendering/m2_renderer.cpp     | 61 +++++++++++++++++++++++++++++--
 2 files changed, 61 insertions(+), 4 deletions(-)
diff --git a/include/rendering/m2_renderer.hpp b/include/rendering/m2_renderer.hpp
index 69265cbb..69653f04 100644
--- a/include/rendering/m2_renderer.hpp
+++ b/include/rendering/m2_renderer.hpp
@@ -10,6 +10,7 @@
 #include <string>
 #include <optional>
 #include <random>
+#include <future>
 
 namespace wowee {
 
@@ -354,6 +355,9 @@ private:
     static constexpr size_t MAX_M2_PARTICLES = 4000;
     std::mt19937 particleRng_{123};
 
+    // Thread count for parallel bone animation
+    uint32_t numAnimThreads_ = 1;
+
     float interpFloat(const pipeline::M2AnimationTrack& track, float animTime, int seqIdx,
                       const std::vector<pipeline::M2Sequence>& seqs,
                       const std::vector<uint32_t>& globalSeqDurations);
diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp
index f3cdc29d..d95dd0a0 100644
--- a/src/rendering/m2_renderer.cpp
+++ b/src/rendering/m2_renderer.cpp
@@ -15,6 +15,8 @@
 #include <algorithm>
 #include <cmath>
 #include <limits>
+#include <future>
+#include <thread>
 
 namespace wowee {
 namespace rendering {
@@ -203,7 +205,8 @@ M2Renderer::~M2Renderer() {
 bool M2Renderer::initialize(pipeline::AssetManager* assets) {
     assetManager = assets;
 
-    LOG_INFO("Initializing M2 renderer...");
+    numAnimThreads_ = std::min(4u, std::max(1u, std::thread::hardware_concurrency() - 1));
+    LOG_INFO("Initializing M2 renderer (", numAnimThreads_, " anim threads)...");
 
     // Create M2 shader with skeletal animation support
     const char* vertexSrc = R"(
@@ -1212,7 +1215,13 @@ void M2Renderer::update(float deltaTime) {
     }
 
     // --- Normal M2 animation update ---
-    for (auto& instance : instances) {
+    // Phase 1: Update animation state (cheap, sequential)
+    // Collect indices of instances that need bone matrix computation.
+    std::vector<size_t> boneWorkIndices;
+    boneWorkIndices.reserve(instances.size());
+
+    for (size_t idx = 0; idx < instances.size(); ++idx) {
+        auto& instance = instances[idx];
         auto it = models.find(instance.modelId);
         if (it == models.end()) continue;
         const M2ModelGPU& model = it->second;
@@ -1267,9 +1276,53 @@ void M2Renderer::update(float deltaTime) {
             }
         }
 
-        computeBoneMatrices(model, instance);
+        boneWorkIndices.push_back(idx);
+    }
 
-        // M2 particle emitter update
+    // Phase 2: Compute bone matrices (expensive, parallel if enough work)
+    const size_t animCount = boneWorkIndices.size();
+    if (animCount > 0) {
+        if (animCount < 32 || numAnimThreads_ <= 1) {
+            // Sequential — not enough work to justify thread overhead
+            for (size_t i : boneWorkIndices) {
+                auto& inst = instances[i];
+                const auto& mdl = models.find(inst.modelId)->second;
+                computeBoneMatrices(mdl, inst);
+            }
+        } else {
+            // Parallel — dispatch across worker threads
+            const size_t numThreads = std::min(static_cast<size_t>(numAnimThreads_), animCount);
+            const size_t chunkSize = animCount / numThreads;
+            const size_t remainder = animCount % numThreads;
+
+            std::vector<std::future<void>> futures;
+            futures.reserve(numThreads);
+
+            size_t start = 0;
+            for (size_t t = 0; t < numThreads; ++t) {
+                size_t end = start + chunkSize + (t < remainder ? 1 : 0);
+                futures.push_back(std::async(std::launch::async,
+                    [this, &boneWorkIndices, start, end]() {
+                        for (size_t j = start; j < end; ++j) {
+                            size_t idx = boneWorkIndices[j];
+                            auto& inst = instances[idx];
+                            const auto& mdl = models.find(inst.modelId)->second;
+                            computeBoneMatrices(mdl, inst);
+                        }
+                    }));
+                start = end;
+            }
+
+            for (auto& f : futures) {
+                f.get();
+            }
+        }
+    }
+
+    // Phase 3: Particle update (sequential — uses RNG, not thread-safe)
+    for (size_t idx : boneWorkIndices) {
+        auto& instance = instances[idx];
+        const auto& model = models.find(instance.modelId)->second;
         if (!model.particleEmitters.empty()) {
             emitParticles(instance, model, deltaTime);
             updateParticles(instance, deltaTime);