From 249c4fa8428da434ff220b173759fd460a443f86 Mon Sep 17 00:00:00 2001 From: Kelsi Date: Sat, 7 Feb 2026 14:28:14 -0800 Subject: [PATCH] Parallelize M2 bone matrix computation across worker threads Split the M2 animation update loop into three phases: sequential animation state update, parallel bone matrix computation via std::async (when 32+ animated instances), and sequential particle update. Each thread processes a disjoint slice of instances so no synchronization is needed. --- include/rendering/m2_renderer.hpp | 4 ++ src/rendering/m2_renderer.cpp | 61 +++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/include/rendering/m2_renderer.hpp b/include/rendering/m2_renderer.hpp index 69265cbb..69653f04 100644 --- a/include/rendering/m2_renderer.hpp +++ b/include/rendering/m2_renderer.hpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace wowee { @@ -354,6 +355,9 @@ private: static constexpr size_t MAX_M2_PARTICLES = 4000; std::mt19937 particleRng_{123}; + // Thread count for parallel bone animation + uint32_t numAnimThreads_ = 1; + float interpFloat(const pipeline::M2AnimationTrack& track, float animTime, int seqIdx, const std::vector& seqs, const std::vector& globalSeqDurations); diff --git a/src/rendering/m2_renderer.cpp b/src/rendering/m2_renderer.cpp index f3cdc29d..d95dd0a0 100644 --- a/src/rendering/m2_renderer.cpp +++ b/src/rendering/m2_renderer.cpp @@ -15,6 +15,8 @@ #include #include #include +#include +#include namespace wowee { namespace rendering { @@ -203,7 +205,8 @@ M2Renderer::~M2Renderer() { bool M2Renderer::initialize(pipeline::AssetManager* assets) { assetManager = assets; - LOG_INFO("Initializing M2 renderer..."); + numAnimThreads_ = std::min(4u, std::max(1u, std::thread::hardware_concurrency() - 1)); + LOG_INFO("Initializing M2 renderer (", numAnimThreads_, " anim threads)..."); // Create M2 shader with skeletal animation support const char* vertexSrc = R"( @@ -1212,7 +1215,13 @@ void M2Renderer::update(float deltaTime) { } // --- Normal M2 animation update --- - for (auto& instance : instances) { + // Phase 1: Update animation state (cheap, sequential) + // Collect indices of instances that need bone matrix computation. + std::vector boneWorkIndices; + boneWorkIndices.reserve(instances.size()); + + for (size_t idx = 0; idx < instances.size(); ++idx) { + auto& instance = instances[idx]; auto it = models.find(instance.modelId); if (it == models.end()) continue; const M2ModelGPU& model = it->second; @@ -1267,9 +1276,53 @@ void M2Renderer::update(float deltaTime) { } } - computeBoneMatrices(model, instance); + boneWorkIndices.push_back(idx); + } - // M2 particle emitter update + // Phase 2: Compute bone matrices (expensive, parallel if enough work) + const size_t animCount = boneWorkIndices.size(); + if (animCount > 0) { + if (animCount < 32 || numAnimThreads_ <= 1) { + // Sequential — not enough work to justify thread overhead + for (size_t i : boneWorkIndices) { + auto& inst = instances[i]; + const auto& mdl = models.find(inst.modelId)->second; + computeBoneMatrices(mdl, inst); + } + } else { + // Parallel — dispatch across worker threads + const size_t numThreads = std::min(static_cast(numAnimThreads_), animCount); + const size_t chunkSize = animCount / numThreads; + const size_t remainder = animCount % numThreads; + + std::vector> futures; + futures.reserve(numThreads); + + size_t start = 0; + for (size_t t = 0; t < numThreads; ++t) { + size_t end = start + chunkSize + (t < remainder ? 1 : 0); + futures.push_back(std::async(std::launch::async, + [this, &boneWorkIndices, start, end]() { + for (size_t j = start; j < end; ++j) { + size_t idx = boneWorkIndices[j]; + auto& inst = instances[idx]; + const auto& mdl = models.find(inst.modelId)->second; + computeBoneMatrices(mdl, inst); + } + })); + start = end; + } + + for (auto& f : futures) { + f.get(); + } + } + } + + // Phase 3: Particle update (sequential — uses RNG, not thread-safe) + for (size_t idx : boneWorkIndices) { + auto& instance = instances[idx]; + const auto& model = models.find(instance.modelId)->second; if (!model.particleEmitters.empty()) { emitParticles(instance, model, deltaTime); updateParticles(instance, deltaTime);