Add overlay extraction for multi-expansion asset deduplication

Extracts each expansion's assets as a CRC-compared overlay against a
base manifest, storing only files that differ. Auto-detects overlay mode
when a base manifest already exists. Adds --as-overlay, --full-base
flags and manifest merge for partial extractions.
This commit is contained in:
Kelsi 2026-02-14 03:09:17 -08:00
parent 03013e751a
commit a67dca5787
37 changed files with 139303 additions and 73847 deletions

View file

@ -16,6 +16,7 @@
#include <set>
#include <sstream>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>
@ -246,55 +247,93 @@ static std::unordered_set<std::string> buildWantedDbcSet(const Extractor::Option
return wanted;
}
// Load all entry keys from a manifest.json into a set of normalized WoW paths.
// This is a minimal parser — just extracts the keys from the "entries" object
// without pulling in a full JSON library.
static std::unordered_set<std::string> loadManifestKeys(const std::string& manifestPath) {
std::unordered_set<std::string> keys;
std::ifstream f(manifestPath);
if (!f.is_open()) {
std::cerr << "Failed to open reference manifest: " << manifestPath << "\n";
return keys;
// Parse a quoted JSON string starting after the opening quote at pos.
// Returns the unescaped string and advances pos past the closing quote.
// If the closing quote is missing, the rest of the line is consumed and pos
// is left at line.size().
// Escapes: \n, \t, \r, \b and \f are decoded to their control characters; any
// other escaped character (\\, \", \/, ...) is copied through literally.
// \uXXXX sequences are not decoded: the 'u' and the hex digits are kept as-is.
static std::string parseJsonString(const std::string& line, size_t& pos) {
    std::string result;
    while (pos < line.size() && line[pos] != '"') {
        const char current = line[pos];
        if (current != '\\' || pos + 1 >= line.size()) {
            // Ordinary character, or a lone backslash at the very end of the line.
            result += current;
            ++pos;
            continue;
        }
        // Two-character escape sequence: translate the character after the backslash.
        const char escaped = line[pos + 1];
        switch (escaped) {
            case 'n': result += '\n'; break;
            case 't': result += '\t'; break;
            case 'r': result += '\r'; break;
            case 'b': result += '\b'; break;
            case 'f': result += '\f'; break;
            default:  result += escaped; break;  // \\, \", \/ and anything unrecognised
        }
        pos += 2;
    }
    if (pos < line.size()) ++pos;  // skip closing quote
    return result;
}
// Load all entries from a manifest.json into a map keyed by normalized WoW path.
// Minimal parser that extracts keys and values without a full JSON library.
// It is line-oriented: after the line containing "entries", each line is expected
// to hold one complete entry (quoted key followed by its "p", "s" and "h" fields),
// and parsing stops at the first line whose first non-blank character is '}'.
// Returns an empty map when the manifest cannot be opened.
// NOTE(review): this span appears to interleave lines of the previous key-only
// parser with the new implementation (duplicated conditions, leftover key loop);
// confirm against the real file before changing any code here.
static std::unordered_map<std::string, ManifestWriter::FileEntry> loadManifestEntries(
const std::string& manifestPath) {
std::unordered_map<std::string, ManifestWriter::FileEntry> entries;
std::ifstream f(manifestPath);
// An unreadable manifest is not reported here; the caller just sees an empty map.
if (!f.is_open()) return entries;
// Find the "entries" section, then extract keys from each line
bool inEntries = false;
std::string line;
while (std::getline(f, line)) {
if (!inEntries) {
if (line.find("\"entries\"") != std::string::npos) {
inEntries = true;
}
if (line.find("\"entries\"") != std::string::npos) inEntries = true;
continue;
}
// End of entries block
size_t closeBrace = line.find_first_not_of(" \t");
if (closeBrace != std::string::npos && line[closeBrace] == '}') {
break;
}
if (closeBrace != std::string::npos && line[closeBrace] == '}') break;
// Extract key: find first quoted string on the line
// Extract key
size_t q1 = line.find('"');
if (q1 == std::string::npos) continue;
size_t q2 = q1 + 1;
// Find closing quote (handle escaped backslashes)
std::string key;
while (q2 < line.size() && line[q2] != '"') {
if (line[q2] == '\\' && q2 + 1 < line.size()) {
key += line[q2 + 1]; // unescape \\, \", etc.
q2 += 2;
} else {
key += line[q2];
q2++;
// parseJsonString leaves pos just past the key's closing quote, so the field
// searches below only look at the remainder of the line.
size_t pos = q1 + 1;
std::string key = parseJsonString(line, pos);
if (key.empty()) continue;
// Extract value object fields: "p", "s", "h"
ManifestWriter::FileEntry entry;
entry.wowPath = key;
// "p": quoted string stored as the entry's filesystem path.
size_t pPos = line.find("\"p\":", pos);
if (pPos != std::string::npos) {
size_t pq = line.find('"', pPos + 4);
if (pq != std::string::npos) {
size_t pp = pq + 1;
entry.filesystemPath = parseJsonString(line, pp);
}
}
if (!key.empty()) {
keys.insert(key); // Already normalized (lowercase, backslashes)
// "s": unsigned decimal size; spaces after the colon are skipped before parsing.
size_t sPos = line.find("\"s\":", pos);
if (sPos != std::string::npos) {
size_t numStart = sPos + 4;
while (numStart < line.size() && (line[numStart] == ' ')) numStart++;
entry.size = std::strtoull(line.c_str() + numStart, nullptr, 10);
}
// "h": CRC32 stored as a quoted hexadecimal string.
size_t hPos = line.find("\"h\":", pos);
if (hPos != std::string::npos) {
size_t hq = line.find('"', hPos + 4);
if (hq != std::string::npos) {
size_t hp = hq + 1;
std::string hexStr = parseJsonString(line, hp);
entry.crc32 = static_cast<uint32_t>(std::strtoul(hexStr.c_str(), nullptr, 16));
}
}
// A later line with the same key replaces the earlier entry.
entries[key] = std::move(entry);
}
return entries;
}
// Collect just the entry keys (normalized WoW paths) of a manifest.json.
// Thin wrapper over loadManifestEntries: the parsed values are discarded, and
// the result is empty whenever that loader returns no entries.
static std::unordered_set<std::string> loadManifestKeys(const std::string& manifestPath) {
    auto manifest = loadManifestEntries(manifestPath);

    std::unordered_set<std::string> paths;
    paths.reserve(manifest.size());
    for (auto it = manifest.begin(); it != manifest.end(); ++it) {
        paths.insert(it->first);
    }
    return paths;
}
@ -531,6 +570,29 @@ bool Extractor::enumerateFiles(const Options& opts,
bool Extractor::run(const Options& opts) {
auto startTime = std::chrono::steady_clock::now();
const bool overlayMode = !opts.asOverlay.empty();
// Overlay mode writes files to expansions/<id>/overlay/ under the output dir
const std::string effectiveOutputDir = overlayMode
? opts.outputDir + "/expansions/" + opts.asOverlay + "/overlay"
: opts.outputDir;
// Load base manifest CRCs for overlay deduplication
std::unordered_map<std::string, uint32_t> baseCRCs;
if (overlayMode) {
std::string baseManifestPath = opts.outputDir + "/manifest.json";
auto baseEntries = loadManifestEntries(baseManifestPath);
if (baseEntries.empty()) {
std::cerr << "Warning: base manifest empty or missing at " << baseManifestPath << "\n"
<< " Extract the base expansion first, then use --as-overlay for others.\n";
} else {
for (auto& [k, v] : baseEntries) {
baseCRCs[k] = v.crc32;
}
std::cout << "Loaded " << baseCRCs.size() << " base manifest entries for CRC comparison\n";
}
}
// Enumerate all unique files across all archives
std::vector<std::string> files;
if (!enumerateFiles(opts, files)) {
@ -560,7 +622,7 @@ bool Extractor::run(const Options& opts) {
// Create output directory
std::error_code ec;
fs::create_directories(opts.outputDir, ec);
fs::create_directories(effectiveOutputDir, ec);
if (ec) {
std::cerr << "Failed to create output directory: " << ec.message() << "\n";
return false;
@ -611,7 +673,7 @@ bool Extractor::run(const Options& opts) {
// Map to new filesystem path
std::string mappedPath = PathMapper::mapPath(wowPath);
std::string fullOutputPath = opts.outputDir + "/" + mappedPath;
std::string fullOutputPath = effectiveOutputDir + "/" + mappedPath;
// Search archives in reverse priority order (highest priority first)
HANDLE hFile = nullptr;
@ -643,11 +705,22 @@ bool Extractor::run(const Options& opts) {
SFileCloseFile(hFile);
data.resize(bytesRead);
// Create output directory
// Compute CRC32
uint32_t crc = ManifestWriter::computeCRC32(data.data(), data.size());
// In overlay mode, skip files identical to base
if (!baseCRCs.empty()) {
auto it = baseCRCs.find(normalized);
if (it != baseCRCs.end() && it->second == crc) {
stats.filesSkipped++;
continue;
}
}
// Create output directory and write file
fs::path outPath(fullOutputPath);
fs::create_directories(outPath.parent_path(), ec);
// Write file
std::ofstream out(fullOutputPath, std::ios::binary);
if (!out.is_open()) {
stats.filesFailed++;
@ -656,9 +729,6 @@ bool Extractor::run(const Options& opts) {
out.write(reinterpret_cast<const char*>(data.data()), data.size());
out.close();
// Compute CRC32
uint32_t crc = ManifestWriter::computeCRC32(data.data(), data.size());
// Add manifest entry
ManifestWriter::FileEntry entry;
entry.wowPath = normalized;
@ -702,14 +772,32 @@ bool Extractor::run(const Options& opts) {
<< stats.filesSkipped.load() << " skipped, "
<< stats.filesFailed.load() << " failed\n";
// Merge with existing manifest so partial extractions don't nuke prior entries
// (skip merge for overlay manifests — they're standalone)
std::string manifestPath = effectiveOutputDir + "/manifest.json";
if (!overlayMode && fs::exists(manifestPath)) {
auto existing = loadManifestEntries(manifestPath);
if (!existing.empty()) {
// New entries override existing ones with same key
for (auto& entry : manifestEntries) {
existing[entry.wowPath] = entry;
}
// Rebuild manifestEntries from merged map
manifestEntries.clear();
manifestEntries.reserve(existing.size());
for (auto& [k, v] : existing) {
manifestEntries.push_back(std::move(v));
}
std::cout << "Merged with existing manifest (" << existing.size() << " total entries)\n";
}
}
// Sort manifest entries for deterministic output
std::sort(manifestEntries.begin(), manifestEntries.end(),
[](const ManifestWriter::FileEntry& a, const ManifestWriter::FileEntry& b) {
return a.wowPath < b.wowPath;
});
// Write manifest
std::string manifestPath = opts.outputDir + "/manifest.json";
// basePath is "." since manifest sits inside the output directory
if (!ManifestWriter::write(manifestPath, ".", manifestEntries)) {
std::cerr << "Failed to write manifest: " << manifestPath << "\n";
@ -723,7 +811,7 @@ bool Extractor::run(const Options& opts) {
std::cout << "Verifying extracted files...\n";
uint64_t verified = 0, verifyFailed = 0;
for (const auto& entry : manifestEntries) {
std::string fsPath = opts.outputDir + "/" + entry.filesystemPath;
std::string fsPath = effectiveOutputDir + "/" + entry.filesystemPath;
std::ifstream f(fsPath, std::ios::binary | std::ios::ate);
if (!f.is_open()) {
std::cerr << " MISSING: " << fsPath << "\n";
@ -764,10 +852,11 @@ bool Extractor::run(const Options& opts) {
if (opts.generateDbcCsv) {
std::cout << "Converting selected DBCs to CSV for committing...\n";
const std::string dbcDir = opts.outputDir + "/db";
const std::string dbcDir = effectiveOutputDir + "/db";
const std::string csvExpansion = overlayMode ? opts.asOverlay : opts.expansion;
const std::string csvDir = !opts.dbcCsvOutputDir.empty()
? opts.dbcCsvOutputDir
: (opts.outputDir + "/expansions/" + opts.expansion + "/db");
: (opts.outputDir + "/expansions/" + csvExpansion + "/db");
uint32_t ok = 0, fail = 0, missing = 0;
for (const auto& base : getUsedDbcNamesForExpansion(opts.expansion)) {
@ -796,8 +885,8 @@ bool Extractor::run(const Options& opts) {
}
}
// Cache WoW.exe for Warden MEM_CHECK responses
{
// Cache WoW.exe for Warden MEM_CHECK responses (base extraction only)
if (!overlayMode) {
const char* exeNames[] = { "WoW.exe", "TurtleWoW.exe", "Wow.exe" };
std::vector<std::string> searchDirs = {
fs::path(opts.mpqDir).parent_path().string(), // WoW.exe is typically next to Data/
@ -821,6 +910,41 @@ bool Extractor::run(const Options& opts) {
}
}
// Auto-update expansion.json with assetManifest field
if (overlayMode) {
std::string expJsonPath = opts.outputDir + "/expansions/" + opts.asOverlay + "/expansion.json";
if (fs::exists(expJsonPath)) {
std::ifstream fin(expJsonPath);
std::string content((std::istreambuf_iterator<char>(fin)),
std::istreambuf_iterator<char>());
fin.close();
if (content.find("\"assetManifest\"") == std::string::npos) {
// Insert assetManifest before the closing brace
size_t lastBrace = content.rfind('}');
if (lastBrace != std::string::npos) {
// Find the last non-whitespace before the closing brace to add comma
size_t pos = lastBrace;
while (pos > 0 && (content[pos - 1] == ' ' || content[pos - 1] == '\n' ||
content[pos - 1] == '\r' || content[pos - 1] == '\t')) {
pos--;
}
std::string insert = ",\n \"assetManifest\": \"overlay/manifest.json\"\n";
content.insert(pos, insert);
std::ofstream fout(expJsonPath);
fout << content;
fout.close();
std::cout << "Updated " << expJsonPath << " with assetManifest\n";
}
} else {
std::cout << "expansion.json already has assetManifest field\n";
}
} else {
std::cerr << "Warning: " << expJsonPath << " not found — create it manually\n";
}
}
std::cout << "Done in " << secs / 60 << "m " << secs % 60 << "s\n";
return true;

View file

@ -26,6 +26,7 @@ public:
bool onlyUsedDbcs = false; // Extract only the DBC files wowee uses (implies DBFilesClient/*.dbc filter)
std::string dbcCsvOutputDir; // When set, write CSVs into this directory instead of outputDir/expansions/<exp>/db
std::string referenceManifest; // If set, only extract files NOT in this manifest (delta extraction)
std::string asOverlay; // If set, extract as overlay for this expansion ID (only files differing from base)
};
struct Stats {

View file

@ -1,4 +1,5 @@
#include "extractor.hpp"
#include <filesystem>
#include <iostream>
#include <string>
#include <cstring>
@ -19,6 +20,11 @@ static void printUsage(const char* prog) {
<< " --skip-dbc Do not extract DBFilesClient/*.dbc (visual assets only)\n"
<< " --dbc-csv Convert selected DBFilesClient/*.dbc to CSV under\n"
<< " <output>/expansions/<expansion>/db/*.csv (for committing)\n"
<< " --as-overlay <id> Extract as expansion overlay (only files differing from base\n"
<< " manifest at <output>/manifest.json). Stores overlay assets in\n"
<< " <output>/expansions/<id>/overlay/ and implies --dbc-csv.\n"
<< " Auto-detected when base manifest already exists.\n"
<< " --full-base Force full base extraction even if manifest exists\n"
<< " --reference-manifest <path>\n"
<< " Only extract files NOT in this manifest (delta extraction)\n"
<< " --dbc-csv-out <dir> Write CSV DBCs into <dir> (overrides default output path)\n"
@ -32,6 +38,7 @@ int main(int argc, char** argv) {
wowee::tools::Extractor::Options opts;
std::string expansion;
std::string locale;
bool forceBase = false;
for (int i = 1; i < argc; ++i) {
if (std::strcmp(argv[i], "--mpq-dir") == 0 && i + 1 < argc) {
@ -52,6 +59,11 @@ int main(int argc, char** argv) {
opts.generateDbcCsv = true;
} else if (std::strcmp(argv[i], "--dbc-csv-out") == 0 && i + 1 < argc) {
opts.dbcCsvOutputDir = argv[++i];
} else if (std::strcmp(argv[i], "--as-overlay") == 0 && i + 1 < argc) {
opts.asOverlay = argv[++i];
opts.generateDbcCsv = true; // Overlay mode always generates per-expansion CSVs
} else if (std::strcmp(argv[i], "--full-base") == 0) {
forceBase = true;
} else if (std::strcmp(argv[i], "--reference-manifest") == 0 && i + 1 < argc) {
opts.referenceManifest = argv[++i];
} else if (std::strcmp(argv[i], "--verify") == 0) {
@ -98,6 +110,20 @@ int main(int argc, char** argv) {
}
opts.locale = locale;
// Auto-detect overlay mode: if a base manifest already exists and this expansion
// has a profile directory, automatically use overlay mode so the user doesn't have
// to think about extraction order.
if (opts.asOverlay.empty() && !forceBase && !opts.onlyUsedDbcs) {
namespace fs = std::filesystem;
std::string baseManifest = opts.outputDir + "/manifest.json";
std::string expJson = opts.outputDir + "/expansions/" + expansion + "/expansion.json";
if (fs::exists(baseManifest) && fs::exists(expJson)) {
opts.asOverlay = expansion;
opts.generateDbcCsv = true;
std::cout << "Base manifest found — auto-overlay mode for " << expansion << "\n";
}
}
std::cout << "=== Wowee Asset Extractor ===\n";
std::cout << "MPQ directory: " << opts.mpqDir << "\n";
std::cout << "Output: " << opts.outputDir << "\n";
@ -118,6 +144,9 @@ int main(int argc, char** argv) {
}
}
if (!opts.asOverlay.empty()) {
std::cout << "Overlay: " << opts.asOverlay << " (only files differing from base)\n";
}
if (!opts.referenceManifest.empty()) {
std::cout << "Reference: " << opts.referenceManifest << " (delta mode)\n";
}