Mirror of https://github.com/Kelsidavis/WoWee.git, synced 2026-05-08 18:13:52 +00:00.
A JSON DBC with a malformed record (object instead of array, or a string entry) would call row[col] which throws on non-arrays — the outer try-catch treated this as a hard failure for the whole DBC. Skip the row (stays zero-initialized) so a single malformed record doesn't lose all the rest.
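For illustration (a hypothetical input, not taken from the repo's data), a DBC like the one below previously failed wholesale at the second record; it now loads, with the malformed record left zero-filled:

    {"fieldCount": 3, "records": [[1, "Name", 7], {"id": 2}, [3, "Other", 9]]}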
#include "pipeline/dbc_loader.hpp"
|
|
#include "core/logger.hpp"
|
|
#include <nlohmann/json.hpp>
|
|
#include <cctype>
|
|
#include <cmath>
|
|
#include <cstring>
|
|
#include <set>
|
|
#include <sstream>
|
|
#include <string>
|
|
|
|
namespace wowee {
|
|
namespace pipeline {
|
|
|
|
namespace {

// Trim leading and trailing ASCII whitespace.
std::string trimAscii(std::string s) {
    size_t b = 0;
    while (b < s.size() && std::isspace(static_cast<unsigned char>(s[b]))) {
        ++b;
    }
    size_t e = s.size();
    while (e > b && std::isspace(static_cast<unsigned char>(s[e - 1]))) {
        --e;
    }
    return s.substr(b, e - b);
}

} // namespace

DBCFile::DBCFile() = default;
DBCFile::~DBCFile() = default;
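
// Load a DBC from raw bytes. The container format is detected from the
// first non-whitespace byte: '#' selects the CSV path, '{' the JSON path,
// and anything else is parsed as binary WDBC.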
bool DBCFile::load(const std::vector<uint8_t>& dbcData) {
    if (dbcData.empty()) {
        LOG_ERROR("DBC data is empty");
        return false;
    }

    // Detect CSV format: starts with '#'
    if (dbcData[0] == '#') {
        return loadCSV(dbcData);
    }

    // Detect JSON format: '{' as the first byte, or as the first
    // non-whitespace byte after leading whitespace
    if (dbcData[0] == '{' || (dbcData[0] <= ' ' && dbcData.size() > 1)) {
        size_t start = 0;
        while (start < dbcData.size() && dbcData[start] <= ' ') start++;
        if (start < dbcData.size() && dbcData[start] == '{') {
            return loadJSON(dbcData);
        }
    }

    if (dbcData.size() < sizeof(DBCHeader)) {
        LOG_ERROR("DBC data too small for header");
        return false;
    }
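
    // Binary WDBC layout, as consumed below: a DBCHeader (4-byte "WDBC"
    // magic plus recordCount/fieldCount/recordSize/stringBlockSize),
    // then recordCount * recordSize bytes of fixed-width records, then
    // a block of null-terminated strings addressed by byte offset.
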
    // Read header safely (avoid unaligned reinterpret_cast — UB on strict platforms)
    DBCHeader header;
    std::memcpy(&header, dbcData.data(), sizeof(DBCHeader));

    // Verify magic
    if (std::memcmp(header.magic, "WDBC", 4) != 0) {
        LOG_ERROR("Invalid DBC magic: ", std::string(header.magic, 4));
        return false;
    }

    recordCount = header.recordCount;
    fieldCount = header.fieldCount;
    recordSize = header.recordSize;
    stringBlockSize = header.stringBlockSize;

    // Reject absurd header values up front. Real DBCs stay under ~1M
    // records and 1024 fields, with string blocks up to ~64MB, so these
    // caps are generous. Unbounded values risk integer overflow in the
    // size math below, letting a crafted header pass the truncation
    // check and drive a huge out-of-bounds memcpy.
    if (recordCount > 10'000'000 || fieldCount > 1024 ||
        recordSize > 1024 * 4 ||
        stringBlockSize > 256u * 1024 * 1024) {
        LOG_ERROR("DBC header rejected: recordCount=", recordCount,
                  " fieldCount=", fieldCount, " recordSize=", recordSize,
                  " stringBlockSize=", stringBlockSize);
        return false;
    }

    // Validate the total size in 64-bit arithmetic so the product
    // recordCount * recordSize cannot wrap before the comparison.
    uint64_t expectedSize = sizeof(DBCHeader) +
                            static_cast<uint64_t>(recordCount) * recordSize +
                            stringBlockSize;
    if (dbcData.size() < expectedSize) {
        LOG_ERROR("DBC file truncated: expected ", expectedSize, " bytes, got ", dbcData.size());
        return false;
    }

    // DBC fields are fixed-width uint32 (4 bytes each); record size should
    // match. A mismatch indicates a corrupted header or an unsupported DBC
    // variant, but is tolerated with a warning.
    if (recordSize != fieldCount * 4) {
        LOG_WARNING("DBC record size mismatch: recordSize=", recordSize,
                    " but fieldCount*4=", fieldCount * 4);
    }

    LOG_DEBUG("Loading DBC: ", recordCount, " records, ",
              fieldCount, " fields, ", recordSize, " bytes/record, ",
              stringBlockSize, " string bytes");

    // Copy record data. Use size_t for the product so it matches the
    // header-validated 64-bit expectedSize math above.
    const uint8_t* recordStart = dbcData.data() + sizeof(DBCHeader);
    size_t totalRecordSize = static_cast<size_t>(recordCount) * recordSize;
    recordData.resize(totalRecordSize);
    if (totalRecordSize > 0) {
        std::memcpy(recordData.data(), recordStart, totalRecordSize);
    }

    // Copy string block
    const uint8_t* stringStart = recordStart + totalRecordSize;
    stringBlock.resize(stringBlockSize);
    if (stringBlockSize > 0) {
        std::memcpy(stringBlock.data(), stringStart, stringBlockSize);
    }

    loaded = true;
    idCacheBuilt = false;
    idToIndexCache.clear();

    return true;
}
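
// Typical read path (an illustrative sketch; the ID and field indices
// below are hypothetical and table-specific):
//   DBCFile dbc;
//   if (dbc.load(bytes)) {
//       int32_t idx = dbc.findRecordById(1234);
//       if (idx >= 0) {
//           uint32_t flags = dbc.getUInt32(static_cast<uint32_t>(idx), 2);
//           std::string name = dbc.getString(static_cast<uint32_t>(idx), 1);
//       }
//   }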

const uint8_t* DBCFile::getRecord(uint32_t index) const {
    if (!loaded || index >= recordCount) {
        return nullptr;
    }

    // size_t math: index * recordSize can exceed uint32 range for very
    // large (but header-valid) files.
    return recordData.data() + (static_cast<size_t>(index) * recordSize);
}

uint32_t DBCFile::getUInt32(uint32_t recordIndex, uint32_t fieldIndex) const {
    if (!loaded || recordIndex >= recordCount || fieldIndex >= fieldCount) {
        return 0;
    }

    const uint8_t* record = getRecord(recordIndex);
    if (!record) {
        return 0;
    }

    // memcpy instead of a reinterpret_cast load: safe for any alignment
    // and free of strict-aliasing UB.
    uint32_t value;
    std::memcpy(&value, record + (fieldIndex * 4), sizeof(uint32_t));
    return value;
}

int32_t DBCFile::getInt32(uint32_t recordIndex, uint32_t fieldIndex) const {
    return static_cast<int32_t>(getUInt32(recordIndex, fieldIndex));
}

float DBCFile::getFloat(uint32_t recordIndex, uint32_t fieldIndex) const {
    if (!loaded || recordIndex >= recordCount || fieldIndex >= fieldCount) {
        return 0.0f;
    }

    const uint8_t* record = getRecord(recordIndex);
    if (!record) {
        return 0.0f;
    }

    float value;
    std::memcpy(&value, record + (fieldIndex * 4), sizeof(float));
    return value;
}

std::string DBCFile::getString(uint32_t recordIndex, uint32_t fieldIndex) const {
    return std::string(getStringView(recordIndex, fieldIndex));
}

std::string_view DBCFile::getStringView(uint32_t recordIndex, uint32_t fieldIndex) const {
    uint32_t offset = getUInt32(recordIndex, fieldIndex);
    return getStringViewByOffset(offset);
}

std::string DBCFile::getStringByOffset(uint32_t offset) const {
    return std::string(getStringViewByOffset(offset));
}
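
// String fields hold byte offsets into stringBlock. Offset 0 always maps
// to an empty string (the CSV/JSON builders reserve a leading null byte,
// and binary WDBC string blocks conventionally start with one), so zeroed
// fields read back as "". Out-of-range offsets yield an empty view.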

std::string_view DBCFile::getStringViewByOffset(uint32_t offset) const {
    if (!loaded || offset >= stringBlockSize) {
        return {};
    }

    const char* str = reinterpret_cast<const char*>(stringBlock.data() + offset);
    const char* end = reinterpret_cast<const char*>(stringBlock.data() + stringBlockSize);

    // Scan for the terminator but never past the end of the block, so a
    // truncated or unterminated string block cannot cause an overread.
    size_t length = 0;
    while (str + length < end && str[length] != '\0') {
        length++;
    }

    return std::string_view(str, length);
}

int32_t DBCFile::findRecordById(uint32_t id) const {
    if (!loaded) {
        return -1;
    }

    // Build ID cache if not already built
    if (!idCacheBuilt) {
        buildIdCache();
    }

    auto it = idToIndexCache.find(id);
    if (it != idToIndexCache.end()) {
        return static_cast<int32_t>(it->second);
    }

    return -1;
}

void DBCFile::buildIdCache() const {
    idToIndexCache.clear();

    for (uint32_t i = 0; i < recordCount; i++) {
        uint32_t id = getUInt32(i, 0); // Assume first field is ID
        idToIndexCache[id] = i;
    }

    idCacheBuilt = true;
    LOG_DEBUG("Built DBC ID cache with ", idToIndexCache.size(), " entries");
}
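
// CSV layout expected by this parser (a sketch inferred from the code
// below, not a formal spec): a metadata line such as
//   # fields=5 strings=2,4
// followed by one comma-separated row per record, e.g.
//   1,0,"Display ""Name""",42,"Description"
// Columns listed in strings= are double-quoted with "" as the escape;
// every other column is parsed as an unsigned integer.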

bool DBCFile::loadCSV(const std::vector<uint8_t>& csvData) {
    std::string text(reinterpret_cast<const char*>(csvData.data()), csvData.size());
    std::istringstream stream(text);
    std::string line;

    // --- Parse metadata line: # fields=N strings=I,J,K ---
    if (!std::getline(stream, line) || line.empty() || line[0] != '#') {
        LOG_ERROR("CSV DBC: missing metadata line");
        return false;
    }

    fieldCount = 0;
    std::set<uint32_t> stringCols;

    // Parse "fields=N"
    auto fieldsPos = line.find("fields=");
    if (fieldsPos != std::string::npos) {
        try {
            fieldCount = static_cast<uint32_t>(std::stoul(line.substr(fieldsPos + 7)));
        } catch (...) {
            fieldCount = 0;
        }
    }
    if (fieldCount == 0) {
        LOG_ERROR("CSV DBC: invalid field count");
        return false;
    }

    // Parse "strings=I,J,K"
    auto stringsPos = line.find("strings=");
    if (stringsPos != std::string::npos) {
        std::istringstream ss(line.substr(stringsPos + 8));
        std::string tok;
        while (std::getline(ss, tok, ',')) {
            tok = trimAscii(tok);
            if (!tok.empty()) {
                try {
                    stringCols.insert(static_cast<uint32_t>(std::stoul(tok)));
                } catch (...) {
                    LOG_WARNING("CSV DBC: invalid string column index token: '", tok, "'");
                }
            }
        }
    }

    // Field 0 is always the numeric record ID in DBC files — never a string.
    // Some CSV exports incorrectly mark it as a string column; force-remove it.
    if (stringCols.erase(0) > 0) {
        LOG_DEBUG("CSV DBC: removed field 0 from string columns (always numeric ID)");
    }

    recordSize = fieldCount * 4;

    // --- Build string block with initial null byte ---
    stringBlock.clear();
    stringBlock.push_back(0); // offset 0 = empty string

    // --- Parse data rows ---
    struct RowData {
        std::vector<uint32_t> fields;
    };
    std::vector<RowData> rows;

    while (std::getline(stream, line)) {
        if (line.empty()) continue;

        RowData row;
        row.fields.resize(fieldCount, 0);

        uint32_t col = 0;
        size_t pos = 0;

        while (col < fieldCount && pos < line.size()) {
            if (stringCols.count(col) && pos < line.size() && line[pos] == '"') {
                // Quoted string field
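                // A doubled quote inside a quoted cell is an escape:
                // the cell "say ""hi""" decodes to the string: say "hi"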
                pos++; // skip opening quote
                std::string str;
                while (pos < line.size()) {
                    if (line[pos] == '"') {
                        if (pos + 1 < line.size() && line[pos + 1] == '"') {
                            str += '"'; // escaped quote
                            pos += 2;
                        } else {
                            pos++; // closing quote
                            break;
                        }
                    } else {
                        str += line[pos++];
                    }
                }
                // Skip comma after closing quote
                if (pos < line.size() && line[pos] == ',') pos++;

                // Store string in string block
                if (str.empty()) {
                    row.fields[col] = 0; // points to empty string at offset 0
                } else {
                    uint32_t offset = static_cast<uint32_t>(stringBlock.size());
                    stringBlock.insert(stringBlock.end(), str.begin(), str.end());
                    stringBlock.push_back(0); // null terminator
                    row.fields[col] = offset;
                }
            } else if (pos < line.size() && line[pos] == '"') {
                // Quoted value in numeric field — skip quotes, try to parse content
                pos++; // skip opening quote
                std::string str;
                while (pos < line.size()) {
                    if (line[pos] == '"') {
                        if (pos + 1 < line.size() && line[pos + 1] == '"') {
                            str += '"';
                            pos += 2;
                        } else {
                            pos++; // closing quote
                            break;
                        }
                    } else {
                        str += line[pos++];
                    }
                }
                if (pos < line.size() && line[pos] == ',') pos++;
                if (!str.empty()) {
                    try {
                        row.fields[col] = static_cast<uint32_t>(std::stoul(str));
                    } catch (...) {
                        row.fields[col] = 0;
                    }
                }
            } else {
                // Numeric field — read until comma or end of line
                size_t end = line.find(',', pos);
                if (end == std::string::npos) end = line.size();
                std::string tok = line.substr(pos, end - pos);
                if (!tok.empty()) {
                    try {
                        row.fields[col] = static_cast<uint32_t>(std::stoul(tok));
                    } catch (...) {
                        row.fields[col] = 0; // non-numeric value in numeric field
                    }
                }
                pos = (end < line.size()) ? end + 1 : line.size();
            }
            col++;
        }

        rows.push_back(std::move(row));
    }

    // --- Build record data (binary layout identical to WDBC) ---
    recordCount = static_cast<uint32_t>(rows.size());
    stringBlockSize = static_cast<uint32_t>(stringBlock.size());

    recordData.resize(static_cast<size_t>(recordCount) * recordSize);
    for (uint32_t i = 0; i < recordCount; ++i) {
        uint8_t* dst = recordData.data() + static_cast<size_t>(i) * recordSize;
        for (uint32_t f = 0; f < fieldCount; ++f) {
            uint32_t val = rows[i].fields[f];
            std::memcpy(dst + f * 4, &val, 4);
        }
    }

    loaded = true;
    idCacheBuilt = false;
    idToIndexCache.clear();

    LOG_DEBUG("Loaded CSV DBC: ", recordCount, " records, ",
              fieldCount, " fields, ", stringCols.size(), " string cols, ",
              stringBlockSize, " string bytes");
    return true;
}
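
// Expected JSON shape (a sketch inferred from the accesses below):
//   { "fieldCount": 3,
//     "records": [ [1, "Name", 2.5],
//                  [2, "Other", 0.0] ] }
// Strings are appended to the string block, floats are bit-copied into
// the 4-byte cell, integers are range-checked into uint32, and
// "fieldCount" is optional, defaulting to the width of the first record.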

bool DBCFile::loadJSON(const std::vector<uint8_t>& jsonData) {
    try {
        auto j = nlohmann::json::parse(jsonData.begin(), jsonData.end());

        if (!j.contains("records") || !j["records"].is_array()) {
            LOG_ERROR("JSON DBC: missing 'records' array");
            return false;
        }

        const auto& records = j["records"];
        if (records.empty()) {
            LOG_WARNING("JSON DBC: empty records array");
            return false;
        }

        fieldCount = j.value("fieldCount", 0u);
        if (fieldCount == 0 && !records[0].empty()) {
            fieldCount = static_cast<uint32_t>(records[0].size());
        }
        if (fieldCount == 0) return false;

        // Sanity caps. Real DBCs stay around ~250 fields and tens of
        // thousands of rows (Spell.dbc is the biggest at ~50K), so these
        // limits are generous; multi-million-cell products would OOM the
        // recordData allocation below.
        if (fieldCount > 1024) {
            LOG_ERROR("JSON DBC: fieldCount ", fieldCount, " too large");
            return false;
        }

        recordSize = fieldCount * 4;
        recordCount = static_cast<uint32_t>(records.size());
        if (recordCount > 5'000'000 ||
            static_cast<uint64_t>(recordCount) * recordSize > (256ull << 20)) {
            LOG_ERROR("JSON DBC: recordCount ", recordCount, " * recordSize ",
                      recordSize, " exceeds 256MB cap");
            return false;
        }

        stringBlock.clear();
        stringBlock.push_back(0);

        recordData.resize(static_cast<size_t>(recordCount) * recordSize, 0);

        for (uint32_t i = 0; i < recordCount; i++) {
            const auto& row = records[i];
            // Skip non-array rows (object, string, etc.) — row[col] throws
            // on a non-array, which the outer try-catch would treat as a
            // hard load failure for the whole file. A skipped record stays
            // zero-initialized from the resize() above.
            if (!row.is_array()) continue;
            uint32_t* fields = reinterpret_cast<uint32_t*>(
                recordData.data() + static_cast<size_t>(i) * recordSize);

            uint32_t cols = std::min(fieldCount, static_cast<uint32_t>(row.size()));
            for (uint32_t col = 0; col < cols; col++) {
                const auto& val = row[col];
                if (val.is_string()) {
                    const std::string& str = val.get_ref<const std::string&>();
                    // Cap individual strings at 4KB and the total
                    // stringBlock at 64MB to prevent OOM from a malicious
                    // JSON DBC stuffing huge strings into every field.
                    if (str.empty()) {
                        fields[col] = 0;
                    } else if (str.size() > 4096 ||
                               stringBlock.size() + str.size() > 64ull * 1024 * 1024) {
                        fields[col] = 0;
                    } else {
                        fields[col] = static_cast<uint32_t>(stringBlock.size());
                        stringBlock.insert(stringBlock.end(), str.begin(), str.end());
                        stringBlock.push_back(0);
                    }
                } else if (val.is_number_float()) {
                    float f = val.get<float>();
                    if (!std::isfinite(f)) f = 0.0f;
                    std::memcpy(&fields[col], &f, 4);
                } else if (val.is_number_integer()) {
                    // Range-check: zero out negative or > UINT32_MAX values
                    // so a single bad cell cannot corrupt the record or
                    // abort the whole DBC load.
                    int64_t raw = val.get<int64_t>();
                    if (raw < 0 || raw > 0xFFFFFFFFll) raw = 0;
                    fields[col] = static_cast<uint32_t>(raw);
                }
            }
        }

        stringBlockSize = static_cast<uint32_t>(stringBlock.size());
        loaded = true;
        idCacheBuilt = false;
        idToIndexCache.clear();

        LOG_INFO("Loaded JSON DBC: ", recordCount, " records, ",
                 fieldCount, " fields, ", stringBlockSize, " string bytes");
        return true;
    } catch (const std::exception& e) {
        LOG_ERROR("JSON DBC parse error: ", e.what());
        return false;
    }
}

} // namespace pipeline
} // namespace wowee