Kelsidavis-WoWee/src/pipeline/dbc_loader.cpp
Kelsi a531f70890 fix(dbc): skip non-array rows in loadJSON instead of failing
A JSON DBC with a malformed record (object instead of array, or
a string entry) would call row[col] which throws on non-arrays —
the outer try-catch treated this as a hard failure for the whole
DBC. Skip the row (stays zero-initialized) so a single malformed
record doesn't lose all the rest.
2026-05-06 10:07:49 -07:00

500 lines
17 KiB
C++

#include "pipeline/dbc_loader.hpp"
#include "core/logger.hpp"
#include <nlohmann/json.hpp>
#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstring>
#include <set>
#include <sstream>
#include <string>
#include <utility>
namespace wowee {
namespace pipeline {
namespace {
/// Returns a copy of `s` without its leading and trailing whitespace, where
/// "whitespace" is whatever std::isspace reports for the current C locale.
std::string trimAscii(std::string s) {
    // std::isspace requires a value representable as unsigned char.
    const auto isBlank = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    size_t first = 0;
    size_t last = s.size();
    for (; first < last && isBlank(s[first]); ++first) {
    }
    for (; last > first && isBlank(s[last - 1]); --last) {
    }
    return s.substr(first, last - first);
}
} // namespace
// Constructor and destructor are compiler-generated; they are defaulted here,
// out of line, in this translation unit.
DBCFile::DBCFile() = default;
DBCFile::~DBCFile() = default;
bool DBCFile::load(const std::vector<uint8_t>& dbcData) {
if (dbcData.empty()) {
LOG_ERROR("DBC data is empty");
return false;
}
// Detect CSV format: starts with '#'
if (dbcData[0] == '#') {
return loadCSV(dbcData);
}
// Detect JSON format: starts with '{'
if (dbcData[0] == '{' || (dbcData[0] <= ' ' && dbcData.size() > 1)) {
size_t start = 0;
while (start < dbcData.size() && dbcData[start] <= ' ') start++;
if (start < dbcData.size() && dbcData[start] == '{') {
return loadJSON(dbcData);
}
}
if (dbcData.size() < sizeof(DBCHeader)) {
LOG_ERROR("DBC data too small for header");
return false;
}
// Read header safely (avoid unaligned reinterpret_cast — UB on strict platforms)
DBCHeader header;
std::memcpy(&header, dbcData.data(), sizeof(DBCHeader));
// Verify magic
if (std::memcmp(header.magic, "WDBC", 4) != 0) {
LOG_ERROR("Invalid DBC magic: ", std::string(header.magic, 4));
return false;
}
recordCount = header.recordCount;
fieldCount = header.fieldCount;
recordSize = header.recordSize;
stringBlockSize = header.stringBlockSize;
// Reject absurd header values up front. Real DBCs cap at ~1M records
// and 1024 fields; large stringBlockSize is up to ~64MB. Multiplying
// these without bounds risks uint32 overflow on the totalRecordSize
// computation below — the resize would be tiny but the memcpy would
// read TB of memory.
if (recordCount > 10'000'000 || fieldCount > 1024 ||
recordSize > 1024 * 4 ||
stringBlockSize > 256u * 1024 * 1024) {
LOG_ERROR("DBC header rejected: recordCount=", recordCount,
" fieldCount=", fieldCount, " recordSize=", recordSize,
" stringBlockSize=", stringBlockSize);
return false;
}
// Validate sizes — use uint64 for the product so the overflow check
// above is the only path that allows a large recordCount * recordSize.
uint64_t expectedSize = sizeof(DBCHeader) +
static_cast<uint64_t>(recordCount) * recordSize +
stringBlockSize;
if (dbcData.size() < expectedSize) {
LOG_ERROR("DBC file truncated: expected ", expectedSize, " bytes, got ", dbcData.size());
return false;
}
// DBC fields are fixed-width uint32 (4 bytes each); record size must match.
// Mismatches indicate a corrupted header or unsupported DBC variant.
if (recordSize != fieldCount * 4) {
LOG_WARNING("DBC record size mismatch: recordSize=", recordSize,
" but fieldCount*4=", fieldCount * 4);
}
LOG_DEBUG("Loading DBC: ", recordCount, " records, ",
fieldCount, " fields, ", recordSize, " bytes/record, ",
stringBlockSize, " string bytes");
// Copy record data. Use size_t for the product so it matches the
// header-validated 64-bit expectedSize math above.
const uint8_t* recordStart = dbcData.data() + sizeof(DBCHeader);
size_t totalRecordSize = static_cast<size_t>(recordCount) * recordSize;
recordData.resize(totalRecordSize);
if (totalRecordSize > 0) {
std::memcpy(recordData.data(), recordStart, totalRecordSize);
}
// Copy string block
const uint8_t* stringStart = recordStart + totalRecordSize;
stringBlock.resize(stringBlockSize);
if (stringBlockSize > 0) {
std::memcpy(stringBlock.data(), stringStart, stringBlockSize);
}
loaded = true;
idCacheBuilt = false;
idToIndexCache.clear();
return true;
}
/// Returns a pointer to the first byte of record `index`, or nullptr when no
/// table is loaded or `index` is not below recordCount.
const uint8_t* DBCFile::getRecord(uint32_t index) const {
    if (!loaded || index >= recordCount) {
        return nullptr;
    }
    // Widen before multiplying: index * recordSize evaluated in 32 bits can
    // wrap for large tables and yield a pointer to the wrong record.
    return recordData.data() + static_cast<size_t>(index) * recordSize;
}
uint32_t DBCFile::getUInt32(uint32_t recordIndex, uint32_t fieldIndex) const {
if (!loaded || recordIndex >= recordCount || fieldIndex >= fieldCount) {
return 0;
}
const uint8_t* record = getRecord(recordIndex);
if (!record) {
return 0;
}
uint32_t value;
std::memcpy(&value, record + (fieldIndex * 4), sizeof(uint32_t));
return value;
}
/// Reads a field through getUInt32() and reinterprets the 32-bit value as
/// signed. Yields 0 whenever getUInt32() yields 0.
int32_t DBCFile::getInt32(uint32_t recordIndex, uint32_t fieldIndex) const {
    const uint32_t raw = getUInt32(recordIndex, fieldIndex);
    return static_cast<int32_t>(raw);
}
float DBCFile::getFloat(uint32_t recordIndex, uint32_t fieldIndex) const {
if (!loaded || recordIndex >= recordCount || fieldIndex >= fieldCount) {
return 0.0f;
}
const uint8_t* record = getRecord(recordIndex);
if (!record) {
return 0.0f;
}
float value;
std::memcpy(&value, record + (fieldIndex * 4), sizeof(float));
return value;
}
/// Owning variant of getStringView(): copies the referenced characters into a
/// new std::string.
std::string DBCFile::getString(uint32_t recordIndex, uint32_t fieldIndex) const {
    const std::string_view view = getStringView(recordIndex, fieldIndex);
    return std::string(view);
}
/// Treats the field's 32-bit value as an offset into the string block and
/// returns a view of the string stored there (see getStringViewByOffset()).
std::string_view DBCFile::getStringView(uint32_t recordIndex, uint32_t fieldIndex) const {
    return getStringViewByOffset(getUInt32(recordIndex, fieldIndex));
}
/// Owning variant of getStringViewByOffset(): copies the referenced characters
/// into a new std::string.
std::string DBCFile::getStringByOffset(uint32_t offset) const {
    const std::string_view view = getStringViewByOffset(offset);
    return std::string(view);
}
/// Returns a view of the string that starts at `offset` in the string block.
///
/// The view ends at the first NUL byte, or at the end of the block if no NUL
/// follows. An empty view is returned when no table is loaded or `offset` is
/// not below stringBlockSize. The view points into stringBlock.
std::string_view DBCFile::getStringViewByOffset(uint32_t offset) const {
    const bool inRange = loaded && offset < stringBlockSize;
    if (!inRange) {
        return {};
    }

    const char* begin = reinterpret_cast<const char*>(stringBlock.data() + offset);
    const size_t remaining = stringBlockSize - offset;

    // Locate the terminator without ever looking past the block's end.
    const void* terminator = std::memchr(begin, '\0', remaining);
    const size_t length = terminator
        ? static_cast<size_t>(static_cast<const char*>(terminator) - begin)
        : remaining;
    return std::string_view(begin, length);
}
/// Looks up the record index for `id` using the ID cache, building the cache
/// on first use. Returns -1 when no table is loaded or the ID is not present.
int32_t DBCFile::findRecordById(uint32_t id) const {
    if (!loaded) {
        return -1;
    }

    // The cache is built on demand and reused until a loader resets it.
    if (!idCacheBuilt) {
        buildIdCache();
    }

    const auto hit = idToIndexCache.find(id);
    return hit == idToIndexCache.end() ? -1 : static_cast<int32_t>(hit->second);
}
/// Rebuilds the ID -> record index cache from field 0 of every record.
/// If an ID occurs in several records, the highest record index is kept,
/// because later assignments overwrite earlier ones.
void DBCFile::buildIdCache() const {
    idToIndexCache.clear();

    uint32_t row = 0;
    while (row < recordCount) {
        // Assume first field is ID
        const uint32_t key = getUInt32(row, 0);
        idToIndexCache[key] = row;
        ++row;
    }

    idCacheBuilt = true;
    LOG_DEBUG("Built DBC ID cache with ", idToIndexCache.size(), " entries");
}
bool DBCFile::loadCSV(const std::vector<uint8_t>& csvData) {
std::string text(reinterpret_cast<const char*>(csvData.data()), csvData.size());
std::istringstream stream(text);
std::string line;
// --- Parse metadata line: # fields=N strings=I,J,K ---
if (!std::getline(stream, line) || line.empty() || line[0] != '#') {
LOG_ERROR("CSV DBC: missing metadata line");
return false;
}
fieldCount = 0;
std::set<uint32_t> stringCols;
// Parse "fields=N"
auto fieldsPos = line.find("fields=");
if (fieldsPos != std::string::npos) {
try {
fieldCount = static_cast<uint32_t>(std::stoul(line.substr(fieldsPos + 7)));
} catch (...) {
fieldCount = 0;
}
}
if (fieldCount == 0) {
LOG_ERROR("CSV DBC: invalid field count");
return false;
}
// Parse "strings=I,J,K"
auto stringsPos = line.find("strings=");
if (stringsPos != std::string::npos) {
std::istringstream ss(line.substr(stringsPos + 8));
std::string tok;
while (std::getline(ss, tok, ',')) {
tok = trimAscii(tok);
if (!tok.empty()) {
try {
stringCols.insert(static_cast<uint32_t>(std::stoul(tok)));
} catch (...) {
LOG_WARNING("CSV DBC: invalid string column index token: '", tok, "'");
}
}
}
}
// Field 0 is always the numeric record ID in DBC files — never a string.
// Some CSV exports incorrectly mark it as a string column; force-remove it.
if (stringCols.erase(0) > 0) {
LOG_DEBUG("CSV DBC: removed field 0 from string columns (always numeric ID)");
}
recordSize = fieldCount * 4;
// --- Build string block with initial null byte ---
stringBlock.clear();
stringBlock.push_back(0); // offset 0 = empty string
// --- Parse data rows ---
struct RowData {
std::vector<uint32_t> fields;
};
std::vector<RowData> rows;
while (std::getline(stream, line)) {
if (line.empty()) continue;
RowData row;
row.fields.resize(fieldCount, 0);
uint32_t col = 0;
size_t pos = 0;
while (col < fieldCount && pos < line.size()) {
if (stringCols.count(col) && pos < line.size() && line[pos] == '"') {
// Quoted string field
pos++; // skip opening quote
std::string str;
while (pos < line.size()) {
if (line[pos] == '"') {
if (pos + 1 < line.size() && line[pos + 1] == '"') {
str += '"'; // escaped quote
pos += 2;
} else {
pos++; // closing quote
break;
}
} else {
str += line[pos++];
}
}
// Skip comma after closing quote
if (pos < line.size() && line[pos] == ',') pos++;
// Store string in string block
if (str.empty()) {
row.fields[col] = 0; // points to empty string at offset 0
} else {
uint32_t offset = static_cast<uint32_t>(stringBlock.size());
stringBlock.insert(stringBlock.end(), str.begin(), str.end());
stringBlock.push_back(0); // null terminator
row.fields[col] = offset;
}
} else if (pos < line.size() && line[pos] == '"') {
// Quoted value in numeric field — skip quotes, try to parse content
pos++; // skip opening quote
std::string str;
while (pos < line.size()) {
if (line[pos] == '"') {
if (pos + 1 < line.size() && line[pos + 1] == '"') {
str += '"';
pos += 2;
} else {
pos++; // closing quote
break;
}
} else {
str += line[pos++];
}
}
if (pos < line.size() && line[pos] == ',') pos++;
if (!str.empty()) {
try {
row.fields[col] = static_cast<uint32_t>(std::stoul(str));
} catch (...) {
row.fields[col] = 0;
}
}
} else {
// Numeric field — read until comma or end of line
size_t end = line.find(',', pos);
if (end == std::string::npos) end = line.size();
std::string tok = line.substr(pos, end - pos);
if (!tok.empty()) {
try {
row.fields[col] = static_cast<uint32_t>(std::stoul(tok));
} catch (...) {
row.fields[col] = 0; // non-numeric value in numeric field
}
}
pos = (end < line.size()) ? end + 1 : line.size();
}
col++;
}
rows.push_back(std::move(row));
}
// --- Build record data (binary layout identical to WDBC) ---
recordCount = static_cast<uint32_t>(rows.size());
stringBlockSize = static_cast<uint32_t>(stringBlock.size());
recordData.resize(static_cast<size_t>(recordCount) * recordSize);
for (uint32_t i = 0; i < recordCount; ++i) {
uint8_t* dst = recordData.data() + static_cast<size_t>(i) * recordSize;
for (uint32_t f = 0; f < fieldCount; ++f) {
uint32_t val = rows[i].fields[f];
std::memcpy(dst + f * 4, &val, 4);
}
}
loaded = true;
idCacheBuilt = false;
idToIndexCache.clear();
LOG_DEBUG("Loaded CSV DBC: ", recordCount, " records, ",
fieldCount, " fields, ", stringCols.size(), " string cols, ",
stringBlockSize, " string bytes");
return true;
}
bool DBCFile::loadJSON(const std::vector<uint8_t>& jsonData) {
try {
auto j = nlohmann::json::parse(jsonData.begin(), jsonData.end());
if (!j.contains("records") || !j["records"].is_array()) {
LOG_ERROR("JSON DBC: missing 'records' array");
return false;
}
const auto& records = j["records"];
if (records.empty()) {
LOG_WARNING("JSON DBC: empty records array");
return false;
}
fieldCount = j.value("fieldCount", 0u);
if (fieldCount == 0 && !records[0].empty()) {
fieldCount = static_cast<uint32_t>(records[0].size());
}
if (fieldCount == 0) return false;
// Sanity caps. Real DBCs cap at ~250 fields and a few million
// records (Spell.dbc is the biggest at ~50K rows). Multi-million
// products would OOM the recordData allocation below.
if (fieldCount > 1024) {
LOG_ERROR("JSON DBC: fieldCount ", fieldCount, " too large");
return false;
}
recordSize = fieldCount * 4;
recordCount = static_cast<uint32_t>(records.size());
if (recordCount > 5'000'000 ||
static_cast<uint64_t>(recordCount) * recordSize > (256ull << 20)) {
LOG_ERROR("JSON DBC: recordCount ", recordCount, " * recordSize ",
recordSize, " exceeds 256MB cap");
return false;
}
stringBlock.clear();
stringBlock.push_back(0);
recordData.resize(static_cast<size_t>(recordCount) * recordSize, 0);
for (uint32_t i = 0; i < recordCount; i++) {
const auto& row = records[i];
// Skip non-array rows (object, string, etc.) — row[col] throws
// on a non-array, which the outer try-catch would treat as a
// hard load failure for the whole file. Empty record stays
// zero-initialized from the resize() above.
if (!row.is_array()) continue;
uint32_t* fields = reinterpret_cast<uint32_t*>(
recordData.data() + static_cast<size_t>(i) * recordSize);
uint32_t cols = std::min(fieldCount, static_cast<uint32_t>(row.size()));
for (uint32_t col = 0; col < cols; col++) {
const auto& val = row[col];
if (val.is_string()) {
const std::string& str = val.get_ref<const std::string&>();
// Cap individual string at 4KB and total stringBlock at
// 64MB to prevent OOM from a malicious JSON DBC stuffing
// huge strings into every field.
if (str.empty()) {
fields[col] = 0;
} else if (str.size() > 4096 ||
stringBlock.size() + str.size() > 64ull * 1024 * 1024) {
fields[col] = 0;
} else {
fields[col] = static_cast<uint32_t>(stringBlock.size());
stringBlock.insert(stringBlock.end(), str.begin(), str.end());
stringBlock.push_back(0);
}
} else if (val.is_number_float()) {
float f = val.get<float>();
if (!std::isfinite(f)) f = 0.0f;
std::memcpy(&fields[col], &f, 4);
} else if (val.is_number_integer()) {
// Range-check: nlohmann throws on out-of-range get<uint32_t>
// (negative or > UINT32_MAX). Catching at the field level
// keeps a single bad cell from killing the whole DBC load.
int64_t raw = val.get<int64_t>();
if (raw < 0 || raw > 0xFFFFFFFFll) raw = 0;
fields[col] = static_cast<uint32_t>(raw);
}
}
}
stringBlockSize = static_cast<uint32_t>(stringBlock.size());
loaded = true;
idCacheBuilt = false;
idToIndexCache.clear();
LOG_INFO("Loaded JSON DBC: ", recordCount, " records, ",
fieldCount, " fields, ", stringBlockSize, " string bytes");
return true;
} catch (const std::exception& e) {
LOG_ERROR("JSON DBC parse error: ", e.what());
return false;
}
}
} // namespace pipeline
} // namespace wowee