mirror of
https://github.com/Kelsidavis/WoWee.git
synced 2026-03-22 23:30:14 +00:00
tools: fix DBC string-column detection false positives in both dbc_to_csv and asset_extract
The string-column auto-detector in both tools had two gaps that caused small integer fields (RaceID=1, SexID=0/1, BaseSection, ColorIndex) to be falsely classified as string columns, corrupting the generated CSVs:
1. No boundary check: a value of N was accepted as a valid string offset even when N landed inside a longer string (e.g. offset 3 inside "Character\..."). Fix: precompute valid string-start boundaries (offset 0 plus every position immediately after a null byte); reject offsets that are not boundaries.
2. No diversity check: a column whose only non-zero value is 1 would pass the boundary test because offset 1 is always a valid boundary (it follows the mandatory null at offset 0). Fix: require at least 2 distinct non-empty string values before marking a column as a string column. Columns like SexID (every value is 0 or 1, so the column only ever resolves to the empty string and one single path fragment) are integer fields, not string fields.
Both dbc_to_csv and asset_extract now produce correct column metadata, e.g. CharSections.dbc yields "strings=6,7,8" instead of "strings=0,1,...,9".
This commit is contained in:
parent
5b06a62d91
commit
b31a2a66b6
2 changed files with 47 additions and 4 deletions
|
|
@ -76,8 +76,29 @@ static std::vector<uint8_t> readFileBytes(const std::string& path) {
|
||||||
return buf;
|
return buf;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool isValidStringOffset(const std::vector<uint8_t>& stringBlock, uint32_t offset) {
|
// Precompute the set of valid string-boundary offsets in the string block.
|
||||||
|
// An offset is a valid boundary if it is 0 or immediately follows a null byte.
|
||||||
|
// This prevents small integer values (e.g. RaceID=1, 2, 3) from being falsely
|
||||||
|
// detected as string offsets just because they land in the middle of a longer
|
||||||
|
// string that starts at a lower offset.
|
||||||
|
// Returns the set of offsets in stringBlock at which a string can begin:
// offset 0, plus i + 1 for every zero byte found at index i, provided
// i + 1 is still inside the block. An empty block yields an empty set.
static std::set<uint32_t> computeStringBoundaries(const std::vector<uint8_t>& stringBlock) {
|
||||||
|
std::set<uint32_t> boundaries;
|
||||||
|
// Nothing to scan: no offset is a valid start in an empty block.
if (stringBlock.empty()) return boundaries;
|
||||||
|
// The first byte of a non-empty block is always a string start.
boundaries.insert(0);
|
||||||
|
// The `i + 1 < size()` bound stops one byte early, so a zero byte in the
// final position never produces a boundary equal to size() (out of range).
for (size_t i = 0; i + 1 < stringBlock.size(); ++i) {
|
||||||
|
if (stringBlock[i] == 0) {
|
||||||
|
// The byte right after a null terminator starts the next string.
boundaries.insert(static_cast<uint32_t>(i + 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return boundaries;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool isValidStringOffset(const std::vector<uint8_t>& stringBlock,
|
||||||
|
const std::set<uint32_t>& boundaries,
|
||||||
|
uint32_t offset) {
|
||||||
if (offset >= stringBlock.size()) return false;
|
if (offset >= stringBlock.size()) return false;
|
||||||
|
// Must start at a string boundary (offset 0 or right after a null byte).
|
||||||
|
if (!boundaries.count(offset)) return false;
|
||||||
for (size_t i = offset; i < stringBlock.size(); ++i) {
|
for (size_t i = offset; i < stringBlock.size(); ++i) {
|
||||||
uint8_t c = stringBlock[i];
|
uint8_t c = stringBlock[i];
|
||||||
if (c == 0) return true;
|
if (c == 0) return true;
|
||||||
|
|
@ -105,21 +126,33 @@ static std::set<uint32_t> detectStringColumns(const DBCFile& dbc,
|
||||||
std::set<uint32_t> cols;
|
std::set<uint32_t> cols;
|
||||||
if (stringBlock.size() <= 1) return cols;
|
if (stringBlock.size() <= 1) return cols;
|
||||||
|
|
||||||
|
auto boundaries = computeStringBoundaries(stringBlock);
|
||||||
|
|
||||||
for (uint32_t col = 0; col < fieldCount; ++col) {
|
for (uint32_t col = 0; col < fieldCount; ++col) {
|
||||||
bool allZeroOrValid = true;
|
bool allZeroOrValid = true;
|
||||||
bool hasNonZero = false;
|
bool hasNonZero = false;
|
||||||
|
std::set<std::string> distinctStrings;
|
||||||
|
|
||||||
for (uint32_t row = 0; row < recordCount; ++row) {
|
for (uint32_t row = 0; row < recordCount; ++row) {
|
||||||
uint32_t val = dbc.getUInt32(row, col);
|
uint32_t val = dbc.getUInt32(row, col);
|
||||||
if (val == 0) continue;
|
if (val == 0) continue;
|
||||||
hasNonZero = true;
|
hasNonZero = true;
|
||||||
if (!isValidStringOffset(stringBlock, val)) {
|
if (!isValidStringOffset(stringBlock, boundaries, val)) {
|
||||||
allZeroOrValid = false;
|
allZeroOrValid = false;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
// Collect distinct non-empty strings for diversity check.
|
||||||
|
const char* s = reinterpret_cast<const char*>(stringBlock.data() + val);
|
||||||
|
if (*s != '\0') {
|
||||||
|
distinctStrings.insert(std::string(s, strnlen(s, 256)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (allZeroOrValid && hasNonZero) {
|
// Require at least 2 distinct non-empty string values. Columns that
|
||||||
|
// only ever point to a single string (e.g. SexID=1 always resolves to
|
||||||
|
// the same path fragment at offset 1 in the block) are almost certainly
|
||||||
|
// integer fields whose small values accidentally land at a string boundary.
|
||||||
|
if (allZeroOrValid && hasNonZero && distinctStrings.size() >= 2) {
|
||||||
cols.insert(col);
|
cols.insert(col);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -104,6 +104,7 @@ std::set<uint32_t> detectStringColumns(const DBCFile& dbc,
|
||||||
for (uint32_t col = 0; col < fieldCount; ++col) {
|
for (uint32_t col = 0; col < fieldCount; ++col) {
|
||||||
bool allZeroOrValid = true;
|
bool allZeroOrValid = true;
|
||||||
bool hasNonZero = false;
|
bool hasNonZero = false;
|
||||||
|
std::set<std::string> distinctStrings;
|
||||||
|
|
||||||
for (uint32_t row = 0; row < recordCount; ++row) {
|
for (uint32_t row = 0; row < recordCount; ++row) {
|
||||||
uint32_t val = dbc.getUInt32(row, col);
|
uint32_t val = dbc.getUInt32(row, col);
|
||||||
|
|
@ -113,9 +114,18 @@ std::set<uint32_t> detectStringColumns(const DBCFile& dbc,
|
||||||
allZeroOrValid = false;
|
allZeroOrValid = false;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
// Collect distinct non-empty strings for diversity check.
|
||||||
|
const char* s = reinterpret_cast<const char*>(stringBlock.data() + val);
|
||||||
|
if (*s != '\0') {
|
||||||
|
distinctStrings.insert(std::string(s, strnlen(s, 256)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (allZeroOrValid && hasNonZero) {
|
// Require at least 2 distinct non-empty string values. Columns that
|
||||||
|
// only ever point to a single string (e.g. SexID=1 always resolves to
|
||||||
|
// the same path fragment at offset 1 in the block) are almost certainly
|
||||||
|
// integer fields whose small values accidentally land at a string boundary.
|
||||||
|
if (allZeroOrValid && hasNonZero && distinctStrings.size() >= 2) {
|
||||||
stringCols.insert(col);
|
stringCols.insert(col);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue