feat(unicode): add UTF string length functions

This commit is contained in:
phaneron 2025-04-07 22:25:03 -04:00
parent ce9c709029
commit 68c162cde1
3 changed files with 166 additions and 2 deletions

View file

@ -334,3 +334,125 @@ done:
}
return result;
}
// Computes the number of UTF-8 bytes needed to encode a null-terminated UTF-16 string,
// without producing any output.
//
// src         - UTF-16 code units to measure.
// srcmaxchars - maximum number of code units to read. When equal to STORM_MAX_STR the end
//               pointer is set to the highest representable address, so only the terminator
//               stops the scan.
// srcchars    - optional; receives the number of code units consumed, terminator included.
//
// Returns the byte count (the terminator's byte included) once a null code unit is reached.
// Returns -1 when src is null, srcmaxchars is 0, a leading surrogate is the last unit inside
// the limit, or the limit is reached before a terminator is found.
int32_t SUniConvertUTF16to8Len(const uint16_t* src, uint32_t srcmaxchars, uint32_t* srcchars) {
    if (!srcmaxchars || !src) {
        if (srcchars) {
            *srcchars = 0;
        }
        return -1;
    }

    auto srcend = srcmaxchars == STORM_MAX_STR ? reinterpret_cast<const uint16_t*>(UINTPTR_MAX) : &src[srcmaxchars];
    auto srcstart = src;

    // Must start at zero: every branch below accumulates into it.
    int32_t result = 0;

    while (src < srcend) {
        auto grapheme = static_cast<uint32_t>(src[0]);

        // Number of UTF-16 code units this code point occupies in the source.
        uint32_t units = 1;

        // Leading (high) surrogate: 0xD800..0xDBFF
        if (0xD7FF < grapheme && grapheme < 0xDC00) {
            // The trailing half would lie outside the readable range.
            if (src + 1 >= srcend) {
                goto fail;
            }

            auto char2 = static_cast<uint32_t>(src[1]);

            // Trailing (low) surrogate: 0xDC00..0xDFFF
            if (0xDBFF < char2 && char2 < 0xE000) {
                // Same value as ((hi - 0xD800) << 10) + (lo - 0xDC00) + 0x10000
                grapheme = ((grapheme - 0xD7F7) * 1024) + char2;
                units = 2;
            }
            // An unpaired leading surrogate is measured as a lone 16-bit value.
        }

        // Consume the unit(s) before classifying, so the terminator is also reported
        // as consumed through srcchars.
        src += units;

        if (grapheme < 0x80) {
            result++;

            if (grapheme == 0) {
                goto done;
            }
        } else if (grapheme < 0x800) {
            result += 2;
        } else if (grapheme < 0x10000) {
            result += 3;
        } else if (grapheme < 0x200000) {
            result += 4;
        } else if (grapheme < 0x4000000) {
            result += 5;
        } else if (grapheme > 0x7FFFFFFF) {
            // Out-of-range values are counted as a two-byte sequence.
            result += 2;
        } else {
            result += 6;
        }
    }

    // Falling out of the loop means the limit was hit before any terminator.
fail:
    result = -1;

done:
    if (srcchars) {
        *srcchars = static_cast<uint32_t>(src - srcstart);
    }

    return result;
}
// Counts the UTF-16 code units that a null-terminated UTF-8 string would occupy, without
// producing any output.
//
// src         - UTF-8 bytes to measure.
// srcmaxchars - maximum number of bytes to read. When equal to STORM_MAX_STR the end pointer
//               is set to the highest representable address.
// srcchars    - optional; receives the number of bytes consumed.
//
// Returns the unit count (terminator included) when the terminator is reached.
// Returns -1 when src is null, srcmaxchars is 0, or the limit is reached with no terminator.
// Returns -1 - N when a sequence with N trailing bytes does not fit inside the limit.
int32_t SUniConvertUTF8to16Len(const uint8_t* src, uint32_t srcmaxchars, uint32_t* srcchars) {
    const auto first = src;

    // Every exit path reports the consumed byte count the same way.
    auto finish = [&](int32_t status) -> int32_t {
        if (srcchars) {
            *srcchars = static_cast<uint32_t>(src - first);
        }
        return status;
    };

    if (!src || !srcmaxchars) {
        // Nothing has been consumed yet, so this reports 0.
        return finish(-1);
    }

    const uint8_t* limit;
    if (srcmaxchars == STORM_MAX_STR) {
        limit = reinterpret_cast<const uint8_t*>(UINTPTR_MAX);
    } else {
        limit = src + srcmaxchars;
    }

    int32_t units = 0;

    while (src < limit) {
        // Looked up from the lead byte; used below as the count of continuation bytes.
        const auto trailing = bytesFromUTF8[*src];

        // The last byte of this sequence would fall at or beyond the limit.
        if (src + trailing >= limit) {
            return finish(-1 - trailing);
        }

        // Accumulate the raw bytes six bits at a time; each case deliberately falls through.
        uint32_t codepoint = 0;
        switch (trailing) {
        case 5:
            codepoint = (codepoint + *src++) << 6;
            // fall through
        case 4:
            codepoint = (codepoint + *src++) << 6;
            // fall through
        case 3:
            codepoint = (codepoint + *src++) << 6;
            // fall through
        case 2:
            codepoint = (codepoint + *src++) << 6;
            // fall through
        case 1:
            codepoint = (codepoint + *src++) << 6;
            // fall through
        case 0:
            codepoint = (codepoint + *src++) - offsetsFromUTF8[trailing];
        }

        if (codepoint >= 0x10000) {
            // Two units below 0x110000; anything above that is counted as one unit.
            units += (codepoint < 0x110000) ? 2 : 1;
            continue;
        }

        ++units;

        // Termination test exactly as written in the original.
        // NOTE(review): presumably only a single zero byte satisfies this - confirm against the table.
        if (codepoint == offsetsFromUTF8[trailing]) {
            return finish(units);
        }
    }

    // The limit was exhausted without meeting a terminator.
    return finish(-1);
}

View file

@ -7,8 +7,12 @@ uint32_t SUniSGetUTF8(const uint8_t* strptr, int32_t* chars);
// Writes the code point c to strptr (presumably as UTF-8, per the name; definition not visible here).
void SUniSPutUTF8(uint32_t c, char* strptr);
// Converting variants: take a destination buffer with its capacity, a source buffer with its
// limit, and optional dstchars / srcchars out-parameters.
int32_t SUniConvertUTF16to8(uint8_t* dst, uint32_t dstmaxchars, const uint16_t* src, uint32_t srcmaxchars, uint32_t* dstchars, uint32_t* srcchars);
int32_t SUniConvertUTF8to16(uint16_t* dst, uint32_t dstmaxchars, const uint8_t* src, uint32_t srcmaxchars, uint32_t* dstchars, uint32_t* srcchars);
// NOTE(review): this repeats the SUniConvertUTF16to8 declaration above; harmless but redundant.
int32_t SUniConvertUTF16to8(uint8_t* dst, uint32_t dstmaxchars, const uint16_t* src, uint32_t srcmaxchars, uint32_t* dstchars, uint32_t* srcchars);
// Length-only variants: take no destination buffer. They return a count, or a negative value
// on failure, and store the number of consumed source units in srcchars when it is non-null.
int32_t SUniConvertUTF16to8Len(const uint16_t* src, uint32_t srcmaxchars, uint32_t* srcchars);
int32_t SUniConvertUTF8to16Len(const uint8_t* src, uint32_t srcmaxchars, uint32_t* srcchars);
#endif

View file

@ -96,3 +96,41 @@ TEST_CASE("SUniConvertUTF8to16", "[unicode]") {
REQUIRE(widechars[2] == 0x0000);
}
}
TEST_CASE("SUniConvertUTF8to16Len", "[unicode]") {
    // Two three-byte UTF-8 sequences (U+6C49, U+5B57) followed by a null terminator.
    const uint8_t text[] = { 0xE6, 0xB1, 0x89, 0xE5, 0xAD, 0x97, 0x00 };

    SECTION("fail with the correct result") {
        struct Expectation {
            uint32_t limit;    // srcmaxchars passed in
            int32_t  status;   // expected return value
            uint32_t consumed; // expected *srcchars
        };

        // A limit that cuts a sequence short yields -3; a limit that ends on a sequence
        // boundary before the terminator yields -1.
        const Expectation expectations[] = {
            { 2, -3, 0 },
            { 3, -1, 3 },
            { 4, -3, 3 },
            { 5, -3, 3 },
            { 6, -1, 6 },
        };

        for (const auto& expected : expectations) {
            uint32_t consumed;
            const int32_t status = SUniConvertUTF8to16Len(text, expected.limit, &consumed);

            REQUIRE(status == expected.status);
            REQUIRE(consumed == expected.consumed);
        }
    }

    SECTION("get length correctly") {
        uint32_t consumed;
        const int32_t length = SUniConvertUTF8to16Len(text, 7, &consumed);

        // Two BMP code points plus the terminator; every source byte is consumed.
        REQUIRE(length == 3);
        REQUIRE(consumed == 7);
    }
}