From 68c162cde1991dc6c8a9d3866cbeeab809bf4b06 Mon Sep 17 00:00:00 2001
From: superp00t
Date: Mon, 7 Apr 2025 22:25:03 -0400
Subject: [PATCH] feat(unicode): add UTF string length functions

---
 storm/Unicode.cpp | 137 ++++++++++++++++++++++++++++++++++++++++++++++
 storm/Unicode.hpp |   8 ++-
 test/Unicode.cpp  |  58 ++++++++++++++++++++
 3 files changed, 201 insertions(+), 2 deletions(-)

diff --git a/storm/Unicode.cpp b/storm/Unicode.cpp
index de7611d..bcd08a1 100644
--- a/storm/Unicode.cpp
+++ b/storm/Unicode.cpp
@@ -334,3 +334,140 @@ done:
     }
     return result;
 }
+
+// Returns the number of UTF-8 bytes needed to encode the null-terminated UTF-16
+// string at src, counting the terminator, without writing any output.
+//
+// srcmaxchars limits how many UTF-16 units may be read; STORM_MAX_STR removes the
+// limit. If srcchars is non-null, it receives the number of units consumed.
+//
+// Returns -1 when src is null, srcmaxchars is zero, a high surrogate is the last
+// unit in range, or no terminator is found in range.
+int32_t SUniConvertUTF16to8Len(const uint16_t* src, uint32_t srcmaxchars, uint32_t* srcchars) {
+    if (!srcmaxchars || !src) {
+        if (srcchars) {
+            *srcchars = 0;
+        }
+        return -1;
+    }
+
+    auto srcend = srcmaxchars == STORM_MAX_STR ? reinterpret_cast<const uint16_t*>(UINTPTR_MAX) : &src[srcmaxchars];
+    auto srcstart = src;
+
+    int32_t result = 0;
+
+    while (src < srcend) {
+        auto grapheme = static_cast<uint32_t>(src[0]);
+        if (0xD7FF < grapheme && grapheme < 0xDC00) {
+            if (src + 1 >= srcend) {
+                goto fail;
+            }
+            auto char2 = static_cast<uint32_t>(src[1]);
+            if (0xDBFF < char2 && char2 < 0xE000) {
+                // Fold the surrogate pair into one code point and consume the low half
+                grapheme = ((grapheme - 0xD7F7) * 1024) + char2;
+                src++;
+            }
+        }
+        src++;
+
+        if (grapheme < 0x80) {
+            result++;
+            if (grapheme == 0) {
+                goto done;
+            }
+        } else if (grapheme < 0x800) {
+            result += 2;
+        } else if (grapheme < 0x10000) {
+            result += 3;
+        } else if (grapheme < 0x200000) {
+            result += 4;
+        } else if (grapheme < 0x4000000) {
+            result += 5;
+        } else if (grapheme <= 0x7FFFFFFF) {
+            result += 6;
+        } else {
+            result += 2;
+        }
+    }
+
+fail:
+    // Reached on a truncated surrogate pair or when the loop ends without a terminator
+    result = -1;
+
+done:
+    if (srcchars) {
+        *srcchars = src - srcstart;
+    }
+
+    return result;
+}
+
+// Returns the number of UTF-16 units needed to encode the null-terminated UTF-8
+// string at src, counting the terminator, without writing any output.
+//
+// srcmaxchars limits how many bytes may be read; STORM_MAX_STR removes the limit.
+// If srcchars is non-null, it receives the number of bytes consumed.
+//
+// Returns -1 when src is null, srcmaxchars is zero, or the input runs out first.
+// When a sequence extends past the limit, returns -1 - bytesFromUTF8[lead byte].
+int32_t SUniConvertUTF8to16Len(const uint8_t* src, uint32_t srcmaxchars, uint32_t* srcchars) {
+    if (!srcmaxchars || !src) {
+        if (srcchars) {
+            *srcchars = 0;
+        }
+        return -1;
+    }
+
+    auto srcend = srcmaxchars == STORM_MAX_STR ? reinterpret_cast<const uint8_t*>(UINTPTR_MAX) : src + srcmaxchars;
+    auto srcstart = src;
+    int32_t result = 0;
+
+    while (src < srcend) {
+        auto bytes = bytesFromUTF8[*src];
+        if ((src + bytes) >= srcend) {
+            result = -1 - bytes;
+            goto done;
+        }
+
+        uint32_t grapheme = 0;
+
+        // Cases fall through on purpose: each one consumes one byte of the sequence
+        switch (bytes) {
+        case 5:
+            grapheme = (grapheme + *src++) << 6;
+        case 4:
+            grapheme = (grapheme + *src++) << 6;
+        case 3:
+            grapheme = (grapheme + *src++) << 6;
+        case 2:
+            grapheme = (grapheme + *src++) << 6;
+        case 1:
+            grapheme = (grapheme + *src++) << 6;
+        case 0:
+            grapheme = (grapheme + *src++) - offsetsFromUTF8[bytes];
+        }
+
+        if (grapheme < 0x10000) {
+            result++;
+            // NOTE(review): acts as the terminator test if offsetsFromUTF8[0] == 0 - confirm
+            if (grapheme == offsetsFromUTF8[bytes]) {
+                goto done;
+            }
+        } else if (grapheme < 0x110000) {
+            result += 2;
+        } else {
+            result++;
+        }
+    }
+
+    // Ran out of input without hitting the stop condition above
+    result = -1;
+
+done:
+    if (srcchars) {
+        *srcchars = src - srcstart;
+    }
+
+    return result;
+}
diff --git a/storm/Unicode.hpp b/storm/Unicode.hpp
index bf6efc8..0beaad6 100644
--- a/storm/Unicode.hpp
+++ b/storm/Unicode.hpp
@@ -7,8 +7,12 @@ uint32_t SUniSGetUTF8(const uint8_t* strptr, int32_t* chars);
 
 void SUniSPutUTF8(uint32_t c, char* strptr);
 
-int32_t SUniConvertUTF16to8(uint8_t* dst, uint32_t dstmaxchars, const uint16_t* src, uint32_t srcmaxchars, uint32_t* dstchars, uint32_t* srcchars);
-
 int32_t SUniConvertUTF8to16(uint16_t* dst, uint32_t dstmaxchars, const uint8_t* src, uint32_t srcmaxchars, uint32_t* dstchars, uint32_t* srcchars);
 
+int32_t SUniConvertUTF16to8(uint8_t* dst, uint32_t dstmaxchars, const uint16_t* src, uint32_t srcmaxchars, uint32_t* dstchars, uint32_t* srcchars);
+
+int32_t SUniConvertUTF16to8Len(const uint16_t* src, uint32_t srcmaxchars, uint32_t* srcchars);
+
+int32_t SUniConvertUTF8to16Len(const uint8_t* src, uint32_t srcmaxchars, uint32_t* srcchars);
+
 #endif
diff --git a/test/Unicode.cpp b/test/Unicode.cpp
index 73d2425..237ee20 100644
--- a/test/Unicode.cpp
+++ b/test/Unicode.cpp
@@ -96,3 +96,61 @@ TEST_CASE("SUniConvertUTF8to16", "[unicode]") {
         REQUIRE(widechars[2] == 0x0000);
     }
 }
+
+TEST_CASE("SUniConvertUTF8to16Len", "[unicode]") {
+    SECTION("fail with the correct result") {
+        uint8_t chars[] = { 0xE6, 0xB1, 0x89, 0xE5, 0xAD, 0x97, 0x00 };
+        uint32_t srcchars;
+
+        int32_t result;
+
+        result = SUniConvertUTF8to16Len(chars, 2, &srcchars);
+        REQUIRE(result == -3);
+        REQUIRE(srcchars == 0);
+
+        result = SUniConvertUTF8to16Len(chars, 3, &srcchars);
+        REQUIRE(result == -1);
+        REQUIRE(srcchars == 3);
+
+        result = SUniConvertUTF8to16Len(chars, 4, &srcchars);
+        REQUIRE(result == -3);
+        REQUIRE(srcchars == 3);
+
+        result = SUniConvertUTF8to16Len(chars, 5, &srcchars);
+        REQUIRE(result == -3);
+        REQUIRE(srcchars == 3);
+
+        result = SUniConvertUTF8to16Len(chars, 6, &srcchars);
+        REQUIRE(result == -1);
+        REQUIRE(srcchars == 6);
+    }
+
+    SECTION("get length correctly") {
+        uint8_t chars[] = { 0xE6, 0xB1, 0x89, 0xE5, 0xAD, 0x97, 0x00 };
+        uint32_t srcchars;
+
+        auto result = SUniConvertUTF8to16Len(chars, 7, &srcchars);
+        REQUIRE(result == 3);
+        REQUIRE(srcchars == 7);
+    }
+}
+
+TEST_CASE("SUniConvertUTF16to8Len", "[unicode]") {
+    SECTION("fail when no terminator is in range") {
+        uint16_t chars[] = { 0x6C49, 0x5B57, 0x0000 };
+        uint32_t srcchars;
+
+        auto result = SUniConvertUTF16to8Len(chars, 2, &srcchars);
+        REQUIRE(result == -1);
+        REQUIRE(srcchars == 2);
+    }
+
+    SECTION("get length correctly") {
+        uint16_t chars[] = { 0x6C49, 0x5B57, 0x0000 };
+        uint32_t srcchars;
+
+        auto result = SUniConvertUTF16to8Len(chars, 3, &srcchars);
+        REQUIRE(result == 7);
+        REQUIRE(srcchars == 3);
+    }
+}