From 0401d901376995c7a209125b8cb12f1ceaa46fb7 Mon Sep 17 00:00:00 2001 From: superp00t Date: Sun, 6 Apr 2025 13:02:42 -0400 Subject: [PATCH] feat(unicode): add UTF-8 conversion routines --- common/Unicode.cpp | 334 +++++++++++++++++++++++++++++++++++++++++++++ common/Unicode.hpp | 14 ++ 2 files changed, 348 insertions(+) create mode 100644 common/Unicode.cpp create mode 100644 common/Unicode.hpp diff --git a/common/Unicode.cpp b/common/Unicode.cpp new file mode 100644 index 0000000..4d691a9 --- /dev/null +++ b/common/Unicode.cpp @@ -0,0 +1,334 @@ +#include "common/Unicode.hpp" +#include + +static const uint32_t offsetsFromUTF8[] = { + 0x0, + 0x3080, + 0xE2080, + 0x3C82080, + 0xFA082080, + 0x82082080 +}; + +static const uint32_t firstByteMark[] = { + 0x0, + 0x0, + 0xC0, + 0xE0, + 0xF0, + 0xF8, + 0xFC +}; + +static const uint8_t bytesFromUTF8[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, + 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x05 +}; + +uint32_t sgetu8(const uint8_t* strptr, int32_t* chars) { + if (chars) { + *chars = 0; + } + + if (!strptr || !*strptr) { + return -1; + } + + if (chars) { + *chars = *chars + 1; + } + + uint32_t value = *strptr; + int32_t extra; + + if ((value & 0xFE) == 0xFC) { + value &= 0x01; + extra = 5; + } else if ((value & 0xFC) == 0xF8) { + value &= 0x03; + extra = 4; + } else if ((value & 0xF8) == 0xF0) { + value &= 0x07; + extra = 3; + } else if ((value & 0xF0) == 0xE0) { + value &= 0x0F; + extra = 2; + } else if ((value & 0xE0) == 0xC0) { + value &= 0x1F; + extra = 1; + } else if ((value & 0x80) == 0x00) { + return value; + } else { + return 0x80000000; + } + + const uint8_t* next = strptr + 1; + + for (int32_t i = 0; i < extra; i++, next++) { + if (!*next) { + return -1; + } + + if (chars) { + *chars = *chars + 1; + } + + if ((*next & 0xC0) != 0x80) { + return 0x80000000; + } + + value = (value << 6) | (*next & 0x3F); + } + + return value; +} + +void sputu8(uint32_t c, char* strptr) { + if (!strptr) { + return; + } + + auto curstr = strptr; + auto v3 = c; + char v4, v5, v6, v7; + + if (c >= 0x80) { + if (c >= 0x800) { + if (c >= 0x10000) { + if (c >= 0x200000) { + if (c >= 0x400000) { + if (c >= 0x80000000) { + *curstr = 0; + return; + } + + *strptr = (c >> 30) | 0xFC; + curstr = strptr + 1; + // TODO this seems likely to be bitwise right shift 24, not 8 + v7 = ((c >> 8) & 0x3F) | 0x80; + } else { + // TODO this seems likely to be bitwise right shift 24, not 8 + v7 = (c >> 8) | 0xF8; + } + + *curstr++ = v7; + v6 = ((c >> 18) & 0x3F) | 0x80; + } else { + v6 = (c >> 18) | 0xF0; + } + + *curstr++ = v6; + v5 = ((c >> 12) & 0x3F) | 0x80; + } else { + v5 = (c >> 12) | 0xE0; + } + + *curstr++ = v5; + v4 = ((c >> 6) & 0x3F) | 0x80; + } else { + v4 = (c >> 6) | 0xC0; + } + + *curstr++ = v4; + v3 = (c & 0x3F) | 0x80; + } + + *curstr++ = v3; + *curstr = '\0'; +} + +int32_t ConvertUTF16to8(uint8_t* dst, uint32_t dstmaxchars, const uint16_t* src, uint32_t srcmaxchars, uint32_t* dstchars, uint32_t* srcchars) { + auto srcend = srcmaxchars == STORM_MAX_STR ? reinterpret_cast(UINTPTR_MAX) : &src[srcmaxchars]; + auto dstend = &dst[dstmaxchars]; + auto dststart = dst; + auto srcstart = src; + + int32_t result; + + while (src < srcend) { + uint32_t widechars = 1; + auto grapheme = static_cast(src[0]); + if (0xD7FF < grapheme && grapheme < 0xDC00) { + if (src + 1 >= srcend) { + goto fail; + } + auto char2 = static_cast(src[1]); + if (0xDBFF < char2 && char2 < 0xE000) { + grapheme = ((grapheme - 0xD7F7) * 1024) + char2; + widechars = 2; + } + } + + uint32_t chars; + + if (grapheme < 0x80) { + chars = 1; + if (grapheme == 0) { + if (dst < dstend) { + *dst++ = '\0'; + result = 0; + } + goto done; + } + } else if (grapheme < 0x800) { + chars = 2; + } else if (grapheme < 0x10000) { + chars = 3; + } else if (grapheme < 0x200000) { + chars = 4; + } else if (grapheme < 0x4000000) { + chars = 5; + } else if (grapheme > 0x7FFFFFFF) { + grapheme = 0xFFFD; + chars = 2; + } else { + chars = 6; + } + + result = chars; + + dst += chars; + + if (dst > dstend) { + goto done; + } + + switch (chars) { + case 6: + *--dst = (grapheme & 0x3F) | 0x80; + grapheme >>= 6; + case 5: + *--dst = (grapheme & 0x3F) | 0x80; + grapheme >>= 6; + case 4: + *--dst = (grapheme & 0x3F) | 0x80; + grapheme >>= 6; + case 3: + *--dst = (grapheme & 0x3F) | 0x80; + grapheme >>= 6; + case 2: + *--dst = (grapheme & 0x3F) | 0x80; + grapheme >>= 6; + case 1: + *--dst = grapheme | firstByteMark[chars]; + } + + src += widechars; + dst += chars; + } + +fail: + result = -1; + +done: + if (srcchars) { + *srcchars = src - srcstart; + } + + if (dstchars) { + *dstchars = dst - dststart; + } + return result; +} + +int32_t ConvertUTF8to16(uint16_t* dst, uint32_t dstmaxchars, const uint8_t* src, uint32_t srcmaxchars, uint32_t* dstchars, uint32_t* srcchars) { + auto srcend = srcmaxchars == STORM_MAX_STR ? reinterpret_cast(UINTPTR_MAX) : src + srcmaxchars; + auto dstend = dst + dstmaxchars; + auto dststart = dst; + auto srcstart = src; + + int32_t result = 0; + + while (src < srcend) { + auto bytes = bytesFromUTF8[*src]; + if ((src + bytes) >= srcend) { + result = -1 - bytes; + goto done; + } + + uint32_t grapheme = 0; + + int32_t width = 0; + + switch (bytes) { + case 5: + grapheme = (grapheme + src[width++]) << 6; + case 4: + grapheme = (grapheme + src[width++]) << 6; + case 3: + grapheme = (grapheme + src[width++]) << 6; + case 2: + grapheme = (grapheme + src[width++]) << 6; + case 1: + grapheme = (grapheme + src[width++]) << 6; + case 0: + grapheme = (grapheme + src[width++]) - offsetsFromUTF8[bytes]; + } + + if (dst >= dstend) { + result = 1; + goto done; + } + + if (grapheme < 0x10000) { + *dst++ = static_cast(grapheme); + if (grapheme == 0x0) { + goto done; + } + } else if (grapheme < 0x110000) { + if (dst >= dstend) { + result = 0; + goto done; + } + auto v16 = grapheme - 0x10000; + *dst++ = static_cast(v16 >> 10) - 0x2800; + *dst++ = static_cast(v16 & 0x3FF) - 0x2400; + } else { + *dst++ = 0xFFFD; + } + + src += width; + } + + result = -1; + +done: + if (srcchars) { + *srcchars = src - srcstart; + } + + if (dstchars) { + *dstchars = dst - dststart; + } + return result; +} diff --git a/common/Unicode.hpp b/common/Unicode.hpp new file mode 100644 index 0000000..802f1cb --- /dev/null +++ b/common/Unicode.hpp @@ -0,0 +1,14 @@ +#ifndef COMMON_UNICODE_HPP +#define COMMON_UNICODE_HPP + +#include + +uint32_t sgetu8(const uint8_t* strptr, int32_t* chars); + +void sputu8(uint32_t c, char* strptr); + +int32_t ConvertUTF16to8(uint8_t* dst, uint32_t dstmaxchars, const uint16_t* src, uint32_t srcmaxchars, uint32_t* dstchars, uint32_t* srcchars); + +int32_t ConvertUTF8to16(uint16_t* dst, uint32_t dstmaxchars, const uint8_t* src, uint32_t srcmaxchars, uint32_t* dstchars, uint32_t* srcchars); + +#endif