From 4bef36fcae1897ac8d0f4826575b8d5ef7e2ce0a Mon Sep 17 00:00:00 2001
From: fallenoak
Date: Sun, 22 Nov 2020 00:12:24 -0600
Subject: [PATCH] feat(unicode): add SUniSGetUTF8

---
 storm/Unicode.cpp | 59 +++++++++++++++++++++++++++++++++++++++++++++++
 storm/Unicode.hpp |  8 +++++++
 test/Unicode.cpp  | 32 +++++++++++++++++++++++++
 3 files changed, 99 insertions(+)
 create mode 100644 storm/Unicode.cpp
 create mode 100644 storm/Unicode.hpp
 create mode 100644 test/Unicode.cpp

diff --git a/storm/Unicode.cpp b/storm/Unicode.cpp
new file mode 100644
index 0000000..f5aa864
--- /dev/null
+++ b/storm/Unicode.cpp
@@ -0,0 +1,59 @@
+#include "storm/Unicode.hpp"
+
+uint32_t SUniSGetUTF8(const uint8_t* strptr, int32_t* chars) {
+    if (chars) {
+        *chars = 0;
+    }
+
+    if (!strptr || !*strptr) {
+        return -1;
+    }
+
+    if (chars) {
+        *chars = *chars + 1;
+    }
+
+    uint32_t value = *strptr;
+    int32_t extra;
+
+    if ((value & 0xFE) == 0xFC) {
+        value &= 0x01;
+        extra = 5;
+    } else if ((value & 0xFC) == 0xF8) {
+        value &= 0x03;
+        extra = 4;
+    } else if ((value & 0xF8) == 0xF0) {
+        value &= 0x07;
+        extra = 3;
+    } else if ((value & 0xF0) == 0xE0) {
+        value &= 0x0F;
+        extra = 2;
+    } else if ((value & 0xE0) == 0xC0) {
+        value &= 0x1F;
+        extra = 1;
+    } else if ((value & 0x80) == 0x00) {
+        return value;
+    } else {
+        return 0x80000000;
+    }
+
+    const uint8_t* next = strptr + 1;
+
+    for (int32_t i = 0; i < extra; i++, next++) {
+        if (!*next) {
+            return -1;
+        }
+
+        if (chars) {
+            *chars = *chars + 1;
+        }
+
+        if ((*next & 0xC0) != 0x80) {
+            return 0x80000000;
+        }
+
+        value = (value << 6) | (*next & 0x3F);
+    }
+
+    return value;
+}
diff --git a/storm/Unicode.hpp b/storm/Unicode.hpp
new file mode 100644
index 0000000..cdccd1d
--- /dev/null
+++ b/storm/Unicode.hpp
@@ -0,0 +1,8 @@
+#ifndef STORM_UNICODE_HPP
+#define STORM_UNICODE_HPP
+
+#include <cstdint>
+
+uint32_t SUniSGetUTF8(const uint8_t* strptr, int32_t* chars);
+
+#endif
diff --git a/test/Unicode.cpp b/test/Unicode.cpp
new file mode 100644
index 0000000..e951c5c
--- /dev/null
+++ b/test/Unicode.cpp
@@ -0,0 +1,32 @@
+#include "storm/Unicode.hpp"
+#include "test/Test.hpp"
+
+TEST_CASE("SUniSGetUTF8", "[unicode]") {
+    SECTION("returns ascii-range utf-8 first character") {
+        auto string = "foobar";
+        int32_t chars = 0;
+        auto code = SUniSGetUTF8(reinterpret_cast<const uint8_t*>(string), &chars);
+
+        REQUIRE(code == 'f');
+        REQUIRE(chars == 1);
+    }
+
+    SECTION("returns non-ascii-range utf-8 first character") {
+        auto string = "\xF0\x9F\x99\x82"
+            "foobar";
+        int32_t chars = 0;
+        auto code = SUniSGetUTF8(reinterpret_cast<const uint8_t*>(string), &chars);
+
+        REQUIRE(code == 0x1F642);
+        REQUIRE(chars == 4);
+    }
+
+    SECTION("returns null first character") {
+        auto string = "";
+        int32_t chars = 0;
+        auto code = SUniSGetUTF8(reinterpret_cast<const uint8_t*>(string), &chars);
+
+        REQUIRE(code == -1u);
+        REQUIRE(chars == 0);
+    }
+}