feat(unicode): add SUniSGetUTF8

This commit is contained in:
fallenoak 2020-11-22 00:12:24 -06:00
parent d6c0d8a739
commit 4bef36fcae
No known key found for this signature in database
GPG key ID: 7628F8E61AEA070D
3 changed files with 99 additions and 0 deletions

59
storm/Unicode.cpp Normal file
View file

@ -0,0 +1,59 @@
#include "storm/Unicode.hpp"
/**
 * Decodes the single character encoded at the start of a NUL-terminated
 * UTF-8 byte string.
 *
 * @param strptr  Bytes to decode; may be null.
 * @param chars   Optional out-parameter. It is reset to 0 on entry and then
 *                incremented once for every non-NUL byte examined, including
 *                a byte that subsequently fails continuation validation.
 *
 * @return The decoded value on success;
 *         0xFFFFFFFF (i.e. -1) when strptr is null, points at a NUL, or a NUL
 *         is reached before the sequence is complete;
 *         0x80000000 when the lead byte is not a valid lead byte or a
 *         trailing byte is not of the form 10xxxxxx.
 *
 * Lead bytes announcing up to five trailing bytes are accepted, and decoded
 * values are not checked for overlong encodings.
 */
uint32_t SUniSGetUTF8(const uint8_t* strptr, int32_t* chars) {
    // Describes one multi-byte lead form: a byte matches when
    // (byte & mask) == tag; `payload` selects the value bits carried by the
    // lead byte and `trailing` is the number of continuation bytes expected.
    struct LeadForm {
        uint8_t mask;
        uint8_t tag;
        uint8_t payload;
        int32_t trailing;
    };

    static constexpr LeadForm kLeadForms[] = {
        { 0xE0, 0xC0, 0x1F, 1 }, // 110xxxxx
        { 0xF0, 0xE0, 0x0F, 2 }, // 1110xxxx
        { 0xF8, 0xF0, 0x07, 3 }, // 11110xxx
        { 0xFC, 0xF8, 0x03, 4 }, // 111110xx
        { 0xFE, 0xFC, 0x01, 5 }, // 1111110x
    };

    constexpr uint32_t kNoCharacter = 0xFFFFFFFFu; // same bit pattern as -1
    constexpr uint32_t kMalformed = 0x80000000u;

    if (chars) {
        *chars = 0;
    }

    if (strptr == nullptr || *strptr == 0) {
        return kNoCharacter;
    }

    if (chars) {
        *chars += 1;
    }

    const uint8_t lead = *strptr;

    // Single-byte (ASCII) characters decode to themselves.
    if (lead < 0x80) {
        return lead;
    }

    // The forms are mutually exclusive, so at most one entry can match.
    const LeadForm* form = nullptr;
    for (const LeadForm& candidate : kLeadForms) {
        if ((lead & candidate.mask) == candidate.tag) {
            form = &candidate;
            break;
        }
    }

    // Stray continuation bytes (10xxxxxx) and 0xFE / 0xFF land here.
    if (form == nullptr) {
        return kMalformed;
    }

    uint32_t decoded = lead & form->payload;
    const uint8_t* cursor = strptr + 1;

    for (int32_t remaining = form->trailing; remaining > 0; --remaining, ++cursor) {
        // The string ended in the middle of the sequence.
        if (*cursor == 0) {
            return kNoCharacter;
        }

        // The byte is counted before it is validated.
        if (chars) {
            *chars += 1;
        }

        if ((*cursor & 0xC0) != 0x80) {
            return kMalformed;
        }

        decoded = (decoded << 6) | (*cursor & 0x3F);
    }

    return decoded;
}

8
storm/Unicode.hpp Normal file
View file

@ -0,0 +1,8 @@
#ifndef STORM_UNICODE_HPP
#define STORM_UNICODE_HPP
#include <cstdint>
/**
 * Decodes the first character of a NUL-terminated UTF-8 byte string.
 *
 * @param strptr  Bytes to decode; may be null.
 * @param chars   Optional out-parameter; when non-null it is reset to 0 and
 *                then receives the number of non-NUL bytes examined
 *                (a trailing byte is counted before it is validated).
 *
 * @return The decoded value; 0xFFFFFFFF (-1) when strptr is null, empty, or
 *         ends before the sequence is complete; 0x80000000 when the lead byte
 *         or a trailing byte is malformed.
 */
uint32_t SUniSGetUTF8(const uint8_t* strptr, int32_t* chars);
#endif

32
test/Unicode.cpp Normal file
View file

@ -0,0 +1,32 @@
#include "storm/Unicode.hpp"
#include "test/Test.hpp"
// Checks SUniSGetUTF8 against a one-byte character, a four-byte character and
// an empty string, verifying both the decoded value and the reported byte count.
TEST_CASE("SUniSGetUTF8", "[unicode]") {
    SECTION("returns ascii-range utf-8 first character") {
        const char* text = "foobar";
        int32_t consumed = 0;

        const uint32_t decoded = SUniSGetUTF8(reinterpret_cast<const uint8_t*>(text), &consumed);

        REQUIRE(decoded == 'f');
        REQUIRE(consumed == 1);
    }

    SECTION("returns non-ascii-range utf-8 first character") {
        // Four-byte sequence for U+1F642 followed by plain ASCII.
        const char* text = "\xF0\x9F\x99\x82"
                           "foobar";
        int32_t consumed = 0;

        const uint32_t decoded = SUniSGetUTF8(reinterpret_cast<const uint8_t*>(text), &consumed);

        REQUIRE(decoded == 0x1F642);
        REQUIRE(consumed == 4);
    }

    SECTION("returns null first character") {
        const char* text = "";
        int32_t consumed = 0;

        const uint32_t decoded = SUniSGetUTF8(reinterpret_cast<const uint8_t*>(text), &consumed);

        REQUIRE(decoded == -1u);
        REQUIRE(consumed == 0);
    }
}