core: encoding: add utf-8 functions from b_string to b_wchar interface

This commit is contained in:
2025-10-25 00:02:06 +01:00
parent 3e82d12476
commit f6f49faf97
2 changed files with 198 additions and 0 deletions

View File

@@ -1191,3 +1191,191 @@ bool b_wchar_is_punct(b_wchar c)
{
return iswpunct((wchar_t)c);
}
/*
 * Report whether `c` is a Unicode scalar value.
 *
 * A scalar value is any code point in U+0000..U+10FFFF except the
 * surrogate range U+D800..U+DFFF.
 */
bool b_wchar_utf8_is_valid_scalar(b_wchar c)
{
	/* outside the Unicode code space altogether */
	if (c < 0x0000 || c > 0x10FFFF) {
		return false;
	}

	/* inside the code space: everything but the surrogates is a scalar */
	return c < 0xD800 || c > 0xDFFF;
}
/*
 * Classify the first byte of a UTF-8 sequence.
 *
 * Returns the total length in bytes (1-4) of the sequence that `c`
 * introduces, or 0 if `c` cannot start a sequence: a continuation byte
 * (10xxxxxx) or a byte of the form 11111xxx.
 *
 * Only the prefix bit pattern is inspected. Lead bytes that can only
 * begin overlong or out-of-range sequences (0xC0, 0xC1, 0xF5-0xF7) are
 * still reported with the length their prefix implies.
 */
unsigned int b_wchar_utf8_header_decode(char c)
{
	/* compare on the unsigned value so plain-char signedness is irrelevant */
	const unsigned char byte = (unsigned char)c;

	/*
	 * Each test compares the whole prefix, including its terminating
	 * zero bit. Testing "any prefix bit set" instead would let
	 * continuation bytes (0x80-0xBF) pass as lead bytes.
	 */
	if ((byte & 0x80) == 0x00) {
		return 1; /* 0xxxxxxx */
	}

	if ((byte & 0xE0) == 0xC0) {
		return 2; /* 110xxxxx */
	}

	if ((byte & 0xF0) == 0xE0) {
		return 3; /* 1110xxxx */
	}

	if ((byte & 0xF8) == 0xF0) {
		return 4; /* 11110xxx */
	}

	return 0;
}
/*
 * Number of bytes needed to encode code point `c` as UTF-8.
 *
 * Returns 1-4 according to the magnitude of `c`, or 0 when `c` is
 * rejected by b_wchar_utf8_is_valid_scalar().
 */
unsigned int b_wchar_utf8_codepoint_size(b_wchar c)
{
	unsigned int nr_bytes = 0;

	if (!b_wchar_utf8_is_valid_scalar(c)) {
		return 0;
	}

	/* thresholds are the largest values that fit in 7, 11, 16 and 21 payload bits */
	if (c <= 0x7F) {
		nr_bytes = 1;
	} else if (c <= 0x7FF) {
		nr_bytes = 2;
	} else if (c <= 0xFFFF) {
		nr_bytes = 3;
	} else if (c <= 0x10FFFF) {
		nr_bytes = 4;
	}

	return nr_bytes;
}
/*
 * Extract the payload of a UTF-8 continuation byte.
 *
 * Returns the low six bits of `c` when it has the form 10xxxxxx, or -1
 * for any other byte.
 */
static int32_t decode_utf8_trailer_byte(char c)
{
	/* top two bits must be exactly "10" */
	const int is_trailer = ((c & 0xC0) == 0x80);

	return is_trailer ? (c & 0x3F) : -1;
}
/*
 * Decode the UTF-8 sequence starting at `s` into a single code point.
 *
 * The sequence length is taken from the lead byte s[0]. Following bytes
 * are read one at a time and decoding stops at the first byte that is
 * not a continuation byte, so a NUL terminator is never read past.
 *
 * Returns the decoded code point, or B_WCHAR_INVALID when:
 *  - s[0] cannot start a sequence (continuation byte or 11111xxx),
 *  - an expected continuation byte is not of the form 10xxxxxx,
 *  - the sequence is overlong (the value fits in fewer bytes), or
 *  - the value is rejected by b_wchar_utf8_is_valid_scalar().
 */
b_wchar b_wchar_utf8_codepoint_decode(const char *s)
{
	/* compare on the unsigned value so plain-char signedness is irrelevant */
	const unsigned char lead = (unsigned char)s[0];
	b_wchar result = 0;
	b_wchar min_value = 0; /* smallest code point that needs `len` bytes */
	int len = 0;

	/*
	 * Match the full prefix pattern of the lead byte. Testing only
	 * "some prefix bit is set" would accept continuation bytes
	 * (0x80-0xBF) as lead bytes.
	 */
	if ((lead & 0x80) == 0x00) {
		len = 1;
		result = lead;
		min_value = 0x00;
	} else if ((lead & 0xE0) == 0xC0) {
		len = 2;
		result = lead & 0x1F;
		min_value = 0x80;
	} else if ((lead & 0xF0) == 0xE0) {
		len = 3;
		result = lead & 0x0F;
		min_value = 0x800;
	} else if ((lead & 0xF8) == 0xF0) {
		len = 4;
		result = lead & 0x07;
		min_value = 0x10000;
	} else {
		return B_WCHAR_INVALID;
	}

	/* append six payload bits per continuation byte */
	for (int i = 1; i < len; i++) {
		int32_t payload = decode_utf8_trailer_byte(s[i]);
		if (payload == -1) {
			return B_WCHAR_INVALID;
		}

		result = (result << 6) | payload;
	}

	/* reject overlong forms such as C0 80 for U+0000 */
	if (result < min_value) {
		return B_WCHAR_INVALID;
	}

	if (!b_wchar_utf8_is_valid_scalar(result)) {
		return B_WCHAR_INVALID;
	}

	return result;
}
/*
 * Encode code point `c` as UTF-8 into `s`.
 *
 * The byte count is obtained from b_wchar_utf8_codepoint_size(). For a
 * count of 1-4 that many leading bytes of `s` are filled and the count
 * is returned; for any other count nothing is written and 0 is
 * returned. No NUL terminator is written.
 */
unsigned int b_wchar_utf8_codepoint_encode(b_wchar c, char s[4])
{
	const unsigned int nr_bytes = b_wchar_utf8_codepoint_size(c);

	if (nr_bytes == 1) {
		/* 0xxxxxxx */
		s[0] = c & 0x7F;
		return nr_bytes;
	}

	if (nr_bytes == 2) {
		/* 110xxxxx 10xxxxxx */
		s[0] = ((c >> 6) & 0x1F) | 0xC0;
		s[1] = (c & 0x3F) | 0x80;
		return nr_bytes;
	}

	if (nr_bytes == 3) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		s[0] = ((c >> 12) & 0x0F) | 0xE0;
		s[1] = ((c >> 6) & 0x3F) | 0x80;
		s[2] = (c & 0x3F) | 0x80;
		return nr_bytes;
	}

	if (nr_bytes == 4) {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		s[0] = ((c >> 18) & 0x07) | 0xF0;
		s[1] = ((c >> 12) & 0x3F) | 0x80;
		s[2] = ((c >> 6) & 0x3F) | 0x80;
		s[3] = (c & 0x3F) | 0x80;
		return nr_bytes;
	}

	return 0;
}
/*
 * Length in bytes of the UTF-8 sequence whose lead byte is at `s`.
 *
 * Only *s is read. Returns 1-4 according to the lead byte's prefix, or
 * 0 if *s cannot start a sequence: a continuation byte (10xxxxxx) or a
 * byte of the form 11111xxx. A NUL byte is an ordinary one-byte
 * sequence and yields 1.
 */
unsigned int b_wchar_utf8_codepoint_stride(const char *s)
{
	/* compare on the unsigned value so plain-char signedness is irrelevant */
	const unsigned char lead = (unsigned char)*s;

	/*
	 * Each test compares the whole prefix, including its terminating
	 * zero bit. Testing "any prefix bit set" instead would report
	 * continuation bytes (0x80-0xBF) as 2-, 3- or 4-byte lead bytes.
	 */
	if ((lead & 0x80) == 0x00) {
		return 1; /* 0xxxxxxx */
	}

	if ((lead & 0xE0) == 0xC0) {
		return 2; /* 110xxxxx */
	}

	if ((lead & 0xF0) == 0xE0) {
		return 3; /* 1110xxxx */
	}

	if ((lead & 0xF8) == 0xF0) {
		return 4; /* 11110xxx */
	}

	return 0;
}
/*
 * Count the UTF-8 encoded code points in `s`.
 *
 * Scanning stops at the first NUL byte or after `nr_bytes` bytes,
 * whichever comes first.
 *
 * Returns the number of code points seen, or 0 if the input contains
 * an invalid lead byte, a sequence cut short by `nr_bytes`, or a
 * sequence whose trailing bytes are not continuation bytes. An empty
 * input also yields 0.
 */
size_t b_wchar_utf8_codepoint_count(const char *s, size_t nr_bytes)
{
	size_t nr_codepoints = 0;
	size_t pos = 0;

	/* bounds check first, so s[pos] is never read at pos == nr_bytes */
	while (pos < nr_bytes && s[pos] != '\0') {
		size_t stride = b_wchar_utf8_codepoint_stride(&s[pos]);
		if (stride == 0) {
			/* invalid codepoint */
			return 0;
		}

		if (stride > nr_bytes - pos) {
			/* sequence runs past the end of the buffer */
			return 0;
		}

		/*
		 * Check the bytes being stepped over. A NUL is not a
		 * continuation byte, so this also stops a truncated
		 * sequence from carrying the scan past the terminator.
		 */
		for (size_t i = 1; i < stride; i++) {
			if ((s[pos + i] & 0xC0) != 0x80) {
				return 0;
			}
		}

		nr_codepoints++;
		pos += stride;
	}

	return nr_codepoints;
}
/*
 * Total number of bytes needed to encode the first `nr_codepoints`
 * elements of `s` as UTF-8.
 *
 * Returns the sum of b_wchar_utf8_codepoint_size() over those elements,
 * or 0 as soon as any element is reported as having size 0.
 */
size_t b_wchar_utf8_string_encoded_size(const b_wchar *s, size_t nr_codepoints)
{
	size_t total = 0;
	size_t idx = 0;

	while (idx < nr_codepoints) {
		const size_t nr_bytes = b_wchar_utf8_codepoint_size(s[idx]);

		if (nr_bytes == 0) {
			/* invalid codepoint */
			return 0;
		}

		total += nr_bytes;
		idx++;
	}

	return total;
}