From f6f49faf97a83b3057b302f39e5d7cd27de2d419 Mon Sep 17 00:00:00 2001
From: Max Wash
Date: Sat, 25 Oct 2025 00:02:06 +0100
Subject: [PATCH] core: encoding: add utf-8 functions from b_string to b_wchar
 interface

---
 core/encoding.c                   | 205 ++++++++++++++++++++++++++++++
 core/include/blue/core/encoding.h |  10 +
 2 files changed, 215 insertions(+)

diff --git a/core/encoding.c b/core/encoding.c
index 870784b..7d00ba0 100644
--- a/core/encoding.c
+++ b/core/encoding.c
@@ -1191,3 +1191,208 @@ bool b_wchar_is_punct(b_wchar c)
 {
 	return iswpunct((wchar_t)c);
 }
+
+/* Returns true if c is a Unicode scalar value: a code point in
+ * U+0000..U+10FFFF outside the surrogate range U+D800..U+DFFF. */
+bool b_wchar_utf8_is_valid_scalar(b_wchar c)
+{
+	return (c >= 0x0000 && c <= 0xD7FF) || (c >= 0xE000 && c <= 0x10FFFF);
+}
+
+/* Classifies c as the first byte of a UTF-8 sequence and returns the
+ * length of that sequence in bytes (1-4). Returns 0 if c cannot start a
+ * sequence: continuation bytes (10xxxxxx) and 0xF8-0xFF.
+ *
+ * Each class is matched as (c & mask) == pattern. Testing the marker
+ * bits for non-zero instead would also accept continuation bytes. */
+unsigned int b_wchar_utf8_header_decode(char c)
+{
+	/* plain char may be signed; classify the raw byte value */
+	unsigned char b = (unsigned char)c;
+
+	if ((b & 0x80) == 0x00) {
+		return 1; /* 0xxxxxxx */
+	}
+
+	if ((b & 0xE0) == 0xC0) {
+		return 2; /* 110xxxxx */
+	}
+
+	if ((b & 0xF0) == 0xE0) {
+		return 3; /* 1110xxxx */
+	}
+
+	if ((b & 0xF8) == 0xF0) {
+		return 4; /* 11110xxx */
+	}
+
+	return 0;
+}
+
+/* Returns the number of bytes needed to encode c as UTF-8 (1-4), or 0 if
+ * c is not a valid Unicode scalar value. */
+unsigned int b_wchar_utf8_codepoint_size(b_wchar c)
+{
+	if (!b_wchar_utf8_is_valid_scalar(c)) {
+		return 0;
+	}
+
+	if (c <= 0x7F) {
+		return 1;
+	}
+
+	if (c <= 0x7FF) {
+		return 2;
+	}
+
+	if (c <= 0xFFFF) {
+		return 3;
+	}
+
+	/* a valid scalar value is at most 0x10FFFF */
+	return 4;
+}
+
+/* Returns the 6 payload bits of continuation byte c (10xxxxxx), or -1 if
+ * c is not a continuation byte. */
+static int32_t decode_utf8_trailer_byte(char c)
+{
+	if ((c & 0xC0) != 0x80) {
+		return -1;
+	}
+
+	return c & 0x3F;
+}
+
+/* Decodes the UTF-8 sequence starting at s and returns its code point.
+ * Reading stops at the first byte that is not a valid continuation, so a
+ * NUL-terminated string is never read past its terminator.
+ *
+ * Returns B_WCHAR_INVALID if the first byte cannot start a sequence, a
+ * continuation byte is missing, the sequence is overlong (not the
+ * shortest encoding of its value), or the value is not a Unicode scalar
+ * value. */
+b_wchar b_wchar_utf8_codepoint_decode(const char *s)
+{
+	/* payload bits of the first byte, indexed by sequence length */
+	static const unsigned char header_mask[] = {
+		0x00, 0x7F, 0x1F, 0x0F, 0x07,
+	};
+
+	unsigned int len = b_wchar_utf8_header_decode(s[0]);
+	if (len == 0) {
+		return B_WCHAR_INVALID;
+	}
+
+	b_wchar result = s[0] & header_mask[len];
+
+	for (unsigned int i = 1; i < len; i++) {
+		int32_t c = decode_utf8_trailer_byte(s[i]);
+		if (c == -1) {
+			return B_WCHAR_INVALID;
+		}
+
+		result = (result << 6) | c;
+	}
+
+	/* the size is 0 for surrogates and out-of-range values, and smaller
+	 * than len for overlong encodings */
+	if (b_wchar_utf8_codepoint_size(result) != len) {
+		return B_WCHAR_INVALID;
+	}
+
+	return result;
+}
+
+/* Encodes c as UTF-8 into s and returns the number of bytes written
+ * (1-4). No NUL terminator is written. Returns 0 and leaves s untouched
+ * if c is not a valid Unicode scalar value. */
+unsigned int b_wchar_utf8_codepoint_encode(b_wchar c, char s[4])
+{
+	unsigned int len = b_wchar_utf8_codepoint_size(c);
+
+	switch (len) {
+	case 1:
+		s[0] = c & 0x7F;
+		break;
+	case 2:
+		s[0] = ((c >> 6) & 0x1F) | 0xC0;
+		s[1] = (c & 0x3F) | 0x80;
+		break;
+	case 3:
+		s[0] = ((c >> 12) & 0x0F) | 0xE0;
+		s[1] = ((c >> 6) & 0x3F) | 0x80;
+		s[2] = (c & 0x3F) | 0x80;
+		break;
+	case 4:
+		s[0] = ((c >> 18) & 0x07) | 0xF0;
+		s[1] = ((c >> 12) & 0x3F) | 0x80;
+		s[2] = ((c >> 6) & 0x3F) | 0x80;
+		s[3] = (c & 0x3F) | 0x80;
+		break;
+	default:
+		return 0;
+	}
+
+	return len;
+}
+
+/* Returns the length in bytes of the UTF-8 sequence whose first byte is
+ * *s, or 0 if that byte cannot start a sequence. Only the first byte is
+ * examined. */
+unsigned int b_wchar_utf8_codepoint_stride(const char *s)
+{
+	return b_wchar_utf8_header_decode(*s);
+}
+
+/* Counts the code points in the first nr_bytes bytes of s, stopping early
+ * at a NUL byte. Sequences are checked for structure only: a valid first
+ * byte followed by the right number of continuation bytes.
+ *
+ * Returns 0 if the range is empty or contains a malformed or truncated
+ * sequence. */
+size_t b_wchar_utf8_codepoint_count(const char *s, size_t nr_bytes)
+{
+	size_t nr_codepoints = 0;
+	const char *end = s + nr_bytes;
+
+	/* test the bound first so that s[nr_bytes] is never read */
+	while (s < end && *s) {
+		size_t stride = b_wchar_utf8_codepoint_stride(s);
+		if (stride == 0 || stride > (size_t)(end - s)) {
+			/* invalid or truncated codepoint */
+			return 0;
+		}
+
+		for (size_t i = 1; i < stride; i++) {
+			if (decode_utf8_trailer_byte(s[i]) == -1) {
+				/* invalid codepoint */
+				return 0;
+			}
+		}
+
+		nr_codepoints++;
+		s += stride;
+	}
+
+	return nr_codepoints;
+}
+
+/* Returns the number of bytes needed to encode the nr_codepoints code
+ * points in s as UTF-8, not counting a NUL terminator. Returns 0 if s is
+ * empty or contains a value that is not a valid Unicode scalar value. */
+size_t b_wchar_utf8_string_encoded_size(const b_wchar *s, size_t nr_codepoints)
+{
+	size_t len = 0;
+	for (size_t i = 0; i < nr_codepoints; i++) {
+		size_t l = b_wchar_utf8_codepoint_size(s[i]);
+		if (l == 0) {
+			/* invalid codepoint */
+			return 0;
+		}
+
+		len += l;
+	}
+
+	return len;
+}
diff --git a/core/include/blue/core/encoding.h b/core/include/blue/core/encoding.h
index 97817ae..50c3f90 100644
--- a/core/include/blue/core/encoding.h
+++ b/core/include/blue/core/encoding.h
@@ -28,4 +28,14 @@ static inline bool b_wchar_is_alnum(b_wchar c)
 
 BLUE_API bool b_wchar_is_punct(b_wchar c);
 
+BLUE_API bool b_wchar_utf8_is_valid_scalar(b_wchar c);
+BLUE_API unsigned int b_wchar_utf8_header_decode(char c);
+BLUE_API unsigned int b_wchar_utf8_codepoint_size(b_wchar c);
+BLUE_API b_wchar b_wchar_utf8_codepoint_decode(const char *s);
+BLUE_API unsigned int b_wchar_utf8_codepoint_encode(b_wchar c, char s[4]);
+BLUE_API unsigned int b_wchar_utf8_codepoint_stride(const char *s);
+BLUE_API size_t b_wchar_utf8_codepoint_count(const char *s, size_t nr_bytes);
+BLUE_API size_t b_wchar_utf8_string_encoded_size(
+	const b_wchar *s, size_t nr_codepoints);
+
 #endif