From 2fcadf7f398a529781da9ef3c10055852adcf763 Mon Sep 17 00:00:00 2001 From: Max Wash Date: Mon, 22 Sep 2025 10:36:26 +0100 Subject: [PATCH] core: string: add UTF-8 and null-char support; and some new string functions b_string now uses UTF-8 internally, and can correctly manipulate strings that contain non-ASCII and multi-byte codepoints. b_string now tracks the length of a string in both bytes and unicode codepoints. string insertion functions have been updated to correctly handle strings with multi-byte codepoints, so the index parameter of each function now refers to codepoints rather than bytes. inserting single-byte chars into a string with no multi-byte codepoints is still optimised to use array indexing and memmove. a b_string_iterator has been added to simplify iterating through a UTF-8 string, without having to use a charAt()-style interface that would incur performance penalties. strings can now also contain null bytes. new functions include: - b_string_tokenise: a b_iterator interface for iterating through tokens in a string. similar to strtok except that: * it is re-entrant, and uses no global state. * it supports delimiters that are longer than one character and/or contain multi-byte UTF-8 codepoints. * it doesn't modify the string that is being iterated over. * it correctly handles strings with multi-byte UTF-8 codepoints and null chars. - b_string_compare: for comparing strings. necessary to use this rather than strcmp as b_strings can now contain null chars.
--- object/include/blue/object/string.h | 77 +- object/string.c | 1364 ++++++++++++++++++++++++--- object/string.h | 9 +- 3 files changed, 1288 insertions(+), 162 deletions(-) diff --git a/object/include/blue/object/string.h b/object/include/blue/object/string.h index c8853e2..bc9550f 100644 --- a/object/include/blue/object/string.h +++ b/object/include/blue/object/string.h @@ -1,6 +1,7 @@ #ifndef BLUELIB_STRING_H_ #define BLUELIB_STRING_H_ +#include #include #include #include @@ -13,16 +14,44 @@ struct b_stream; #define B_CSTR(s) (b_string_create_from_cstr(s)) #define B_RV_CSTR(s) (B_RV(b_string_create_from_cstr(s))) +#define b_string_foreach(it, str) \ + for (int z__b_unique_name() = b_string_iterator_begin(str, it); \ + b_string_iterator_is_valid(it); b_string_iterator_next(it)) + typedef struct b_string b_string; +typedef struct b_string_iterator { + b_iterator _base; + int _m, _f; + b_string *_s, *_tmp; + const char **_d; + size_t _nd, _ds; + + b_status status; + size_t iteration_index; + size_t byte_index; + size_t codepoint_index; + b_wchar char_value; + const char *string_value; + size_t string_length; + size_t string_codepoints; +} b_string_iterator; + typedef enum b_strlen_flags { B_STRLEN_NORMAL = 0, B_STRLEN_IGNORE_ESC = 0x01u, B_STRLEN_IGNORE_MOD = 0x02u, + B_STRLEN_CODEPOINTS = 0x04u, } b_strlen_flags; +typedef enum b_string_tokenise_flags { + B_STRING_TOK_F_NORMAL = 0x00u, + B_STRING_TOK_F_INCLUDE_EMPTY_TOKENS = 0x01u, +} b_string_tokenise_flags; + BLUE_API b_string *b_string_create(void); BLUE_API b_string *b_string_create_from_cstr(const char *s); +BLUE_API b_string *b_string_create_from_wstr(const b_wchar *s); BLUE_API b_string *b_string_create_from_c(char c, size_t count); BLUE_API b_string *b_string_duplicate(const b_string *str); @@ -41,6 +70,7 @@ BLUE_API b_status b_string_replace( BLUE_API b_status b_string_replace_all(b_string *str, const char *new_data); BLUE_API b_status b_string_remove(b_string *str, size_t start, size_t length); BLUE_API 
b_status b_string_transform(b_string *str, int (*transformer)(int)); +BLUE_API b_status b_string_trim(b_string *str); static inline b_status b_string_toupper(b_string *str) { return b_string_transform(str, toupper); @@ -51,22 +81,42 @@ static inline b_status b_string_tolower(b_string *str) } BLUE_API b_status b_string_open_stream(b_string *str, struct b_stream **out); -BLUE_API void b_string_append_s(b_string *dest, const b_string *src); -BLUE_API void b_string_append_cstr(b_string *dest, const char *src); -BLUE_API void b_string_append_cstrf(b_string *dest, const char *format, ...); -BLUE_API void b_string_prepend_cstr(b_string *dest, const char *src); -BLUE_API void b_string_prepend_cstrf(b_string *dest, const char *format, ...); -BLUE_API void b_string_insert_s(b_string *dest, const b_string *src, size_t at); -BLUE_API void b_string_insert_cstr(b_string *dest, const char *src, size_t at); -BLUE_API void b_string_insert_cstrn( +BLUE_API b_status b_string_append_c(b_string *dest, char c); +BLUE_API b_status b_string_append_wc(b_string *dest, b_wchar c); +BLUE_API b_status b_string_append_s(b_string *dest, const b_string *src); +BLUE_API b_status b_string_append_cstr(b_string *dest, const char *src); +BLUE_API b_status b_string_append_wstr(b_string *dest, const b_wchar *src); +BLUE_API b_status b_string_append_cstrf(b_string *dest, const char *format, ...); + +BLUE_API b_status b_string_prepend_c(b_string *dest, char c); +BLUE_API b_status b_string_prepend_wc(b_string *dest, b_wchar c); +BLUE_API b_status b_string_prepend_cstr(b_string *dest, const char *src); +BLUE_API b_status b_string_prepend_wstr(b_string *dest, const b_wchar *src); +BLUE_API b_status b_string_prepend_cstrf(b_string *dest, const char *format, ...); + +BLUE_API b_status b_string_insert_c(b_string *dest, char c, size_t at); +BLUE_API b_status b_string_insert_wc(b_string *dest, b_wchar c, size_t at); +BLUE_API b_status b_string_insert_s(b_string *dest, const b_string *src, size_t at); +BLUE_API 
b_status b_string_insert_cstr(b_string *dest, const char *src, size_t at); +BLUE_API b_status b_string_insert_wstr( + b_string *dest, const b_wchar *src, size_t at); +BLUE_API b_status b_string_insert_cstrn( b_string *dest, const char *src, size_t len, size_t at); -BLUE_API void b_string_insert_cstrf( +BLUE_API b_status b_string_insert_wstrn( + b_string *dest, const char *src, size_t len, size_t at); +BLUE_API b_status b_string_insert_cstrf( b_string *dest, size_t at, const char *format, ...); BLUE_API void b_string_clear(b_string *str); +BLUE_API b_status b_string_tokenise( + b_string *str, const char *delims[], size_t nr_delims, + b_string_tokenise_flags flags, b_string_iterator *it); + BLUE_API size_t b_string_get_size(const b_string *str, b_strlen_flags flags); BLUE_API size_t b_string_get_capacity(const b_string *str); +BLUE_API bool b_string_compare(const b_string *a, const b_string *b); + BLUE_API char b_string_front(const b_string *str); BLUE_API char b_string_back(const b_string *str); @@ -75,9 +125,16 @@ BLUE_API void b_string_pop_back(b_string *str); BLUE_API const char *b_string_ptr(const b_string *str); BLUE_API b_string *b_string_substr(const b_string *str, size_t start, size_t len); +BLUE_API int b_string_iterator_begin(const b_string *string, b_string_iterator *it); +BLUE_API bool b_string_iterator_next(b_string_iterator *it); +// BLUE_API b_status b_string_iterator_erase(b_string_iterator *it); +BLUE_API bool b_string_iterator_is_valid(const b_string_iterator *it); + BLUE_API char *b_strdup(const char *s); BLUE_API size_t b_strlen(const char *s, b_strlen_flags flags); +BLUE_API b_wchar *b_wstrdup(const b_wchar *s); +BLUE_API size_t b_wstrlen(const b_wchar *s); -BLUE_API uint64_t b_cstr_hash(const char *s); +BLUE_API uint64_t b_string_hash(const b_string *s); #endif diff --git a/object/string.c b/object/string.c index d948a29..d2d3077 100644 --- a/object/string.c +++ b/object/string.c @@ -11,6 +11,17 @@ #include #include +#define 
IS_VALID_UTF8_SCALAR(x) \ + (((x) >= 0x0000 && (x) <= 0xD7FF) || ((x) >= 0xE000 && (x) <= 0x10FFFF)) + +#define STRING_TOK_F_FOUND_DELIM 0x80 + +enum iterator_mode { + ITERATOR_MODE_NONE = 0, + ITERATOR_MODE_CHARS, + ITERATOR_MODE_TOKENS, +}; + static void string_release(struct b_object *obj); static void string_to_string(struct b_object *obj, struct b_stream *out); @@ -23,6 +34,225 @@ static struct b_object_type string_type = { .t_to_string = string_to_string, }; +static size_t utf8_codepoint_size(b_wchar c) +{ + if (!IS_VALID_UTF8_SCALAR(c)) { + return 0; + } + + if (c <= 0x7F) { + return 1; + } + + if (c <= 0x7FF) { + return 2; + } + + if (c <= 0xFFFF) { + return 3; + } + + if (c <= 0x10FFFF) { + return 4; + } + + return 0; +} + +int32_t decode_utf8_trailer_byte(char c) +{ + if (!(c & 0x80) || (c & 0x40)) { + return -1; + } + + return c & 0x3F; +} + +/* decode the UTF-8 sequence that starts at s. returns B_WCHAR_INVALID if the + * lead byte or a trailer byte is malformed, or if the decoded value is not a + * valid scalar. */ +static b_wchar utf8_codepoint_decode(const char *s) +{ + b_wchar result = 0; + int len = 0; + + /* compare the whole lead-byte prefix (0xxxxxxx, 110xxxxx, 1110xxxx, + * 11110xxx) so that a stray trailer byte (10xxxxxx) is rejected + * instead of being accepted as a lead byte. */ + if (!(s[0] & 0x80)) { + len = 1; + result = s[0] & 0x7F; + } else if ((s[0] & 0xE0) == 0xC0) { + len = 2; + result = s[0] & 0x1F; + result <<= 6; + } else if ((s[0] & 0xF0) == 0xE0) { + len = 3; + result = s[0] & 0x0F; + result <<= 12; + } else if ((s[0] & 0xF8) == 0xF0) { + len = 4; + result = s[0] & 0x07; + result <<= 18; + } else { + return B_WCHAR_INVALID; + } + + for (int i = 1; i < len; i++) { + int32_t c = decode_utf8_trailer_byte(s[i]); + if (c == -1) { + return B_WCHAR_INVALID; + } + + c <<= 6 * (len - i - 1); + result |= c; + } + + if (!IS_VALID_UTF8_SCALAR(result)) { + return B_WCHAR_INVALID; + } + + return result; +} + +static size_t utf8_codepoint_encode(b_wchar c, char s[4]) +{ + size_t len = utf8_codepoint_size(c); + + switch (len) { + case 1: + s[0] = c & 0x7F; + break; + case 2: + s[0] = ((c >> 6) & 0x1F) | 0xC0; + s[1] = (c & 0x3F) | 0x80; + break; + case 3: + s[0] = ((c >> 12) & 0x0F) | 0xE0; + s[1] = ((c >> 6) & 0x3F) | 0x80; + s[2] = (c & 0x3F) | 0x80; + break; + case 4: + s[0]
= ((c >> 18) & 0x07) | 0xF0; + s[1] = ((c >> 12) & 0x3F) | 0x80; + s[2] = ((c >> 6) & 0x3F) | 0x80; + s[3] = (c & 0x3F) | 0x80; + break; + default: + return 0; + } + + return len; +} + +/* number of bytes in the UTF-8 sequence introduced by the lead byte at s, + * or 0 if that byte is not a valid lead byte. */ +static size_t codepoint_stride(const char *s) +{ + char c = *s; + + if (!(c & 0x80)) { + return 1; + } + + if ((c & 0xE0) == 0xC0) { + return 2; + } + + if ((c & 0xF0) == 0xE0) { + return 3; + } + + if ((c & 0xF8) == 0xF0) { + return 4; + } + + /* trailer byte (10xxxxxx) or 0xF8..0xFF: not a lead byte */ + return 0; +} + +/* count the codepoints in the len bytes starting at s. null bytes count as + * one-byte codepoints, and no byte at or beyond s + len is read. returns 0 + * if an invalid or truncated sequence is found (and when len is 0). */ +static size_t get_number_of_codepoints(const char *s, size_t len) +{ + size_t nr_codepoints = 0; + const char *end = s + len; + + while (s < end) { + size_t stride = codepoint_stride(s); + if (stride == 0 || stride > (size_t)(end - s)) { + /* invalid or truncated codepoint */ + return 0; + } + + nr_codepoints++; + s += stride; + } + + return nr_codepoints; +} + +static size_t get_utf8_encoded_size(const b_wchar *s, size_t nr_codepoints) +{ + size_t len = 0; + for (size_t i = 0; i < nr_codepoints; i++) { + size_t l = utf8_codepoint_size(s[i]); + if (l == 0) { + /* invalid codepoint */ + return 0; + } + + len += l; + } + + return len; +} + +static enum b_status convert_codepoint_range_to_byte_range( + const struct b_string *str, size_t cp_start, size_t cp_length, + size_t *out_byte_start, size_t *out_byte_length) +{ + const char *s = b_string_ptr(str); + size_t byte_offset = 0, byte_length = 0; + + for (size_t i = 0; i < cp_start; i++) { + const char *cp = &s[byte_offset]; + if (!cp || byte_offset >= str->s_len) { + /* out of range */ + return B_ERR_OUT_OF_BOUNDS; + } + + size_t stride = codepoint_stride(cp); + if (!stride) { + /* invalid codepoint */ + return B_ERR_BAD_STATE; + } + + byte_offset += stride; + } + + for (size_t i = 0; i < cp_length; i++) { + size_t cp_offset = byte_offset + byte_length; + const char *cp = &s[cp_offset]; + if (!cp || (cp_offset >= str->s_len)) { + /* out of range */ + return B_ERR_OUT_OF_BOUNDS; + } + + size_t stride = codepoint_stride(cp);
+ if (!stride) { + /* invalid codepoint */ + return B_ERR_BAD_STATE; + } + + byte_length += stride; + } + + if (out_byte_start) { + *out_byte_start = byte_offset; + } + + if (out_byte_length) { + *out_byte_length = byte_length; + } + + return B_SUCCESS; +} + struct b_string *b_string_create(void) { struct b_string *str @@ -32,6 +262,7 @@ struct b_string *b_string_create(void) } str->s_len = 0; + str->s_codepoints = 0; str->s_max = STRING_INLINE_CAPACITY; return str; @@ -52,6 +283,67 @@ static char *string_ptr(struct b_string *str) return str->s_data.d_external; } +/* pointer to the byte that follows the codepoint starting at this_codepoint, + * or NULL if this_codepoint does not point at a valid lead byte. */ +static char *get_next_codepoint(struct b_string *str, char *this_codepoint) +{ + char c = *this_codepoint; + size_t len = 0; + if (!(c & 0x80)) { + len = 1; + } else if ((c & 0xE0) == 0xC0) { + len = 2; + } else if ((c & 0xF0) == 0xE0) { + len = 3; + } else if ((c & 0xF8) == 0xF0) { + len = 4; + } else { + return NULL; + } + + return this_codepoint + len; +} + +/* walk backwards from this_codepoint to the lead byte of the preceding + * codepoint. returns NULL if the start of the string is reached first or + * if a byte that can never be a lead byte is found. */ +static char *get_previous_codepoint(struct b_string *str, char *this_codepoint) +{ + char *start = string_ptr(str); + char *end = this_codepoint - 1; + + while (end >= start) { + char c = *end; + if ((c & 0xC0) == 0x80) { + /* trailer byte: keep walking back towards the lead byte */ + end--; + continue; + } + + if ((c & 0xF8) == 0xF0) { + return end; + } + + if ((c & 0xF0) == 0xE0) { + return end; + } + + if ((c & 0xE0) == 0xC0) { + return end; + } + + if (!(c & 0x80)) { + return end; + } + + /* 0xF8..0xFF can never start a codepoint. bail out rather than + * looping forever on the same byte. */ + return NULL; + } + + return NULL; +} + +static char *get_last_codepoint(struct b_string *str) +{ + if (str->s_len == 0) { + return NULL; + } + + return get_previous_codepoint(str, string_ptr(str) + str->s_len); +} + static int string_make_inline(struct b_string *str) { char *buffer = string_ptr(str); @@ -143,12 +435,16 @@ struct b_string *b_string_create_from_cstr(const char *s) return str; } - str->s_len = strlen(s); - string_change_capacity(str, str->s_len); + size_t s_len = strlen(s); + size_t s_codepoints = get_number_of_codepoints(s, s_len); +
b_string_reserve(str, s_len); char *dest = string_ptr(str); - memcpy(dest, s, str->s_len); - dest[str->s_len] = 0; + memcpy(dest, s, s_len); + dest[s_len] = 0; + + str->s_len = s_len; + str->s_codepoints = s_codepoints; return str; } @@ -167,6 +463,7 @@ struct b_string *b_string_create_from_c(char c, size_t count) } str->s_len = count; + str->s_codepoints = count; return str; } @@ -183,6 +480,7 @@ struct b_string *b_string_duplicate(const struct b_string *str) memcpy(dst, src, str->s_len); new_str->s_len = str->s_len; + new_str->s_codepoints = str->s_codepoints; return new_str; } @@ -193,7 +491,9 @@ char *b_string_steal(struct b_string *str) char *src = string_ptr(str); if (string_is_inline(str)) { - dest = b_strdup(src); + /* copy by length rather than with b_strdup so that embedded null + * bytes survive the copy */ + dest = malloc(str->s_len + 1); + if (!dest) { + /* allocation failed: leave the string untouched */ + return NULL; + } + + memcpy(dest, src, str->s_len); + dest[str->s_len] = 0; src[0] = 0; } else { dest = src; @@ -202,6 +502,7 @@ char *b_string_steal(struct b_string *str) } str->s_len = 0; + str->s_codepoints = 0; return dest; } @@ -216,7 +517,7 @@ b_status b_string_reserve(struct b_string *str, size_t capacity) return err == 0 ?
B_SUCCESS : B_ERR_NO_MEMORY; } -b_status b_string_replace( +static enum b_status replace_ansi( struct b_string *str, size_t start, size_t length, const char *new_data) { b_status status = B_SUCCESS; @@ -255,6 +556,70 @@ b_status b_string_replace( return B_SUCCESS; } +/* replace up to length codepoints, starting at codepoint index start, with + * the null-terminated UTF-8 string new_data. length is clamped to the end of + * the string. returns B_ERR_INVALID_ARGUMENT if start is out of range or + * new_data is not valid UTF-8. */ +static enum b_status replace_utf8( + struct b_string *str, size_t start, size_t length, const char *new_data) +{ + if (start >= str->s_codepoints) { + return B_ERR_INVALID_ARGUMENT; + } + + /* written as a subtraction so that a huge length (e.g. SIZE_MAX) + * cannot wrap around and escape the clamp */ + if (length > str->s_codepoints - start) { + length = str->s_codepoints - start; + } + + size_t new_data_nr_bytes = strlen(new_data); + size_t new_data_nr_codepoints + = get_number_of_codepoints(new_data, new_data_nr_bytes); + if (new_data_nr_codepoints == 0 && new_data_nr_bytes != 0) { + /* new_data is not a valid utf-8 string. an empty new_data also + * yields zero codepoints, but is a legitimate replacement. */ + return B_ERR_INVALID_ARGUMENT; + } + + size_t old_data_offset = 0, old_data_nr_bytes = 0; + size_t old_data_nr_codepoints = length; + enum b_status status = convert_codepoint_range_to_byte_range( + str, start, length, &old_data_offset, &old_data_nr_bytes); + if (!B_OK(status)) { + return status; + } + + size_t new_total_bytes = str->s_len - old_data_nr_bytes + new_data_nr_bytes; + if (new_total_bytes > str->s_max) { + status = b_string_reserve(str, new_total_bytes); + } + + if (!B_OK(status)) { + return status; + } + + char *s = string_ptr(str); + + char *substitution_start = s + old_data_offset; + char *excess_src = s + old_data_offset + old_data_nr_bytes; + size_t excess_length = str->s_len - old_data_offset - old_data_nr_bytes; + char *excess_dest = substitution_start + new_data_nr_bytes; + + memmove(excess_dest, excess_src, excess_length); + memmove(substitution_start, new_data, new_data_nr_bytes); + s[new_total_bytes] = '\0'; + + str->s_len = new_total_bytes; + str->s_codepoints -= old_data_nr_codepoints; + str->s_codepoints += new_data_nr_codepoints; + + return B_SUCCESS; +} + +b_status b_string_replace( + struct b_string *str, size_t start, size_t length, const char *new_data) +{ + if (str->s_len ==
str->s_codepoints) { + return replace_ansi(str, start, length, new_data); + } + + return replace_utf8(str, start, length, new_data); +} + b_status b_string_replace_all(b_string *str, const char *new_data) { size_t new_len = strlen(new_data); @@ -267,7 +632,7 @@ b_status b_string_replace_all(b_string *str, const char *new_data) return B_SUCCESS; } -b_status b_string_remove(b_string *str, size_t start, size_t length) +static enum b_status remove_ansi(struct b_string *str, size_t start, size_t length) { b_status status = B_SUCCESS; @@ -295,7 +660,42 @@ b_status b_string_remove(b_string *str, size_t start, size_t length) return B_SUCCESS; } -b_status b_string_transform(b_string *str, int (*transformer)(int)) +static enum b_status remove_utf8(struct b_string *str, size_t start, size_t length) +{ + size_t remove_offset = 0, remove_nr_bytes = 0; + enum b_status status = convert_codepoint_range_to_byte_range( + str, start, length, &remove_offset, &remove_nr_bytes); + if (!B_OK(status)) { + return status; + } + + size_t new_total_bytes = str->s_len - remove_nr_bytes; + + char *s = string_ptr(str); + + char *removal_start = s + remove_offset; + char *excess_src = s + remove_offset + remove_nr_bytes; + size_t excess_length = str->s_len - remove_offset - remove_nr_bytes; + + memmove(removal_start, excess_src, excess_length); + s[new_total_bytes] = '\0'; + + str->s_len = new_total_bytes; + str->s_codepoints -= length; + + return B_SUCCESS; +} + +enum b_status b_string_remove(struct b_string *str, size_t start, size_t length) +{ + if (str->s_len == str->s_codepoints) { + return remove_ansi(str, start, length); + } + + return remove_utf8(str, start, length); +} + +b_status b_string_transform(struct b_string *str, int (*transformer)(int)) { char *s = string_ptr(str); for (size_t i = 0; i < str->s_len; i++) { @@ -309,220 +709,397 @@ b_status b_string_transform(b_string *str, int (*transformer)(int)) return B_SUCCESS; } -static enum b_status stream_close(struct b_stream *stream) 
+/* strip leading and trailing whitespace from a string in which every + * codepoint is a single byte. */ +static enum b_status trim_ansi(struct b_string *str) { - struct b_string *str = stream->s_ptr; - b_string_release(str); - - return B_SUCCESS; -} - -static enum b_status stream_getc(struct b_stream *stream, int *out) -{ - struct b_string *str = stream->s_ptr; - if (stream->s_cursor >= str->s_len) { - return B_ERR_NO_DATA; + char *s = string_ptr(str); + /* defaults to the whole string so that an all-whitespace string is + * emptied by the leading trim */ + size_t whitespace_end = str->s_len; + for (size_t i = 0; i < str->s_len; i++) { + /* ctype functions require a value representable as unsigned char */ + if (!isspace((unsigned char)s[i])) { + whitespace_end = i; + break; + } } - char *s = string_ptr(str); - *out = s[stream->s_cursor]; - stream->s_cursor++; + memmove(s, s + whitespace_end, str->s_len - whitespace_end); + str->s_len -= whitespace_end; + + for (long i = (long)str->s_len - 1; i >= 0; i--) { + if (isspace((unsigned char)s[i])) { + s[i] = 0; + str->s_len--; + } else { + break; + } + } + + /* the memmove above does not carry the terminator along, and the + * codepoint count has to keep matching the byte length */ + s[str->s_len] = '\0'; + str->s_codepoints = str->s_len; return B_SUCCESS; } -static enum b_status stream_read( - struct b_stream *stream, unsigned char *buf, size_t count, size_t *nr_read) +/* strip leading and trailing whitespace codepoints from a string that + * contains multi-byte codepoints. trimming stops at the first codepoint + * that is not whitespace or cannot be decoded. */ +static enum b_status trim_utf8(struct b_string *str) { - struct b_string *str = stream->s_ptr; - if (stream->s_cursor >= str->s_len) { - *nr_read = 0; + char *s = string_ptr(str); + size_t whitespace_end = 0; + size_t nr_whitespace_codepoints = 0; + while (whitespace_end < str->s_len) { + b_wchar c = utf8_codepoint_decode(&s[whitespace_end]); + size_t c_size = utf8_codepoint_size(c); + + if (c_size == 0 || !b_wchar_is_space(c)) { + break; + } + + /* advance by the size of the codepoint that was just examined */ + whitespace_end += c_size; + nr_whitespace_codepoints++; + } + + memmove(s, s + whitespace_end, str->s_len - whitespace_end); + str->s_len -= whitespace_end; + str->s_codepoints -= nr_whitespace_codepoints; + s[str->s_len] = '\0'; + + /* p becomes NULL once the string is empty or its start is reached, + * which ends the loop without decoding through a null pointer */ + char *p = get_last_codepoint(str); + while (p) { + b_wchar c = utf8_codepoint_decode(p); + size_t c_size = utf8_codepoint_size(c); + + if (c_size == 0 || !b_wchar_is_space(c)) { + break; + } + + memset(p, 0, c_size); + str->s_len -= c_size; + str->s_codepoints--; + + p = get_previous_codepoint(str, p); + } + + return B_SUCCESS; +} + +b_status b_string_trim(struct b_string *str) +{ + if (str->s_len == 0) { return
B_SUCCESS; } - size_t available = str->s_len - stream->s_cursor; - size_t to_read = b_min(size_t, count, available); - - char *s = string_ptr(str) + stream->s_cursor; - - memcpy(buf, s, to_read); - - *nr_read = to_read; - - return B_SUCCESS; -} - -static enum b_status stream_write( - struct b_stream *stream, const unsigned char *buf, size_t count, - size_t *nr_written) -{ - struct b_string *str = stream->s_ptr; - enum b_status status = B_SUCCESS; - - if (stream->s_cursor + count > str->s_max) { - status = b_string_reserve(str, stream->s_cursor + count); + if (str->s_len == str->s_codepoints) { + return trim_ansi(str); } - if (!B_OK(status)) { - return status; - } - - char *s = string_ptr(str) + stream->s_cursor; - memcpy(s, buf, count); - s[str->s_max] = '\0'; - stream->s_cursor += count; - str->s_len = b_max(size_t, str->s_len, stream->s_cursor + count); - - *nr_written = count; - - return B_SUCCESS; + return trim_utf8(str); } -static enum b_status stream_seek( - struct b_stream *stream, long long offset, b_stream_seek_origin origin) -{ - struct b_string *str = stream->s_ptr; - - size_t abs_offset; - switch (origin) { - case B_STREAM_SEEK_START: - abs_offset = offset; - break; - case B_STREAM_SEEK_CURRENT: - abs_offset = stream->s_cursor + offset; - break; - case B_STREAM_SEEK_END: - abs_offset = str->s_len + offset; - break; - default: - return B_ERR_INVALID_ARGUMENT; - } - - stream->s_cursor = abs_offset; - - return B_SUCCESS; -} - -static enum b_status stream_reserve(struct b_stream *stream, size_t len) -{ - struct b_string *str = stream->s_ptr; - - size_t new_capacity = str->s_len + len; - return b_string_reserve(str, new_capacity); -} - -enum b_status b_string_open_stream(struct b_string *str, struct b_stream **out) -{ - struct b_stream *stream = malloc(sizeof *stream); - if (!stream) { - return B_ERR_NO_MEMORY; - } - - memset(stream, 0x0, sizeof *stream); - - stream->s_mode |= B_STREAM_READ | B_STREAM_WRITE; - - stream->s_ptr = b_string_retain(str); - 
stream->s_close = stream_close; - stream->s_getc = stream_getc; - stream->s_read = stream_read; - stream->s_write = stream_write; - stream->s_seek = stream_seek; - stream->s_reserve = stream_reserve; - - *out = stream; - - return B_SUCCESS; -} - -static void string_insert( - struct b_string *dest, const char *src, size_t len, size_t at) +/* insert nr_bytes bytes of src at byte index at into a string in which every + * codepoint is a single byte, so byte and codepoint indices coincide. at is + * clamped to the end of the string. */ +static enum b_status string_insert_cstr_ansi( + struct b_string *dest, const char *src, size_t nr_bytes, size_t at) { if (at >= dest->s_len) { at = dest->s_len; } - size_t new_size = dest->s_len + len; + size_t new_size = dest->s_len + nr_bytes; if (dest->s_max < new_size) { string_change_capacity(dest, new_size); } char *dest_buf = string_ptr(dest); char *from = dest_buf + at; - char *to = dest_buf + at + len; + char *to = dest_buf + at + nr_bytes; memmove(to, from, dest->s_len - at); - memcpy(from, src, len); + memcpy(from, src, nr_bytes); dest_buf[new_size] = '\0'; dest->s_len = new_size; + + /* src may hold multi-byte codepoints even though dest does not. count + * every byte that is not a trailer byte (10xxxxxx), i.e. one per + * codepoint, instead of assuming one codepoint per byte. */ + for (size_t i = 0; i < nr_bytes; i++) { + if (((unsigned char)src[i] & 0xC0) != 0x80) { + dest->s_codepoints++; + } + } + + return B_SUCCESS; } -static void string_insertf( +static enum b_status string_insert_cstr_utf8( + struct b_string *dest, const char *src, size_t nr_bytes, + size_t codepoint_offset) +{ + if (codepoint_offset >= dest->s_codepoints) { + codepoint_offset = dest->s_codepoints; + } + + size_t byte_offset = 0; + enum b_status status = B_SUCCESS; + + if (codepoint_offset == dest->s_codepoints) { + byte_offset = dest->s_len; + } else { + status = convert_codepoint_range_to_byte_range( + dest, 0, codepoint_offset, NULL, &byte_offset); + } + + if (!B_OK(status)) { + return status; + } + + size_t new_total_bytes = dest->s_len + nr_bytes; + if (dest->s_max < new_total_bytes) { + string_change_capacity(dest, new_total_bytes); + } + + char *dest_buf = string_ptr(dest); + char *from = dest_buf + byte_offset; + char *to = dest_buf + byte_offset + nr_bytes; + + memmove(to, from, dest->s_len - byte_offset); + memcpy(from, src, nr_bytes); + dest_buf[new_total_bytes] = '\0'; + + dest->s_len += nr_bytes; + dest->s_codepoints +=
get_number_of_codepoints(src, nr_bytes); + + return B_SUCCESS; +} + +static enum b_status string_insert_wstr_ansi( + struct b_string *dest, const b_wchar *src, size_t nr_codepoints, size_t at) +{ + if (at >= dest->s_len) { + at = dest->s_len; + } + + size_t utf8_encoded_size = get_utf8_encoded_size(src, nr_codepoints); + if (utf8_encoded_size == 0) { + return B_ERR_INVALID_ARGUMENT; + } + + size_t new_total_bytes = dest->s_len + utf8_encoded_size; + if (dest->s_max < new_total_bytes) { + string_change_capacity(dest, new_total_bytes); + } + + char *dest_buf = string_ptr(dest); + char *from = dest_buf + at; + char *to = dest_buf + at + utf8_encoded_size; + memmove(to, from, dest->s_len - at); + + char *ptr = dest_buf + at; + for (size_t i = 0; i < nr_codepoints; i++) { + char c[4]; + size_t c_len = utf8_codepoint_encode(src[i], c); + if (c_len == 0) { + /* the input string was already checked by + * get_utf8_encoded_size, so this should never happen */ + return B_ERR_INVALID_ARGUMENT; + } + + memcpy(ptr, c, c_len); + ptr += c_len; + } + + dest_buf[new_total_bytes] = '\0'; + + dest->s_len += utf8_encoded_size; + dest->s_codepoints += nr_codepoints; + + return B_SUCCESS; +} + +static enum b_status string_insert_wstr_utf8( + struct b_string *dest, const b_wchar *src, size_t nr_codepoints, + size_t codepoint_offset) +{ + if (codepoint_offset >= dest->s_codepoints) { + codepoint_offset = dest->s_codepoints; + } + + size_t utf8_encoded_size = get_utf8_encoded_size(src, nr_codepoints); + if (utf8_encoded_size == 0) { + return B_ERR_INVALID_ARGUMENT; + } + + size_t new_total_bytes = dest->s_len + utf8_encoded_size; + if (dest->s_max < new_total_bytes) { + string_change_capacity(dest, new_total_bytes); + } + + size_t move_offset = 0; + enum b_status status = B_SUCCESS; + + if (codepoint_offset == dest->s_codepoints) { + move_offset = dest->s_len; + } else { + status = convert_codepoint_range_to_byte_range( + dest, 0, codepoint_offset, NULL, &move_offset); + } + + if 
(!B_OK(status)) { + return status; + } + + char *dest_buf = string_ptr(dest); + char *from = dest_buf + move_offset; + char *to = dest_buf + move_offset + utf8_encoded_size; + memmove(to, from, dest->s_len - move_offset); + + char *ptr = dest_buf + move_offset; + for (size_t i = 0; i < nr_codepoints; i++) { + char c[4]; + size_t c_len = utf8_codepoint_encode(src[i], c); + if (c_len == 0) { + /* the input string was already checked by + * get_utf8_encoded_size, so this should never happen */ + return B_ERR_INVALID_ARGUMENT; + } + + memcpy(ptr, c, c_len); + ptr += c_len; + } + + dest_buf[new_total_bytes] = '\0'; + + dest->s_len += utf8_encoded_size; + dest->s_codepoints += nr_codepoints; + + return B_SUCCESS; +} + +static enum b_status string_insert_cstr( + struct b_string *dest, const char *src, size_t nr_bytes, size_t at) +{ + if (dest->s_len == dest->s_codepoints) { + return string_insert_cstr_ansi(dest, src, nr_bytes, at); + } + + return string_insert_cstr_utf8(dest, src, nr_bytes, at); +} + +static enum b_status string_insert_wstr( + struct b_string *dest, const b_wchar *src, size_t nr_codepoints, size_t at) +{ + if (dest->s_len == dest->s_codepoints) { + return string_insert_wstr_ansi(dest, src, nr_codepoints, at); + } + + return string_insert_wstr_utf8(dest, src, nr_codepoints, at); +} + +/* format into a fixed 1024-byte buffer and insert the result at index at. + * output that does not fit in the buffer is truncated. */ +static enum b_status string_insertf( struct b_string *dest, size_t at, const char *format, va_list arg) { char buf[1024]; - size_t len = vsnprintf(buf, sizeof buf, format, arg); - string_insert(dest, buf, len, at); + int written = vsnprintf(buf, sizeof buf, format, arg); + if (written < 0) { + /* vsnprintf reported an error */ + return B_ERR_INVALID_ARGUMENT; + } + + /* vsnprintf returns the length the full output would have had. only + * sizeof buf - 1 bytes are actually stored, so never read past that. */ + size_t len = (size_t)written; + if (len >= sizeof buf) { + len = sizeof buf - 1; + } + + return string_insert_cstr(dest, buf, len, at); } -void b_string_insert_s(struct b_string *dest, const struct b_string *src, size_t at) +enum b_status b_string_insert_c(struct b_string *dest, char c, size_t at) { - string_insert(dest, b_string_ptr(src), src->s_len, at); + return string_insert_cstr(dest, &c, 1, at); } -void b_string_insert_cstr(struct b_string *dest, const char *src, size_t at) +enum b_status b_string_insert_wc(struct b_string *dest, b_wchar c,
size_t at) { - string_insert(dest, src, strlen(src), at); + return string_insert_wstr(dest, &c, 1, at); } -void b_string_insert_cstrf(struct b_string *dest, size_t at, const char *format, ...) +enum b_status b_string_insert_s( + struct b_string *dest, const struct b_string *src, size_t at) +{ + return string_insert_cstr(dest, b_string_ptr(src), src->s_len, at); +} + +enum b_status b_string_insert_cstr(struct b_string *dest, const char *src, size_t at) +{ + return string_insert_cstr(dest, src, strlen(src), at); +} + +enum b_status b_string_insert_wstr( + struct b_string *dest, const b_wchar *src, size_t at) +{ + return string_insert_wstr(dest, src, b_wstrlen(src), at); +} + +enum b_status b_string_insert_cstrf( + struct b_string *dest, size_t at, const char *format, ...) { va_list arg; va_start(arg, format); - string_insertf(dest, at, format, arg); + enum b_status status = string_insertf(dest, at, format, arg); va_end(arg); + + return status; } -void b_string_insert_cstrn(b_string *dest, const char *src, size_t len, size_t at) +enum b_status b_string_insert_cstrn( + b_string *dest, const char *src, size_t len, size_t at) { - string_insert(dest, src, len, at); + return string_insert_cstr(dest, src, len, at); } -void b_string_append_s(struct b_string *dest, const struct b_string *src) +enum b_status b_string_append_c(struct b_string *dest, char c) { - b_string_insert_s(dest, src, SIZE_MAX); + return b_string_insert_c(dest, c, SIZE_MAX); } -void b_string_append_cstr(struct b_string *dest, const char *src) +enum b_status b_string_append_wc(struct b_string *dest, b_wchar c) { - b_string_insert_cstr(dest, src, SIZE_MAX); + return b_string_insert_wc(dest, c, SIZE_MAX); } -void b_string_append_cstrf(struct b_string *dest, const char *format, ...) 
+enum b_status b_string_append_s(struct b_string *dest, const struct b_string *src) +{ + return b_string_insert_s(dest, src, SIZE_MAX); +} + +enum b_status b_string_append_cstr(struct b_string *dest, const char *src) +{ + return b_string_insert_cstr(dest, src, SIZE_MAX); +} + +enum b_status b_string_append_wstr(struct b_string *dest, const b_wchar *src) +{ + return b_string_insert_wstr(dest, src, SIZE_MAX); +} + +enum b_status b_string_append_cstrf(struct b_string *dest, const char *format, ...) { va_list arg; va_start(arg, format); - string_insertf(dest, SIZE_MAX, format, arg); + enum b_status status = string_insertf(dest, SIZE_MAX, format, arg); va_end(arg); + + return status; } -void b_string_prepend_s(struct b_string *dest, const struct b_string *src) +enum b_status b_string_prepend_c(struct b_string *dest, char c) { - b_string_insert_s(dest, src, 0); + return b_string_insert_c(dest, c, 0); } -void b_string_prepend_cstr(struct b_string *dest, const char *src) +enum b_status b_string_prepend_wc(struct b_string *dest, b_wchar c) { - b_string_insert_cstr(dest, src, 0); + return b_string_insert_wc(dest, c, 0); } -void b_string_prepend_cstrf(struct b_string *dest, const char *format, ...) +enum b_status b_string_prepend_s(struct b_string *dest, const struct b_string *src) +{ + return b_string_insert_s(dest, src, 0); +} + +enum b_status b_string_prepend_cstr(struct b_string *dest, const char *src) +{ + return b_string_insert_cstr(dest, src, 0); +} + +enum b_status b_string_prepend_wstr(struct b_string *dest, const b_wchar *src) +{ + return b_string_insert_wstr(dest, src, 0); +} + +enum b_status b_string_prepend_cstrf(struct b_string *dest, const char *format, ...) 
{ va_list arg; va_start(arg, format); - string_insertf(dest, 0, format, arg); + enum b_status status = string_insertf(dest, 0, format, arg); va_end(arg); + + return status; } void b_string_clear(struct b_string *str) @@ -534,15 +1111,150 @@ void b_string_clear(struct b_string *str) char *s = string_ptr(str); *s = '\0'; str->s_len = 0; + str->s_codepoints = 0; +} + +static struct b_iterator_ops it_ops; + +/* returns true, and stores the length of prefix in *prefix_len, only if s + * begins with the whole of prefix. an empty prefix never matches, and + * *prefix_len is left untouched when false is returned. */ +static bool has_prefix(const char *s, const char *prefix, size_t *prefix_len) +{ + size_t len = 0; + while (prefix[len] != 0) { + /* this also fails when s ends before prefix does, because the + * terminator of s cannot equal a non-null prefix byte */ + if (s[len] != prefix[len]) { + return false; + } + + len++; + } + + if (len == 0) { + /* a zero-length delimiter would match everywhere without + * consuming any input */ + return false; + } + + *prefix_len = len; + return true; +} + +static bool has_prefixes( + const char *s, const char **prefixes, size_t nr_prefixes, + size_t *selected_prefix_len) +{ + for (size_t i = 0; i < nr_prefixes; i++) { + const char *delim = prefixes[i]; + if (has_prefix(s, delim, selected_prefix_len)) { + return true; + } + } + + return false; +} + +static enum b_status find_next_token(struct b_string_iterator *it) +{ + size_t offset = it->_ds; + size_t prefix_len = 0; + char *start = string_ptr(it->_s); + bool found_delim_last_time = (it->_f & STRING_TOK_F_FOUND_DELIM) != 0; + bool found_delim = false; + bool include_empty = (it->_f & B_STRING_TOK_F_INCLUDE_EMPTY_TOKENS); + bool found_null = false; + b_string_clear(it->_tmp); + + while (1) { + char *s = start + offset; + if (*s == 0) { + it->_f &= ~STRING_TOK_F_FOUND_DELIM; + break; + } + + found_delim = has_prefixes(s, it->_d, it->_nd, &prefix_len); + if (found_delim) { + if (it->_tmp->s_len == 0 && !include_empty) { + /* this token is empty, skip it */ + offset += prefix_len; + /* the delimiter has been consumed. forget its length so + * that it is not added to the offset a second time if + * the token ends at the end of the string. */ + prefix_len = 0; + found_delim = false; + continue; + } + + it->_f |= STRING_TOK_F_FOUND_DELIM; + break; + } + + b_wchar c = utf8_codepoint_decode(s); + if (c == B_WCHAR_INVALID) { + return B_ERR_BAD_STATE; + } + + b_string_append_wc(it->_tmp, c); + offset += utf8_codepoint_size(c); + + if (offset > it->_s->s_len) { + break; + }
+ } + + bool end = !found_delim && it->_tmp->s_len == 0; + + if (include_empty && found_delim_last_time) { + end = false; + } + + if (end) { + it->string_value = NULL; + it->string_length = 0; + it->string_codepoints = 0; + return B_ERR_NO_DATA; + } + + it->_ds = offset + prefix_len; + it->string_value = b_string_ptr(it->_tmp); + it->string_length = it->_tmp->s_len; + it->string_codepoints = it->_tmp->s_codepoints; + return B_SUCCESS; +} + +enum b_status b_string_tokenise( + struct b_string *str, const char *delims[], size_t nr_delims, + b_string_tokenise_flags flags, struct b_string_iterator *it) +{ + memset(it, 0x0, sizeof *it); + + if (!nr_delims) { + return B_ERR_INVALID_ARGUMENT; + } + + struct b_string *tmp = b_string_create(); + if (!tmp) { + return B_ERR_NO_MEMORY; + } + + it->_base.it_ops = &it_ops; + it->_m = ITERATOR_MODE_TOKENS; + it->_d = delims; + it->_nd = nr_delims; + it->_s = str; + it->_f = flags; + it->_tmp = tmp; + + enum b_status status = find_next_token(it); + if (!B_OK(status)) { + b_string_release(tmp); + it->_tmp = NULL; + } + + return status; } size_t b_string_get_size(const struct b_string *str, b_strlen_flags flags) { - if (flags != B_STRLEN_NORMAL) { + switch (flags) { + case B_STRLEN_NORMAL: + return str->s_len; + case B_STRLEN_CODEPOINTS: + return str->s_codepoints; + default: return b_strlen(b_string_ptr(str), flags); } - - return str->s_len; } size_t b_string_get_capacity(const struct b_string *str) @@ -550,6 +1262,28 @@ size_t b_string_get_capacity(const struct b_string *str) return str->s_max; } +bool b_string_compare(const struct b_string *a, const struct b_string *b) +{ + if (a->s_len != b->s_len) { + return false; + } + + if (a == b) { + return true; + } + + const char *ap = b_string_ptr(a); + const char *bp = b_string_ptr(b); + + for (size_t i = 0; i < a->s_len; i++) { + if (ap[i] != bp[i]) { + return false; + } + } + + return true; +} + char b_string_front(const struct b_string *str) { if (str->s_len == 0) { @@ -613,6 
+1347,344 @@ struct b_string *b_string_substr(const struct b_string *str, size_t start, size_
 	return newstr;
 }
 
+/* stream close callback: drop the reference taken by b_string_open_stream. */
+static enum b_status stream_close(struct b_stream *stream)
+{
+	struct b_string *str = stream->s_ptr;
+	b_string_release(str);
+
+	return B_SUCCESS;
+}
+
+/* stream getc callback: read the byte at the cursor into `out` and advance the
+ * cursor by one. returns B_ERR_NO_DATA once the cursor reaches the end. */
+static enum b_status stream_getc(struct b_stream *stream, int *out)
+{
+	struct b_string *str = stream->s_ptr;
+	if (stream->s_cursor >= str->s_len) {
+		return B_ERR_NO_DATA;
+	}
+
+	char *s = string_ptr(str);
+	*out = s[stream->s_cursor];
+	stream->s_cursor++;
+
+	return B_SUCCESS;
+}
+
+/* stream read callback: copy up to `count` bytes starting at the cursor into
+ * `buf`, store the number of bytes copied in `nr_read`, and advance the cursor
+ * by that amount. reading at or past the end yields zero bytes and
+ * B_SUCCESS. */
+static enum b_status stream_read(
+	struct b_stream *stream, unsigned char *buf, size_t count, size_t *nr_read)
+{
+	struct b_string *str = stream->s_ptr;
+	if (stream->s_cursor >= str->s_len) {
+		*nr_read = 0;
+		return B_SUCCESS;
+	}
+
+	size_t available = str->s_len - stream->s_cursor;
+	size_t to_read = b_min(size_t, count, available);
+
+	char *s = string_ptr(str) + stream->s_cursor;
+
+	memcpy(buf, s, to_read);
+
+	/* consume what was read, as stream_getc and stream_write do; otherwise
+	 * every read would return the same bytes again */
+	stream->s_cursor += to_read;
+	*nr_read = to_read;
+
+	return B_SUCCESS;
+}
+
+/* stream write callback: make sure the string can hold `count` bytes at the
+ * cursor, insert `buf` there, and advance the cursor by `count`.
+ *
+ * NOTE(review): s_cursor is used as a byte offset elsewhere in this file;
+ * confirm string_insert_cstr expects a byte index rather than a codepoint
+ * index, and whether its result should be checked. */
+static enum b_status stream_write(
+	struct b_stream *stream, const unsigned char *buf, size_t count,
+	size_t *nr_written)
+{
+	struct b_string *str = stream->s_ptr;
+	enum b_status status = B_SUCCESS;
+
+	if (stream->s_cursor + count > str->s_max) {
+		status = b_string_reserve(str, stream->s_cursor + count);
+	}
+
+	if (!B_OK(status)) {
+		return status;
+	}
+
+	string_insert_cstr(str, (const char *)buf, count, stream->s_cursor);
+	stream->s_cursor += count;
+
+	*nr_written = count;
+
+	return B_SUCCESS;
+}
+
+/* stream seek callback: move the cursor to `offset` bytes from the start of
+ * the string, the current cursor, or the end of the string, depending on
+ * `origin`. returns B_ERR_INVALID_ARGUMENT for an unknown origin, or if the
+ * target would lie before the start of the string. */
+static enum b_status stream_seek(
+	struct b_stream *stream, long long offset, b_stream_seek_origin origin)
+{
+	struct b_string *str = stream->s_ptr;
+
+	long long base;
+	switch (origin) {
+	case B_STREAM_SEEK_START:
+		base = 0;
+		break;
+	case B_STREAM_SEEK_CURRENT:
+		base = (long long)stream->s_cursor;
+		break;
+	case B_STREAM_SEEK_END:
+		base = (long long)str->s_len;
+		break;
+	default:
+		return B_ERR_INVALID_ARGUMENT;
+	}
+
+	/* a negative target would wrap to a huge unsigned cursor */
+	if (offset < -base) {
+		return B_ERR_INVALID_ARGUMENT;
+	}
+
+	stream->s_cursor = (size_t)(base + offset);
+
+	return B_SUCCESS;
+}
+
+/* stream reserve callback: reserve room for `len` bytes beyond the current
+ * length of the string. */
+static enum b_status stream_reserve(struct b_stream *stream, size_t len)
+{
+	struct b_string *str = stream->s_ptr;
+
+	size_t new_capacity = str->s_len + len;
+	return b_string_reserve(str, new_capacity);
+}
+
+/* create a readable and writable stream backed by `str` and store it in
+ * `out`. the stream holds its own reference to `str`, which is dropped by
+ * the stream's close callback. returns B_ERR_NO_MEMORY if the stream cannot
+ * be allocated. */
+enum b_status b_string_open_stream(struct b_string *str, struct b_stream **out)
+{
+	struct b_stream *stream = malloc(sizeof *stream);
+	if (!stream) {
+		return B_ERR_NO_MEMORY;
+	}
+
+	memset(stream, 0x0, sizeof *stream);
+
+	stream->s_mode |= B_STREAM_READ | B_STREAM_WRITE;
+
+	stream->s_ptr = b_string_retain(str);
+	stream->s_close = stream_close;
+	stream->s_getc = stream_getc;
+	stream->s_read = stream_read;
+	stream->s_write = stream_write;
+	stream->s_seek = stream_seek;
+	stream->s_reserve = stream_reserve;
+
+	*out = stream;
+
+	return B_SUCCESS;
+}
+
+/* b_iterator_ops adapter for b_string_iterator_next. */
+static bool string_iterator_next(struct b_iterator *it)
+{
+	return b_string_iterator_next((struct b_string_iterator *)it);
+}
+
+/* b_iterator_ops adapter for b_string_iterator_is_valid. */
+static bool string_iterator_is_valid(const struct b_iterator *it)
+{
+	return b_string_iterator_is_valid((struct b_string_iterator *)it);
+}
+
+static struct b_iterator_ops it_ops = {
+	.it_next = string_iterator_next,
+	.it_close = NULL,
+	.it_is_valid = string_iterator_is_valid,
+};
+
+/* release the token scratch string, if any, and zero the whole iterator. */
+static void iterator_cleanup(b_string_iterator *it)
+{
+	if (it->_tmp) {
+		b_string_release(it->_tmp);
+	}
+
+	memset(it, 0x0, sizeof *it);
+}
+
+/* position `it` on the first codepoint of `string`. returns 0 on success, or
+ * -1 with it->status set to B_ERR_NO_DATA (empty string) or B_ERR_BAD_FORMAT
+ * (the first codepoint could not be decoded). */
+int b_string_iterator_begin(const struct b_string *string, b_string_iterator *it)
+{
+	memset(it, 0x0, sizeof *it);
+
+	it->_base.it_ops = &it_ops;
+
+	if (!string->s_len) {
+		it->status = B_ERR_NO_DATA;
+		return -1;
+	}
+
+	const char *p = b_string_ptr(string);
+	it->_m = ITERATOR_MODE_CHARS;
+	it->_s = B_STRING(string);
+	it->char_value = utf8_codepoint_decode(p);
+
+	if (it->char_value == B_WCHAR_INVALID) {
+		it->status = B_ERR_BAD_FORMAT;
+		return -1;
+	}
+
+	return 0;
+}
+
+/* step a codepoint iterator forward. at the end of the string (B_ERR_NO_DATA)
+ * or on an undecodable codepoint (B_ERR_BAD_FORMAT) the iterator is reset,
+ * it->status is set accordingly, and false is returned. */
+static bool chars_iterator_next(b_string_iterator *it)
+{
+	if (!b_string_iterator_is_valid(it)) {
+		return false;
+	}
+
+	size_t stride = utf8_codepoint_size(it->char_value);
+	if (stride == 0) {
+		iterator_cleanup(it);
+		return false;
+	}
+
+	it->byte_index += stride;
+	it->codepoint_index += 1;
+
+	if (it->byte_index >= it->_s->s_len) {
+		iterator_cleanup(it);
+		it->_s = NULL;
+		it->byte_index = 0;
+		it->codepoint_index = 0;
+		it->char_value = B_WCHAR_INVALID;
+		it->status = B_ERR_NO_DATA;
+		return false;
+	}
+
+	char *p = string_ptr(it->_s) + it->byte_index;
+	it->char_value = utf8_codepoint_decode(p);
+	if (it->char_value == B_WCHAR_INVALID) {
+		iterator_cleanup(it);
+		it->_s = NULL;
+		it->byte_index = 0;
+		it->codepoint_index = 0;
+		it->char_value = B_WCHAR_INVALID;
+		it->status = B_ERR_BAD_FORMAT;
+		return false;
+	}
+
+	it->iteration_index++;
+	return true;
+}
+
+/* advance a token iterator to the next token. when no further token can be
+ * found the iterator is reset and false is returned. */
+static bool tokens_iterator_next(b_string_iterator *it)
+{
+	if (!b_string_iterator_is_valid(it)) {
+		return false;
+	}
+
+	enum b_status status = find_next_token(it);
+	if (!B_OK(status)) {
+		iterator_cleanup(it);
+		return false;
+	}
+
+	it->string_value = string_ptr(it->_tmp);
+	it->iteration_index++;
+
+	return true;
+}
+
+/* advance `it` according to the mode it was started in. returns true if the
+ * iterator now refers to another item. */
+bool b_string_iterator_next(b_string_iterator *it)
+{
+	switch (it->_m) {
+	case ITERATOR_MODE_CHARS:
+		return chars_iterator_next(it);
+	case ITERATOR_MODE_TOKENS:
+		return tokens_iterator_next(it);
+	default:
+		return false;
+	}
+}
+
+/* a codepoint iterator is valid while it has a string, its byte index is
+ * inside that string, and the current codepoint decoded successfully. */
+static bool chars_iterator_is_valid(const struct b_string_iterator *it)
+{
+	if (!it->_s) {
+		return false;
+	}
+
+	if (it->byte_index >= it->_s->s_len) {
+		return false;
+	}
+
+	if (it->char_value == B_WCHAR_INVALID) {
+		return false;
+	}
+
+	return true;
+}
+
+/* a token iterator is valid while it has a string, its byte index is inside
+ * that string, and a current token is available. */
+static bool tokens_iterator_is_valid(const struct b_string_iterator *it)
+{
+	if (!it->_s) {
+		return false;
+	}
+
+	if (it->byte_index >= it->_s->s_len) {
+		return false;
+	}
+
+	if (!it->string_value) {
+		return false;
+	}
+
+	return true;
+}
+
+/* returns true while `it` refers to a current codepoint or token. */
+bool b_string_iterator_is_valid(const struct b_string_iterator *it)
+{
+	switch (it->_m) {
+	case ITERATOR_MODE_CHARS:
+		return chars_iterator_is_valid(it);
+	case ITERATOR_MODE_TOKENS:
+		return tokens_iterator_is_valid(it);
+	default:
+		return false;
+	}
+}
+
 static void string_release(struct b_object *obj)
 {
 	struct b_string *str = B_STRING(obj);
@@ -624,7 +1696,10 @@ static void string_release(struct b_object *obj)
 static void string_to_string(struct b_object *obj, struct b_stream *out)
 {
 	b_string *str = B_STRING(obj);
-	b_stream_write_fmt(out, NULL, "%s", b_string_ptr(str));
+	const char *s = b_string_ptr(str);
+	for (size_t i = 0; i < str->s_len; i++) {
+		b_stream_write_char(out, s[i]);
+	}
 }
 
 char *b_strdup(const char *s)
@@ -677,6 +1752,51 @@ size_t b_strlen(const char *s, b_strlen_flags flags)
 	return out;
 }
 
+/* duplicate the zero-terminated wide string `s` into a newly calloc'd buffer.
+ * returns NULL if the allocation fails. */
+b_wchar *b_wstrdup(const b_wchar *s)
+{
+	size_t len = b_wstrlen(s);
+	b_wchar *buf = calloc(len + 1, sizeof(b_wchar));
+	if (!buf) {
+		return NULL;
+	}
+
+	memcpy(buf, s, len * sizeof(b_wchar));
+
+	return buf;
+}
+
+/* number of b_wchar elements in `s` before the terminating zero. */
+size_t b_wstrlen(const b_wchar *s)
+{
+	size_t len;
+	for (len = 0; s[len] != 0; len++)
+		;
+	return len;
+}
+
+/* hash every byte of `str`, including any null chars: each byte is XORed into
+ * the hash, which is then multiplied by the FNV prime. */
+uint64_t b_string_hash(const struct b_string *str)
+{
+#define FNV1_OFFSET_BASIS 0xcbf29ce484222325
+#define FNV1_PRIME 0x100000001b3
+	uint64_t hash = FNV1_OFFSET_BASIS;
+	size_t i = 0;
+
+	const char *s = b_string_ptr(str);
+
+	for (i = 0; i < str->s_len; i++) {
+		/* hash the raw octet: a plain (possibly signed) char would
+		 * sign-extend bytes >= 0x80 and flip the upper 56 bits too */
+		hash ^= (unsigned char)s[i];
+		hash *= FNV1_PRIME;
+	}
+
+	return hash;
+}
+
 b_object_type_id b_string_type_id(void)
 {
 	return (b_object_type_id)&string_type;
diff --git a/object/string.h b/object/string.h
index fecf196..4b39c5f 100644
--- a/object/string.h
+++ b/object/string.h
@@ -8,9 +8,14 @@
 
 struct b_string {
 	struct b_object s_base;
-	/* length of string, not including null-terminator */
+	/* length of string in bytes, not including null-terminator.
+ * a multi-byte utf-8 codepoint will be counted as multiple bytes here */ unsigned int s_len; - /* maximum length of string storable in the currently-allocated buffer, not including null terminator */ + /* length of string in codepoints, not including null-terminator. + * a multi-byte utf-8 codepoint will be counted as one codepoint here */ + unsigned int s_codepoints; + /* maximum length of string storable in the currently-allocated buffer + * in bytes, not including null terminator */ unsigned int s_max; union { char d_inline[STRING_INLINE_CAPACITY + 1];