core: string: add UTF-8 and null-char support; and some new string functions

b_string now uses UTF-8 internally, and can correctly manipulate strings
that contain non-ASCII and multi-byte codepoints.

b_string now tracks the length of a string in both bytes and unicode codepoints.

string insertion functions have been updated to correctly handle strings with
multi-byte codepoints, so the index parameter of each function now refers to codepoints
rather than bytes. inserting single-byte chars into a string with no multi-byte codepoints
is still optimised to use array indexing and memmove.

a b_string_iterator has been added to simplify iterating through a UTF-8 string, without
having to use a charAt()-style interface that would incur performance penalties.

strings can now also contain null bytes.

new functions include:
  - b_string_tokenise: a b_iterator interface for iterating through tokens
    in a string. similar to strtok except that:
    * it is re-entrant, and uses no global state.
    * it supports delimiters that are longer than one character and/or contain
      multi-byte UTF-8 codepoints.
    * it doesn't modify the string that is being iterated over.
    * it correctly handles strings with multi-byte UTF-8 codepoints and null chars.
  - b_string_compare: for comparing strings. necessary to use this rather than strcmp
    as b_strings can now contain null chars.
This commit is contained in:
2025-09-22 10:36:26 +01:00
parent cbaeb002f8
commit 2fcadf7f39
3 changed files with 1288 additions and 162 deletions

View File

@@ -1,6 +1,7 @@
#ifndef BLUELIB_STRING_H_
#define BLUELIB_STRING_H_
#include <blue/core/encoding.h>
#include <blue/core/status.h>
#include <blue/object/object.h>
#include <blue/object/type.h>
@@ -13,16 +14,44 @@ struct b_stream;
/* Convenience wrappers around b_string_create_from_cstr(). B_RV_CSTR additionally
 * passes the newly created string through B_RV(), which is defined elsewhere. */
#define B_CSTR(s) (b_string_create_from_cstr(s))
#define B_RV_CSTR(s) (B_RV(b_string_create_from_cstr(s)))
/* Expands to a for-loop header that walks `str` using the iterator `it`.
 * `it` is handed unchanged to b_string_iterator_begin(), _is_valid() and _next(),
 * all of which take a b_string_iterator pointer, and it is re-evaluated on every
 * pass, so it should be a side-effect-free pointer expression.
 * The int returned by b_string_iterator_begin() is stored in a loop-scoped
 * variable whose name comes from z__b_unique_name() (defined elsewhere;
 * presumably a unique-identifier generator) and is not otherwise used here. */
#define b_string_foreach(it, str) \
for (int z__b_unique_name() = b_string_iterator_begin(str, it); \
b_string_iterator_is_valid(it); b_string_iterator_next(it))
typedef struct b_string b_string;
/* Iterator state used by both b_string_iterator_begin() and b_string_tokenise(),
 * each of which takes a pointer to this structure.
 * NOTE(review): which public fields are populated in each mode is not visible
 * in this header — confirm against the implementation. */
typedef struct b_string_iterator {
/* generic iterator base object */
b_iterator _base;
/* underscore-prefixed members look like private implementation state; their
 * exact meaning is not visible here, so callers should presumably leave them alone */
int _m, _f;
b_string *_s, *_tmp;
/* presumably the delimiter array and related counts used when tokenising — TODO confirm */
const char **_d;
size_t _nd, _ds;
/* status associated with the iteration */
b_status status;
/* presumably the number of items visited so far — TODO confirm */
size_t iteration_index;
/* presumably the current position within the string, in bytes and in
 * codepoints respectively — TODO confirm */
size_t byte_index;
size_t codepoint_index;
/* presumably the codepoint at the current position — TODO confirm */
b_wchar char_value;
/* presumably the current token and its length in bytes and in codepoints;
 * since strings may contain null bytes, rely on string_length rather than
 * on a terminator — TODO confirm */
const char *string_value;
size_t string_length;
size_t string_codepoints;
} b_string_iterator;
/* Bit flags accepted by b_string_get_size() and b_strlen().
 * The non-zero values occupy distinct bits, so they can be OR-ed together.
 * NOTE(review): the precise effect of each flag is implemented elsewhere;
 * B_STRLEN_CODEPOINTS presumably selects a length in codepoints rather than
 * bytes — confirm. */
typedef enum b_strlen_flags {
	B_STRLEN_NORMAL = 0,
	B_STRLEN_IGNORE_ESC = 1u << 0,
	B_STRLEN_IGNORE_MOD = 1u << 1,
	B_STRLEN_CODEPOINTS = 1u << 2,
} b_strlen_flags;
/* Option flags for b_string_tokenise().
 * NOTE(review): going by its name, B_STRING_TOK_F_INCLUDE_EMPTY_TOKENS
 * presumably makes the tokeniser report empty tokens (e.g. between adjacent
 * delimiters) instead of skipping them — confirm against the implementation. */
typedef enum b_string_tokenise_flags {
	B_STRING_TOK_F_NORMAL = 0,
	B_STRING_TOK_F_INCLUDE_EMPTY_TOKENS = 1u << 0,
} b_string_tokenise_flags;
/* Constructors; each returns a pointer to a b_string.
 * NOTE(review): failure behaviour (e.g. a NULL return) and ownership of the
 * result are not visible in this header — confirm. */
BLUE_API b_string *b_string_create(void);
/* builds a string from a C string */
BLUE_API b_string *b_string_create_from_cstr(const char *s);
/* builds a string from a string of b_wchar values */
BLUE_API b_string *b_string_create_from_wstr(const b_wchar *s);
/* presumably a string made of `count` copies of `c` — TODO confirm */
BLUE_API b_string *b_string_create_from_c(char c, size_t count);
BLUE_API b_string *b_string_duplicate(const b_string *str);
@@ -41,6 +70,7 @@ BLUE_API b_status b_string_replace(
/* In-place modification helpers; each reports its outcome as a b_status.
 * NOTE(review): whether `start`/`length` in b_string_remove() count bytes or
 * codepoints is not visible in this header — confirm. */
BLUE_API b_status b_string_replace_all(b_string *str, const char *new_data);
BLUE_API b_status b_string_remove(b_string *str, size_t start, size_t length);
/* `transformer` has the int(int) shape of the <ctype.h> functions; toupper is
 * passed to it by b_string_toupper(). How multi-byte codepoints are presented
 * to the callback is not visible here. */
BLUE_API b_status b_string_transform(b_string *str, int (*transformer)(int));
BLUE_API b_status b_string_trim(b_string *str);
static inline b_status b_string_toupper(b_string *str)
{
return b_string_transform(str, toupper);
@@ -51,22 +81,42 @@ static inline b_status b_string_tolower(b_string *str)
}
BLUE_API b_status b_string_open_stream(b_string *str, struct b_stream **out);
BLUE_API void b_string_append_s(b_string *dest, const b_string *src);
BLUE_API void b_string_append_cstr(b_string *dest, const char *src);
BLUE_API void b_string_append_cstrf(b_string *dest, const char *format, ...);
BLUE_API void b_string_prepend_cstr(b_string *dest, const char *src);
BLUE_API void b_string_prepend_cstrf(b_string *dest, const char *format, ...);
BLUE_API void b_string_insert_s(b_string *dest, const b_string *src, size_t at);
BLUE_API void b_string_insert_cstr(b_string *dest, const char *src, size_t at);
BLUE_API void b_string_insert_cstrn(
/* Append, prepend and insert operations; each reports its outcome as a b_status.
 * The *_c variants take a single char, *_wc a single b_wchar, *_s another
 * b_string, *_cstr a C string, *_wstr a b_wchar string, and *_cstrf a
 * printf-style format with variadic arguments.
 * For the insert functions, `at` is an index in UTF-8 codepoints, not bytes. */
BLUE_API b_status b_string_append_c(b_string *dest, char c);
BLUE_API b_status b_string_append_wc(b_string *dest, b_wchar c);
BLUE_API b_status b_string_append_s(b_string *dest, const b_string *src);
BLUE_API b_status b_string_append_cstr(b_string *dest, const char *src);
BLUE_API b_status b_string_append_wstr(b_string *dest, const b_wchar *src);
BLUE_API b_status b_string_append_cstrf(b_string *dest, const char *format, ...);
BLUE_API b_status b_string_prepend_c(b_string *dest, char c);
BLUE_API b_status b_string_prepend_wc(b_string *dest, b_wchar c);
BLUE_API b_status b_string_prepend_cstr(b_string *dest, const char *src);
BLUE_API b_status b_string_prepend_wstr(b_string *dest, const b_wchar *src);
BLUE_API b_status b_string_prepend_cstrf(b_string *dest, const char *format, ...);
BLUE_API b_status b_string_insert_c(b_string *dest, char c, size_t at);
BLUE_API b_status b_string_insert_wc(b_string *dest, b_wchar c, size_t at);
BLUE_API b_status b_string_insert_s(b_string *dest, const b_string *src, size_t at);
BLUE_API b_status b_string_insert_cstr(b_string *dest, const char *src, size_t at);
BLUE_API b_status b_string_insert_wstr(
b_string *dest, const b_wchar *src, size_t at);
/* inserts at most `len` units of `src`.
 * NOTE(review): whether `len` counts bytes or codepoints is not visible here — confirm. */
BLUE_API b_status b_string_insert_cstrn(
b_string *dest, const char *src, size_t len, size_t at);
BLUE_API void b_string_insert_cstrf(
/* Length-limited insert of a wide string at codepoint index `at`.
 * NOTE(review): `src` is declared as `const char *`, while the sibling
 * b_string_insert_wstr() takes `const b_wchar *`. This looks like a copy-paste
 * from b_string_insert_cstrn(); confirm against the definition and change both
 * together if so. */
BLUE_API b_status b_string_insert_wstrn(
b_string *dest, const char *src, size_t len, size_t at);
/* Inserts printf-style formatted text; `at` precedes `format` because of the
 * variadic argument list. */
BLUE_API b_status b_string_insert_cstrf(
b_string *dest, size_t at, const char *format, ...);
BLUE_API void b_string_clear(b_string *str);
/* Starts iterating over the tokens of `str`, split on any of the `nr_delims`
 * delimiter strings in `delims`; tokens are reported through `it`.
 * Unlike strtok, this is re-entrant (no global state), accepts delimiters that
 * are longer than one character and/or contain multi-byte UTF-8 codepoints,
 * does not modify `str`, and handles multi-byte codepoints and null chars.
 * NOTE(review): the lifetime requirements on `delims` while `it` is in use are
 * not visible in this header — confirm. */
BLUE_API b_status b_string_tokenise(
b_string *str, const char *delims[], size_t nr_delims,
b_string_tokenise_flags flags, b_string_iterator *it);
/* Returns the size of `str`; `flags` is a combination of b_strlen_flags values.
 * The string tracks its length both in bytes and in codepoints;
 * B_STRLEN_CODEPOINTS presumably selects the latter — TODO confirm. */
BLUE_API size_t b_string_get_size(const b_string *str, b_strlen_flags flags);
BLUE_API size_t b_string_get_capacity(const b_string *str);
/* Compares two strings; needed instead of the C library string comparison
 * because a b_string may contain null chars.
 * NOTE(review): the return type is bool, so this is presumably an equality
 * test (true when equal) rather than a three-way ordering — confirm. */
BLUE_API bool b_string_compare(const b_string *a, const b_string *b);
/* NOTE(review): these return a single char; what they yield when the first or
 * last codepoint is multi-byte is not visible here — confirm. */
BLUE_API char b_string_front(const b_string *str);
BLUE_API char b_string_back(const b_string *str);
@@ -75,9 +125,16 @@ BLUE_API void b_string_pop_back(b_string *str);
/* Returns a read-only pointer to the string's character data. Because strings
 * may contain null bytes, pair it with b_string_get_size() rather than relying
 * on a terminator to find the end. */
BLUE_API const char *b_string_ptr(const b_string *str);
/* Returns a new b_string for the range described by `start` and `len`.
 * NOTE(review): whether these count bytes or codepoints is not visible here — confirm. */
BLUE_API b_string *b_string_substr(const b_string *str, size_t start, size_t len);
/* Iterator interface for walking a UTF-8 string without a charAt()-style
 * lookup for every position. b_string_foreach() drives these three functions:
 * begin once, then is_valid as the loop condition and next as the step.
 * NOTE(review): the meaning of the int returned by begin and the bool returned
 * by next is not visible in this header — confirm. */
BLUE_API int b_string_iterator_begin(const b_string *string, b_string_iterator *it);
BLUE_API bool b_string_iterator_next(b_string_iterator *it);
// BLUE_API b_status b_string_iterator_erase(b_string_iterator *it);
BLUE_API bool b_string_iterator_is_valid(const b_string_iterator *it);
/* Helpers operating on plain C strings and b_wchar strings rather than on b_string.
 * NOTE(review): b_strdup()/b_wstrdup() presumably return newly allocated copies
 * that the caller must release — confirm the matching deallocation function. */
BLUE_API char *b_strdup(const char *s);
/* `flags` is a combination of b_strlen_flags values */
BLUE_API size_t b_strlen(const char *s, b_strlen_flags flags);
BLUE_API b_wchar *b_wstrdup(const b_wchar *s);
BLUE_API size_t b_wstrlen(const b_wchar *s);
/* 64-bit hashes of a C string and of a b_string respectively */
BLUE_API uint64_t b_cstr_hash(const char *s);
BLUE_API uint64_t b_string_hash(const b_string *s);
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -8,9 +8,14 @@
struct b_string {
struct b_object s_base;
/* length of string, not including null-terminator */
/* length of string in bytes, not including null-terminator.
* a multi-byte utf-8 codepoint will be counted as multiple bytes here */
unsigned int s_len;
/* maximum length of string storable in the currently-allocated buffer, not including null terminator */
/* length of string in codepoints, not including null-terminator.
* a multi-byte utf-8 codepoint will be counted as one codepoint here */
unsigned int s_codepoints;
/* maximum length of string storable in the currently-allocated buffer
* in bytes, not including null terminator */
unsigned int s_max;
union {
char d_inline[STRING_INLINE_CAPACITY + 1];