core: string: add UTF-8 and null-char support; and some new string functions
b_string now uses UTF-8 internally, and can correctly manipulate strings
that contain non-ASCII and multi-byte codepoints.
b_string now tracks the length of a string in both bytes and unicode codepoints.
string insertion functions have been updated to correctly handle strings with
multi-byte codepoints, so the index parameter of each function now refers to codepoints
rather than bytes. inserting single-byte chars into a string with no multi-byte codepoints
is still optimised to use array indexing and memmove.
a b_string_iterator has been added to simplify iterating through a UTF-8 string, without
having to use a charAt()-style interface that would incur performance penalties.
strings can now also contain null bytes.
new functions include:
- b_string_tokenise: a b_iterator interface for iterating through tokens
in a string. similar to strtok except that:
* it is re-entrant, and uses no global state.
* it supports delimiters that are longer than one character and/or contain
multi-byte UTF-8 codepoints.
* it doesn't modify the string that is being iterated over.
* it correctly handles strings with multi-byte UTF-8 codepoints and null chars.
- b_string_compare: for comparing strings. necessary to use this rather than strcmp
as b_strings can now contain null chars.
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
#ifndef BLUELIB_STRING_H_
|
||||
#define BLUELIB_STRING_H_
|
||||
|
||||
#include <blue/core/encoding.h>
|
||||
#include <blue/core/status.h>
|
||||
#include <blue/object/object.h>
|
||||
#include <blue/object/type.h>
|
||||
@@ -13,16 +14,44 @@ struct b_stream;
|
||||
#define B_CSTR(s) (b_string_create_from_cstr(s))
|
||||
#define B_RV_CSTR(s) (B_RV(b_string_create_from_cstr(s)))
|
||||
|
||||
/* Expands to a `for` loop header over `str` using the iterator `it`.
 * b_string_iterator_begin(str, it) runs once in the loop initialiser, the
 * loop continues while b_string_iterator_is_valid(it) is true, and
 * b_string_iterator_next(it) runs after each pass.
 * `it` is expanded three times, so pass an expression without side effects.
 * NOTE(review): z__b_unique_name() is presumably a macro that yields a unique
 * identifier for the throwaway `int` holding begin()'s result -- confirm. */
#define b_string_foreach(it, str) \
|
||||
for (int z__b_unique_name() = b_string_iterator_begin(str, it); \
|
||||
b_string_iterator_is_valid(it); b_string_iterator_next(it))
|
||||
|
||||
typedef struct b_string b_string;
|
||||
|
||||
/* Iterator state passed to b_string_iterator_begin(), b_string_iterator_next(),
 * b_string_iterator_is_valid() and b_string_tokenise().
 * NOTE(review): the underscore-prefixed members look like private bookkeeping
 * and the remaining members like the per-iteration results exposed to callers;
 * their exact meaning is defined in string.c, which is not visible here --
 * confirm before relying on any of them. */
typedef struct b_string_iterator {
|
||||
/* presumably private state (underscore-prefixed) -- TODO confirm */
b_iterator _base;
|
||||
int _m, _f;
|
||||
b_string *_s, *_tmp;
|
||||
const char **_d;
|
||||
size_t _nd, _ds;
|
||||
|
||||
/* presumably the caller-visible results of the current iteration -- TODO confirm */
b_status status;
|
||||
size_t iteration_index;
|
||||
size_t byte_index;
|
||||
size_t codepoint_index;
|
||||
b_wchar char_value;
|
||||
const char *string_value;
|
||||
size_t string_length;
|
||||
size_t string_codepoints;
|
||||
} b_string_iterator;
|
||||
|
||||
/* Flags accepted by b_string_get_size() and b_strlen().
 * The non-zero values are distinct single bits (0x01, 0x02, 0x04).
 * NOTE(review): B_STRLEN_CODEPOINTS presumably selects a length in unicode
 * codepoints instead of bytes; the IGNORE_ESC / IGNORE_MOD semantics are not
 * visible in this header -- confirm against string.c. */
typedef enum b_strlen_flags {
|
||||
B_STRLEN_NORMAL = 0,
|
||||
B_STRLEN_IGNORE_ESC = 0x01u,
|
||||
B_STRLEN_IGNORE_MOD = 0x02u,
|
||||
B_STRLEN_CODEPOINTS = 0x04u,
|
||||
} b_strlen_flags;
|
||||
|
||||
/* Flags accepted by b_string_tokenise().
 * NOTE(review): B_STRING_TOK_F_INCLUDE_EMPTY_TOKENS presumably makes the
 * tokeniser report empty tokens between adjacent delimiters -- confirm
 * against string.c. */
typedef enum b_string_tokenise_flags {
|
||||
B_STRING_TOK_F_NORMAL = 0x00u,
|
||||
B_STRING_TOK_F_INCLUDE_EMPTY_TOKENS = 0x01u,
|
||||
} b_string_tokenise_flags;
|
||||
|
||||
BLUE_API b_string *b_string_create(void);
|
||||
BLUE_API b_string *b_string_create_from_cstr(const char *s);
|
||||
BLUE_API b_string *b_string_create_from_wstr(const b_wchar *s);
|
||||
BLUE_API b_string *b_string_create_from_c(char c, size_t count);
|
||||
BLUE_API b_string *b_string_duplicate(const b_string *str);
|
||||
|
||||
@@ -41,6 +70,7 @@ BLUE_API b_status b_string_replace(
|
||||
BLUE_API b_status b_string_replace_all(b_string *str, const char *new_data);
|
||||
BLUE_API b_status b_string_remove(b_string *str, size_t start, size_t length);
|
||||
BLUE_API b_status b_string_transform(b_string *str, int (*transformer)(int));
|
||||
BLUE_API b_status b_string_trim(b_string *str);
|
||||
static inline b_status b_string_toupper(b_string *str)
|
||||
{
|
||||
return b_string_transform(str, toupper);
|
||||
@@ -51,22 +81,42 @@ static inline b_status b_string_tolower(b_string *str)
|
||||
}
|
||||
BLUE_API b_status b_string_open_stream(b_string *str, struct b_stream **out);
|
||||
|
||||
BLUE_API void b_string_append_s(b_string *dest, const b_string *src);
|
||||
BLUE_API void b_string_append_cstr(b_string *dest, const char *src);
|
||||
BLUE_API void b_string_append_cstrf(b_string *dest, const char *format, ...);
|
||||
BLUE_API void b_string_prepend_cstr(b_string *dest, const char *src);
|
||||
BLUE_API void b_string_prepend_cstrf(b_string *dest, const char *format, ...);
|
||||
BLUE_API void b_string_insert_s(b_string *dest, const b_string *src, size_t at);
|
||||
BLUE_API void b_string_insert_cstr(b_string *dest, const char *src, size_t at);
|
||||
BLUE_API void b_string_insert_cstrn(
|
||||
BLUE_API b_status b_string_append_c(b_string *dest, char c);
|
||||
BLUE_API b_status b_string_append_wc(b_string *dest, b_wchar c);
|
||||
BLUE_API b_status b_string_append_s(b_string *dest, const b_string *src);
|
||||
BLUE_API b_status b_string_append_cstr(b_string *dest, const char *src);
|
||||
BLUE_API b_status b_string_append_wstr(b_string *dest, const b_wchar *src);
|
||||
BLUE_API b_status b_string_append_cstrf(b_string *dest, const char *format, ...);
|
||||
|
||||
BLUE_API b_status b_string_prepend_c(b_string *dest, char c);
|
||||
BLUE_API b_status b_string_prepend_wc(b_string *dest, b_wchar c);
|
||||
BLUE_API b_status b_string_prepend_cstr(b_string *dest, const char *src);
|
||||
BLUE_API b_status b_string_prepend_wstr(b_string *dest, const b_wchar *src);
|
||||
BLUE_API b_status b_string_prepend_cstrf(b_string *dest, const char *format, ...);
|
||||
|
||||
BLUE_API b_status b_string_insert_c(b_string *dest, char c, size_t at);
|
||||
BLUE_API b_status b_string_insert_wc(b_string *dest, b_wchar c, size_t at);
|
||||
BLUE_API b_status b_string_insert_s(b_string *dest, const b_string *src, size_t at);
|
||||
BLUE_API b_status b_string_insert_cstr(b_string *dest, const char *src, size_t at);
|
||||
BLUE_API b_status b_string_insert_wstr(
|
||||
b_string *dest, const b_wchar *src, size_t at);
|
||||
BLUE_API b_status b_string_insert_cstrn(
|
||||
b_string *dest, const char *src, size_t len, size_t at);
|
||||
BLUE_API void b_string_insert_cstrf(
|
||||
/* NOTE(review): `src` is declared as `const char *` here, whereas the sibling
 * b_string_insert_wstr() takes `const b_wchar *`. This looks like a copy-paste
 * slip from b_string_insert_cstrn() -- confirm against the definition in
 * string.c before changing the type. */
BLUE_API b_status b_string_insert_wstrn(
|
||||
b_string *dest, const char *src, size_t len, size_t at);
|
||||
BLUE_API b_status b_string_insert_cstrf(
|
||||
b_string *dest, size_t at, const char *format, ...);
|
||||
BLUE_API void b_string_clear(b_string *str);
|
||||
|
||||
/* Set up `it` to iterate through the tokens of `str`, split on any of the
 * `nr_delims` delimiter strings in `delims`.
 * Per the commit description this is similar to strtok, except that it is
 * re-entrant (no global state), delimiters may be longer than one character
 * and may contain multi-byte UTF-8 codepoints, `str` is not modified, and
 * strings containing multi-byte codepoints or null chars are handled. */
BLUE_API b_status b_string_tokenise(
|
||||
b_string *str, const char *delims[], size_t nr_delims,
|
||||
b_string_tokenise_flags flags, b_string_iterator *it);
|
||||
|
||||
BLUE_API size_t b_string_get_size(const b_string *str, b_strlen_flags flags);
|
||||
BLUE_API size_t b_string_get_capacity(const b_string *str);
|
||||
|
||||
/* Compare two b_strings. Use this rather than the C string-comparison
 * functions, because a b_string may contain null chars.
 * NOTE(review): presumably returns true when `a` and `b` are equal -- confirm
 * against string.c. */
BLUE_API bool b_string_compare(const b_string *a, const b_string *b);
|
||||
|
||||
BLUE_API char b_string_front(const b_string *str);
|
||||
BLUE_API char b_string_back(const b_string *str);
|
||||
|
||||
@@ -75,9 +125,16 @@ BLUE_API void b_string_pop_back(b_string *str);
|
||||
BLUE_API const char *b_string_ptr(const b_string *str);
|
||||
BLUE_API b_string *b_string_substr(const b_string *str, size_t start, size_t len);
|
||||
|
||||
/* Iterator entry points used by the b_string_foreach() macro: begin() is
 * called once with the string and the iterator, is_valid() is the loop
 * condition and next() is the loop step.
 * NOTE(review): the meaning of begin()'s int result and next()'s bool result
 * is not visible in this header -- confirm against string.c. */
BLUE_API int b_string_iterator_begin(const b_string *string, b_string_iterator *it);
|
||||
BLUE_API bool b_string_iterator_next(b_string_iterator *it);
|
||||
// BLUE_API b_status b_string_iterator_erase(b_string_iterator *it);
|
||||
BLUE_API bool b_string_iterator_is_valid(const b_string_iterator *it);
|
||||
|
||||
BLUE_API char *b_strdup(const char *s);
|
||||
BLUE_API size_t b_strlen(const char *s, b_strlen_flags flags);
|
||||
BLUE_API b_wchar *b_wstrdup(const b_wchar *s);
|
||||
BLUE_API size_t b_wstrlen(const b_wchar *s);
|
||||
|
||||
BLUE_API uint64_t b_cstr_hash(const char *s);
|
||||
BLUE_API uint64_t b_string_hash(const b_string *s);
|
||||
|
||||
#endif
|
||||
|
||||
1350
object/string.c
1350
object/string.c
File diff suppressed because it is too large
Load Diff
@@ -8,9 +8,14 @@
|
||||
|
||||
struct b_string {
|
||||
struct b_object s_base;
|
||||
/* length of string, not including null-terminator */
|
||||
/* length of string in bytes, not including null-terminator.
|
||||
* a multi-byte utf-8 codepoint will be counted as multiple bytes here */
|
||||
unsigned int s_len;
|
||||
/* maximum length of string storable in the currently-allocated buffer, not including null terminator */
|
||||
/* length of string in codepoints, not including null-terminator.
|
||||
* a multi-byte utf-8 codepoint will be counted as one codepoint here */
|
||||
unsigned int s_codepoints;
|
||||
/* maximum length of string storable in the currently-allocated buffer
|
||||
* in bytes, not including null terminator */
|
||||
unsigned int s_max;
|
||||
union {
|
||||
char d_inline[STRING_INLINE_CAPACITY + 1];
|
||||
|
||||
Reference in New Issue
Block a user