#include #include #include #include #include #include #include #include #include /* maximum length of string that can be stored inline, not including null-terminator */ #define STRING_INLINE_CAPACITY 15 #define IS_VALID_UTF8_SCALAR(x) \ (((x) >= 0x0000 && (x) <= 0xD7FF) || ((x) >= 0xE000 && (x) <= 0x10FFFF)) #define STRING_TOK_F_FOUND_DELIM 0x80 /*** PRIVATE DATA *************************************************************/ static struct b_iterator_ops it_ops; enum iterator_mode { ITERATOR_MODE_NONE = 0, ITERATOR_MODE_CHARS, ITERATOR_MODE_TOKENS, }; struct b_string_p { /* length of string in bytes, not including null-terminator. * a multi-byte utf-8 codepoint will be counted as multiple bytes here */ unsigned int s_len; /* length of string in codepoints, not including null-terminator. * a multi-byte utf-8 codepoint will be counted as one codepoint here */ unsigned int s_codepoints; /* maximum length of string storable in the currently-allocated buffer * in bytes, not including null terminator */ unsigned int s_max; union { char d_inline[STRING_INLINE_CAPACITY + 1]; char *d_external; } s_data; }; /*** PRIVATE FUNCTIONS ********************************************************/ static bool string_is_inline(const struct b_string_p *str) { /* strings cannot go below STRING_INLINE_CAPACITY capacity */ return str->s_max == STRING_INLINE_CAPACITY; } static char *string_ptr(const struct b_string_p *str) { if (string_is_inline(str)) { return (char *)str->s_data.d_inline; } return str->s_data.d_external; } static size_t utf8_codepoint_size(b_wchar c) { if (!IS_VALID_UTF8_SCALAR(c)) { return 0; } if (c <= 0x7F) { return 1; } if (c <= 0x7FF) { return 2; } if (c <= 0xFFFF) { return 3; } if (c <= 0x10FFFF) { return 4; } return 0; } static int32_t decode_utf8_trailer_byte(char c) { if (!(c & 0x80) || (c & 0x40)) { return -1; } return c & 0x3F; } static b_wchar utf8_codepoint_decode(const char *s) { b_wchar result = 0; int len = 0; if (!(s[0] & 0x80)) { len = 1; result = s[0] & 0x7F; } else if (s[0] & 0xC0 && !(s[0] & 0x20)) { len = 2; result = s[0] & 0x1F; result <<= 6; } else if (s[0] & 0xE0 && !(s[0] & 0x10)) { len = 3; result = s[0] & 0x0F; result <<= 12; } else if (s[0] & 0xF0 && !(s[0] & 0x08)) { len = 4; result = s[0] & 0x07; result <<= 18; } else { return B_WCHAR_INVALID; } for (int i = 1; i < len; i++) { int32_t c = decode_utf8_trailer_byte(s[i]); if (c == -1) { return B_WCHAR_INVALID; } c <<= 6 * (len - i - 1); result |= c; } if (!IS_VALID_UTF8_SCALAR(result)) { return B_WCHAR_INVALID; } return result; } static size_t utf8_codepoint_encode(b_wchar c, char s[4]) { size_t len = utf8_codepoint_size(c); switch (len) { case 1: s[0] = c & 0x7F; break; case 2: s[0] = ((c >> 6) & 0x1F) | 0xC0; s[1] = (c & 0x3F) | 0x80; break; case 3: s[0] = ((c >> 12) & 0x0F) | 0xE0; s[1] = ((c >> 6) & 0x3F) | 0x80; s[2] = (c & 0x3F) | 0x80; break; case 4: s[0] = ((c >> 18) & 0x07) | 0xF0; s[1] = ((c >> 12) & 0x3F) | 0x80; s[2] = ((c >> 6) & 0x3F) | 0x80; s[3] = (c & 0x3F) | 0x80; break; default: return 0; } return len; } static size_t codepoint_stride(const char *s) { char c = *s; if (!(c & 0x80)) { return 1; } if ((c & 0xC0) && !(c & 0x20)) { return 2; } if ((c & 0xE0) && !(c & 0x10)) { return 3; } if ((c & 0xF0) && !(c & 0x08)) { return 4; } return 0; } static size_t get_number_of_codepoints(const char *s, size_t len) { size_t nr_codepoints = 0; const char *end = s + len; while (*s && s < end) { size_t stride = codepoint_stride(s); if (stride == 0) { /* invalid codepoint */ return 0; } nr_codepoints++; s += stride; } if (*s != 0) { /* string is not null-terminated */ return 0; } return nr_codepoints; } static size_t get_utf8_encoded_size(const b_wchar *s, size_t nr_codepoints) { size_t len = 0; for (size_t i = 0; i < nr_codepoints; i++) { size_t l = utf8_codepoint_size(s[i]); if (l == 0) { /* invalid codepoint */ return 0; } len += l; } return len; } static enum b_status convert_codepoint_range_to_byte_range( const struct b_string_p *str, size_t cp_start, size_t cp_length, size_t *out_byte_start, size_t *out_byte_length) { const char *s = string_ptr(str); size_t byte_offset = 0, byte_length = 0; for (size_t i = 0; i < cp_start; i++) { const char *cp = &s[byte_offset]; if (!cp || byte_offset >= str->s_len) { /* out of range */ return B_ERR_OUT_OF_BOUNDS; } size_t stride = codepoint_stride(cp); if (!stride) { /* invalid codepoint */ return B_ERR_BAD_STATE; } byte_offset += stride; } for (size_t i = 0; i < cp_length; i++) { size_t cp_offset = byte_offset + byte_length; const char *cp = &s[cp_offset]; if (!cp || (cp_offset >= str->s_len)) { /* out of range */ return B_ERR_OUT_OF_BOUNDS; } size_t stride = codepoint_stride(cp); if (!stride) { /* invalid codepoint */ return B_ERR_BAD_STATE; } byte_length += stride; } if (out_byte_start) { *out_byte_start = byte_offset; } if (out_byte_length) { *out_byte_length = byte_length; } return B_SUCCESS; } static char *get_next_codepoint(struct b_string_p *str, char *this_codepoint) { char c = *this_codepoint; char *end = this_codepoint - 1; size_t len = 0; if (!(c & 0x80)) { len = 1; } else if ((c & 0xC0) && !(c & 0x20)) { len = 2; } else if ((c & 0xE0) && !(c & 0x10)) { len = 3; } else if ((c & 0xF0) && !(c & 0x08)) { len = 4; } else { return NULL; } return this_codepoint + len; } static char *get_previous_codepoint(struct b_string_p *str, char *this_codepoint) { char *start = string_ptr(str); char *end = this_codepoint - 1; while (end >= start) { char c = *end; if ((c & 0x80) && !(c & 0x40)) { end--; continue; } if ((c & 0xF0) && !(c & 0x08)) { return end; } if ((c & 0xE0) && !(c & 0x10)) { return end; } if ((c & 0xC0) && !(c & 0x20)) { return end; } if (!(c & 0x80)) { return end; } } return NULL; } static char *get_last_codepoint(struct b_string_p *str) { if (str->s_len == 0) { return NULL; } return get_previous_codepoint(str, string_ptr(str) + str->s_len); } static int string_make_inline(struct b_string_p *str) { char *buffer = string_ptr(str); memcpy(str->s_data.d_inline, buffer, sizeof str->s_data.d_inline); str->s_data.d_inline[sizeof str->s_data.d_inline - 1] = '\0'; str->s_max = STRING_INLINE_CAPACITY; if (str->s_len >= str->s_max) { str->s_len = str->s_max; } free(buffer); return 0; } static int string_resize_large(struct b_string_p *str, size_t capacity) { char *buffer = string_ptr(str); char *new_buffer = realloc(buffer, capacity + 1); if (!new_buffer) { return -1; } str->s_max = capacity; str->s_data.d_external = new_buffer; return 0; } static int string_make_large(struct b_string_p *str, size_t capacity) { const char *old_buffer = string_ptr(str); char *buffer = malloc(capacity + 1); if (!buffer) { return -1; } memcpy(buffer, old_buffer, sizeof str->s_data.d_inline); buffer[str->s_len] = '\0'; str->s_max = capacity; str->s_data.d_external = buffer; return 0; } static int string_change_capacity(struct b_string_p *str, size_t capacity) { size_t old_capacity = str->s_max; if (capacity < STRING_INLINE_CAPACITY) { capacity = STRING_INLINE_CAPACITY; } bool was_inline = string_is_inline(str); bool is_now_inline = capacity == STRING_INLINE_CAPACITY; if (capacity == old_capacity) { /* this also handles the case where the old and new capacity both fit into the inline buffer. */ return 0; } if (!was_inline && is_now_inline) { /* string was large, is now small enough to fit inline. */ return string_make_inline(str); } if (!was_inline) { /* string was large, and is still large. */ return string_resize_large(str, capacity); } if (!is_now_inline) { /* string was inline, and now large enough to require a buffer. */ return string_make_large(str, capacity); } /* nothing to do */ return 0; } static b_string *string_duplicate(const struct b_string_p *str) { b_string *new_str = b_string_create(); if (!str) { return NULL; } struct b_string_p *new_str_p = b_object_get_private(new_str, B_TYPE_STRING); string_change_capacity(new_str_p, str->s_len); const char *src = string_ptr(str); char *dst = string_ptr(new_str_p); memcpy(dst, src, str->s_len); new_str_p->s_len = str->s_len; new_str_p->s_codepoints = str->s_codepoints; return new_str; } static char *string_steal(struct b_string_p *str) { char *dest = NULL; char *src = string_ptr(str); if (string_is_inline(str)) { dest = malloc(str->s_len + 1); memcpy(dest, src, str->s_len); dest[str->s_len] = 0; src[0] = 0; } else { dest = src; str->s_data.d_external = NULL; str->s_max = STRING_INLINE_CAPACITY; } str->s_len = 0; str->s_codepoints = 0; return dest; } static b_status string_reserve(struct b_string_p *str, size_t capacity) { if (str->s_max >= capacity) { return B_SUCCESS; } int err = string_change_capacity(str, capacity); return err == 0 ? B_SUCCESS : B_ERR_NO_MEMORY; } static enum b_status replace_ansi( struct b_string_p *str, size_t start, size_t length, const char *new_data) { b_status status = B_SUCCESS; size_t new_data_len = strlen(new_data); if (start >= str->s_len) { return B_ERR_INVALID_ARGUMENT; } if (start + length >= str->s_len) { length = str->s_len - start; } size_t new_str_len = str->s_len - length + new_data_len; if (new_str_len > str->s_max) { status = string_reserve(str, new_str_len); } if (!B_OK(status)) { return status; } char *s = string_ptr(str); char *substitution_start = s + start; char *excess_src = s + start + length; size_t excess_length = str->s_len - start - length; char *excess_dest = substitution_start + new_data_len; memmove(excess_dest, excess_src, excess_length); memmove(substitution_start, new_data, new_data_len); s[new_str_len] = '\0'; str->s_len = new_str_len; return B_SUCCESS; } static enum b_status replace_utf8( struct b_string_p *str, size_t start, size_t length, const char *new_data) { if (start >= str->s_codepoints) { return B_ERR_INVALID_ARGUMENT; } if (start + length >= str->s_codepoints) { length = str->s_codepoints - start; } size_t new_data_nr_bytes = strlen(new_data); size_t new_data_nr_codepoints = get_number_of_codepoints(new_data, new_data_nr_bytes); if (new_data_nr_codepoints == 0) { /* new_data is not a valid utf-8 string */ return B_ERR_INVALID_ARGUMENT; } size_t old_data_offset = 0, old_data_nr_bytes = 0; size_t old_data_nr_codepoints = length; enum b_status status = convert_codepoint_range_to_byte_range( str, start, length, &old_data_offset, &old_data_nr_bytes); if (!B_OK(status)) { return status; } size_t new_total_bytes = str->s_len - old_data_nr_bytes + new_data_nr_bytes; if (new_total_bytes > str->s_max) { status = string_reserve(str, new_total_bytes); } if (!B_OK(status)) { return status; } char *s = string_ptr(str); char *substitution_start = s + old_data_offset; char *excess_src = s + old_data_offset + old_data_nr_bytes; size_t excess_length = str->s_len - old_data_offset - old_data_nr_bytes; char *excess_dest = substitution_start + new_data_nr_bytes; memmove(excess_dest, excess_src, excess_length); memmove(substitution_start, new_data, new_data_nr_bytes); s[new_total_bytes] = '\0'; str->s_len = new_total_bytes; str->s_codepoints -= old_data_nr_codepoints; str->s_codepoints += new_data_nr_codepoints; return B_SUCCESS; } static b_status string_replace( struct b_string_p *str, size_t start, size_t length, const char *new_data) { if (str->s_len == str->s_codepoints) { return replace_ansi(str, start, length, new_data); } return replace_utf8(str, start, length, new_data); } static b_status string_replace_all(struct b_string_p *str, const char *new_data) { size_t new_len = strlen(new_data); string_reserve(str, new_len); char *dest = string_ptr(str); memcpy(dest, new_data, new_len); dest[new_len] = '\0'; str->s_len = new_len; return B_SUCCESS; } static enum b_status remove_ansi(struct b_string_p *str, size_t start, size_t length) { b_status status = B_SUCCESS; if (start >= str->s_len) { return B_ERR_INVALID_ARGUMENT; } if (start + length >= str->s_len) { length = str->s_len - start; } size_t new_str_len = str->s_len - length; char *s = string_ptr(str); char *removal_start = s + start; char *excess_src = s + start + length; size_t excess_length = str->s_len - start - length; memmove(removal_start, excess_src, excess_length); s[new_str_len] = '\0'; str->s_len = new_str_len; return B_SUCCESS; } static enum b_status remove_utf8(struct b_string_p *str, size_t start, size_t length) { size_t remove_offset = 0, remove_nr_bytes = 0; enum b_status status = convert_codepoint_range_to_byte_range( str, start, length, &remove_offset, &remove_nr_bytes); if (!B_OK(status)) { return status; } size_t new_total_bytes = str->s_len - remove_nr_bytes; char *s = string_ptr(str); char *removal_start = s + remove_offset; char *excess_src = s + remove_offset + remove_nr_bytes; size_t excess_length = str->s_len - remove_offset - remove_nr_bytes; memmove(removal_start, excess_src, excess_length); s[new_total_bytes] = '\0'; str->s_len = new_total_bytes; str->s_codepoints -= length; return B_SUCCESS; } static enum b_status string_remove( struct b_string_p *str, size_t start, size_t length) { if (str->s_len == str->s_codepoints) { return remove_ansi(str, start, length); } return remove_utf8(str, start, length); } static b_status string_transform(struct b_string_p *str, int (*transformer)(int)) { char *s = string_ptr(str); for (size_t i = 0; i < str->s_len; i++) { int c = transformer(s[i]); if (c != 0) { s[i] = c; } } return B_SUCCESS; } static enum b_status trim_ansi(struct b_string_p *str) { char *s = string_ptr(str); size_t whitespace_end = 0; for (size_t i = 0; i < str->s_len; i++) { if (!isspace(s[i])) { whitespace_end = i; break; } } memmove(s, s + whitespace_end, str->s_len - whitespace_end); str->s_len -= whitespace_end; for (long i = str->s_len - 1; i >= 0; i--) { if (isspace(s[i])) { s[i] = 0; str->s_len--; } else { break; } } return B_SUCCESS; } static enum b_status trim_utf8(struct b_string_p *str) { char *s = string_ptr(str); size_t whitespace_end = 0; size_t nr_whitespace_codepoints = 0; for (size_t i = 0; i < str->s_len;) { b_wchar c = utf8_codepoint_decode(&s[i]); if (!b_wchar_is_space(s[i])) { whitespace_end = i; break; } nr_whitespace_codepoints++; } memmove(s, s + whitespace_end, str->s_len - whitespace_end); str->s_len -= whitespace_end; str->s_codepoints -= nr_whitespace_codepoints; char *p = get_last_codepoint(str); if (!p) { return B_ERR_BAD_STATE; } for (long i = str->s_len - 1; i >= 0;) { b_wchar c = utf8_codepoint_decode(p); size_t c_size = utf8_codepoint_size(c); if (b_wchar_is_space(c)) { memset(p, 0, c_size); str->s_len -= c_size; str->s_codepoints--; } else { break; } p = get_previous_codepoint(str, p); } return B_SUCCESS; } static b_status string_trim(struct b_string_p *str) { if (str->s_len == 0) { return B_SUCCESS; } if (str->s_len == str->s_codepoints) { return trim_ansi(str); } return trim_utf8(str); } static enum b_status string_insert_cstr_ansi( struct b_string_p *dest, const char *src, size_t nr_bytes, size_t at) { if (at >= dest->s_len) { at = dest->s_len; } size_t new_size = dest->s_len + nr_bytes; if (dest->s_max < new_size) { string_change_capacity(dest, new_size); } char *dest_buf = string_ptr(dest); char *from = dest_buf + at; char *to = dest_buf + at + nr_bytes; memmove(to, from, dest->s_len - at); memcpy(from, src, nr_bytes); dest_buf[new_size] = '\0'; dest->s_len = new_size; dest->s_codepoints += nr_bytes; return B_SUCCESS; } static enum b_status string_insert_cstr_utf8( struct b_string_p *dest, const char *src, size_t nr_bytes, size_t codepoint_offset) { if (codepoint_offset >= dest->s_codepoints) { codepoint_offset = dest->s_codepoints; } size_t byte_offset = 0; enum b_status status = B_SUCCESS; if (codepoint_offset == dest->s_codepoints) { byte_offset = dest->s_len; } else { status = convert_codepoint_range_to_byte_range( dest, 0, codepoint_offset, NULL, &byte_offset); } if (!B_OK(status)) { return status; } size_t new_total_bytes = dest->s_len + nr_bytes; if (dest->s_max < new_total_bytes) { string_change_capacity(dest, new_total_bytes); } char *dest_buf = string_ptr(dest); char *from = dest_buf + byte_offset; char *to = dest_buf + byte_offset + nr_bytes; memmove(to, from, dest->s_len - byte_offset); memcpy(from, src, nr_bytes); dest_buf[new_total_bytes] = '\0'; dest->s_len += nr_bytes; dest->s_codepoints += get_number_of_codepoints(src, nr_bytes); return B_SUCCESS; } static enum b_status string_insert_wstr_ansi( struct b_string_p *dest, const b_wchar *src, size_t nr_codepoints, size_t at) { if (at >= dest->s_len) { at = dest->s_len; } size_t utf8_encoded_size = get_utf8_encoded_size(src, nr_codepoints); if (utf8_encoded_size == 0) { return B_ERR_INVALID_ARGUMENT; } size_t new_total_bytes = dest->s_len + utf8_encoded_size; if (dest->s_max < new_total_bytes) { string_change_capacity(dest, new_total_bytes); } char *dest_buf = string_ptr(dest); char *from = dest_buf + at; char *to = dest_buf + at + utf8_encoded_size; memmove(to, from, dest->s_len - at); char *ptr = dest_buf + at; for (size_t i = 0; i < nr_codepoints; i++) { char c[4]; size_t c_len = utf8_codepoint_encode(src[i], c); if (c_len == 0) { /* the input string was already checked by * get_utf8_encoded_size, so this should never happen */ return B_ERR_INVALID_ARGUMENT; } memcpy(ptr, c, c_len); ptr += c_len; } dest_buf[new_total_bytes] = '\0'; dest->s_len += utf8_encoded_size; dest->s_codepoints += nr_codepoints; return B_SUCCESS; } static enum b_status string_insert_wstr_utf8( struct b_string_p *dest, const b_wchar *src, size_t nr_codepoints, size_t codepoint_offset) { if (codepoint_offset >= dest->s_codepoints) { codepoint_offset = dest->s_codepoints; } size_t utf8_encoded_size = get_utf8_encoded_size(src, nr_codepoints); if (utf8_encoded_size == 0) { return B_ERR_INVALID_ARGUMENT; } size_t new_total_bytes = dest->s_len + utf8_encoded_size; if (dest->s_max < new_total_bytes) { string_change_capacity(dest, new_total_bytes); } size_t move_offset = 0; enum b_status status = B_SUCCESS; if (codepoint_offset == dest->s_codepoints) { move_offset = dest->s_len; } else { status = convert_codepoint_range_to_byte_range( dest, 0, codepoint_offset, NULL, &move_offset); } if (!B_OK(status)) { return status; } char *dest_buf = string_ptr(dest); char *from = dest_buf + move_offset; char *to = dest_buf + move_offset + utf8_encoded_size; memmove(to, from, dest->s_len - move_offset); char *ptr = dest_buf + move_offset; for (size_t i = 0; i < nr_codepoints; i++) { char c[4]; size_t c_len = utf8_codepoint_encode(src[i], c); if (c_len == 0) { /* the input string was already checked by * get_utf8_encoded_size, so this should never happen */ return B_ERR_INVALID_ARGUMENT; } memcpy(ptr, c, c_len); ptr += c_len; } dest_buf[new_total_bytes] = '\0'; dest->s_len += utf8_encoded_size; dest->s_codepoints += nr_codepoints; return B_SUCCESS; } static enum b_status string_insert_cstr( struct b_string_p *dest, const char *src, size_t nr_bytes, size_t at) { if (dest->s_len == dest->s_codepoints) { return string_insert_cstr_ansi(dest, src, nr_bytes, at); } return string_insert_cstr_utf8(dest, src, nr_bytes, at); } static enum b_status string_insert_wstr( struct b_string_p *dest, const b_wchar *src, size_t nr_codepoints, size_t at) { if (dest->s_len == dest->s_codepoints) { return string_insert_wstr_ansi(dest, src, nr_codepoints, at); } return string_insert_wstr_utf8(dest, src, nr_codepoints, at); } static enum b_status string_insertf( struct b_string_p *dest, size_t at, const char *format, va_list arg) { char buf[1024]; size_t len = vsnprintf(buf, sizeof buf, format, arg); return string_insert_cstr(dest, buf, len, at); } static enum b_status string_insert_c(struct b_string_p *dest, char c, size_t at) { return string_insert_cstr(dest, &c, 1, at); } static enum b_status string_insert_wc(struct b_string_p *dest, b_wchar c, size_t at) { return string_insert_wstr(dest, &c, 1, at); } static enum b_status string_insert_s( struct b_string_p *dest, const struct b_string_p *src, size_t at) { return string_insert_cstr(dest, string_ptr(src), src->s_len, at); } static void string_clear(struct b_string_p *str) { if (str->s_len == 0) { return; } char *s = string_ptr(str); *s = '\0'; str->s_len = 0; str->s_codepoints = 0; } static bool has_prefix(const char *s, const char *prefix, size_t *prefix_len) { size_t len = 0; for (size_t i = 0;; i++) { if (s[i] == 0 || prefix[i] == 0) { break; } if (s[i] != prefix[i]) { return false; } len++; } *prefix_len = len; return true; } static bool has_prefixes( const char *s, const char **prefixes, size_t nr_prefixes, size_t *selected_prefix_len) { for (size_t i = 0; i < nr_prefixes; i++) { const char *delim = prefixes[i]; if (has_prefix(s, delim, selected_prefix_len)) { return true; } } return false; } static enum b_status find_next_token(struct b_string_iterator *it) { size_t offset = it->_ds; size_t prefix_len = 0; char *start = string_ptr(it->_s_p); bool found_delim_last_time = (it->_f & STRING_TOK_F_FOUND_DELIM) != 0; bool found_delim = false; bool include_empty = (it->_f & B_STRING_TOK_F_INCLUDE_EMPTY_TOKENS); bool found_null = false; b_string_clear(it->_tmp); while (1) { char *s = start + offset; if (*s == 0) { it->_f &= ~STRING_TOK_F_FOUND_DELIM; break; } found_delim = has_prefixes(s, it->_d, it->_nd, &prefix_len); if (found_delim) { if (it->_tmp_p->s_len == 0 && !include_empty) { /* this token is empty, skip it */ offset += prefix_len; found_delim = false; continue; } it->_f |= STRING_TOK_F_FOUND_DELIM; break; } b_wchar c = utf8_codepoint_decode(s); if (c == B_WCHAR_INVALID) { return B_ERR_BAD_STATE; } b_string_append_wc(it->_tmp, c); offset += utf8_codepoint_size(c); if (offset > it->_s_p->s_len) { break; } } bool end = !found_delim && it->_tmp_p->s_len == 0; if (include_empty && found_delim_last_time) { end = false; } if (end) { it->string_value = NULL; it->string_length = 0; it->string_codepoints = 0; return B_ERR_NO_DATA; } it->_ds = offset + prefix_len; it->string_value = b_string_ptr(it->_tmp); it->string_length = it->_tmp_p->s_len; it->string_codepoints = it->_tmp_p->s_codepoints; return B_SUCCESS; } static enum b_status string_tokenise( struct b_string_p *str, const char *delims[], size_t nr_delims, b_string_tokenise_flags flags, struct b_string_iterator *it) { memset(it, 0x0, sizeof *it); if (!nr_delims) { return B_ERR_INVALID_ARGUMENT; } b_string *tmp = b_string_create(); if (!tmp) { return B_ERR_NO_MEMORY; } it->_base.it_ops = &it_ops; it->_m = ITERATOR_MODE_TOKENS; it->_d = delims; it->_nd = nr_delims; it->_s_p = str; it->_f = flags; it->_tmp = tmp; it->_tmp_p = b_object_get_private(tmp, B_TYPE_STRING); enum b_status status = find_next_token(it); if (!B_OK(status)) { b_string_unref(tmp); it->_tmp = NULL; it->_tmp_p = NULL; } return status; } static size_t string_get_size(const struct b_string_p *str, b_strlen_flags flags) { switch (flags) { case B_STRLEN_NORMAL: return str->s_len; case B_STRLEN_CODEPOINTS: return str->s_codepoints; default: return b_strlen(string_ptr(str), flags); } } static size_t string_get_capacity(const struct b_string_p *str) { return str->s_max; } static bool string_compare(const struct b_string_p *a, const struct b_string_p *b) { if (a->s_len != b->s_len) { return false; } if (a == b) { return true; } const char *ap = string_ptr(a); const char *bp = string_ptr(b); for (size_t i = 0; i < a->s_len; i++) { if (ap[i] != bp[i]) { return false; } } return true; } static char string_front(const struct b_string_p *str) { if (str->s_len == 0) { return 0; } const char *s = string_ptr(str); return s[0]; } static char string_back(const struct b_string_p *str) { if (str->s_len == 0) { return 0; } const char *s = string_ptr(str); return s[str->s_len - 1]; } static void string_pop_back(struct b_string_p *str) { if (str->s_len == 0) { return; } char *s = string_ptr(str); s[str->s_len - 1] = '\0'; str->s_len--; } static b_string *string_substr(const struct b_string_p *str, size_t start, size_t len) { if (start > string_get_size(str, B_STRLEN_NORMAL)) { return NULL; } if (start + len > string_get_size(str, B_STRLEN_NORMAL)) { len = string_get_size(str, B_STRLEN_NORMAL) - start; } b_string *newstr = b_string_create(); struct b_string_p *newstr_p = b_object_get_private(newstr, B_TYPE_STRING); string_reserve(newstr_p, len); const char *src = string_ptr(str) + start; char *dest = string_ptr(newstr_p); memcpy(dest, src, len); newstr_p->s_len = len; return newstr; } static uint64_t string_hash(const struct b_string_p *str) { #define FNV1_OFFSET_BASIS 0xcbf29ce484222325 #define FNV1_PRIME 0x100000001b3 uint64_t hash = FNV1_OFFSET_BASIS; size_t i = 0; const char *s = string_ptr(str); for (i = 0; i < str->s_len; i++) { hash ^= s[i]; hash *= FNV1_PRIME; } return hash; } /*** STREAM FUNCTIONS *********************************************************/ static enum b_status stream_close(struct b_stream *stream) { b_string *str = stream->s_ptr0; b_string_unref(str); return B_SUCCESS; } static enum b_status stream_getc(struct b_stream *stream, int *out) { struct b_string_p *str = stream->s_ptr1; if (stream->s_cursor >= str->s_len) { return B_ERR_NO_DATA; } char *s = string_ptr(str); *out = s[stream->s_cursor]; stream->s_cursor++; return B_SUCCESS; } static enum b_status stream_read( struct b_stream *stream, unsigned char *buf, size_t count, size_t *nr_read) { struct b_string_p *str = stream->s_ptr1; if (stream->s_cursor >= str->s_len) { *nr_read = 0; return B_SUCCESS; } size_t available = str->s_len - stream->s_cursor; size_t to_read = b_min(size_t, count, available); char *s = string_ptr(str) + stream->s_cursor; memcpy(buf, s, to_read); *nr_read = to_read; return B_SUCCESS; } static enum b_status stream_write( struct b_stream *stream, const unsigned char *buf, size_t count, size_t *nr_written) { struct b_string_p *str = stream->s_ptr1; enum b_status status = B_SUCCESS; if (stream->s_cursor + count > str->s_max) { status = string_reserve(str, stream->s_cursor + count); } if (!B_OK(status)) { return status; } string_insert_cstr(str, (const char *)buf, count, stream->s_cursor); stream->s_cursor += count; *nr_written = count; return B_SUCCESS; } static enum b_status stream_seek( struct b_stream *stream, long long offset, b_stream_seek_origin origin) { struct b_string_p *str = stream->s_ptr1; size_t abs_offset; switch (origin) { case B_STREAM_SEEK_START: abs_offset = offset; break; case B_STREAM_SEEK_CURRENT: abs_offset = stream->s_cursor + offset; break; case B_STREAM_SEEK_END: abs_offset = str->s_len + offset; break; default: return B_ERR_INVALID_ARGUMENT; } stream->s_cursor = abs_offset; return B_SUCCESS; } static enum b_status stream_reserve(struct b_stream *stream, size_t len) { struct b_string_p *str = stream->s_ptr1; size_t new_capacity = str->s_len + len; return string_reserve(str, new_capacity); } /*** PUBLIC FUNCTIONS *********************************************************/ b_string *b_string_create_from_cstr(const char *s) { b_string *str = b_string_create(); if (!str) { return NULL; } if (!s) { return str; } struct b_string_p *p = b_object_get_private(str, B_TYPE_STRING); size_t s_len = strlen(s); size_t s_codepoints = get_number_of_codepoints(s, s_len); b_string_reserve(str, s_len); char *dest = string_ptr(p); memcpy(dest, s, s_len); dest[s_len] = 0; p->s_len = s_len; p->s_codepoints = s_codepoints; return str; } b_string *b_string_create_from_c(char c, size_t count) { b_string *str = b_string_create(); if (!str) { return NULL; } struct b_string_p *p = b_object_get_private(str, B_TYPE_STRING); string_change_capacity(p, count); char *s = string_ptr(p); for (size_t i = 0; i < count; i++) { s[i] = c; } p->s_len = count; p->s_codepoints = count; return str; } b_string *b_string_duplicate(const b_string *str) { B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_duplicate, str); } char *b_string_steal(b_string *str) { B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_steal, str); } b_status b_string_reserve(b_string *str, size_t capacity) { B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_reserve, str, capacity); } b_status b_string_replace( b_string *str, size_t start, size_t length, const char *new_data) { B_CLASS_DISPATCH_STATIC( B_TYPE_STRING, string_replace, str, start, length, new_data); } b_status b_string_replace_all(b_string *str, const char *new_data) { B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_replace_all, str, new_data); } enum b_status b_string_remove(b_string *str, size_t start, size_t length) { B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_remove, str, start, length); } b_status b_string_transform(b_string *str, int (*transformer)(int)) { B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_transform, str, transformer); } b_status b_string_trim(b_string *str) { B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_trim, str); } enum b_status b_string_insert_c(b_string *dest, char c, size_t at) { B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_insert_c, dest, c, at); } enum b_status b_string_insert_wc(b_string *dest, b_wchar c, size_t at) { B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_insert_wc, dest, c, at); } enum b_status b_string_insert_s(b_string *dest, const b_string *src, size_t at) { struct b_string_p *dest_p = b_object_get_private(dest, B_TYPE_STRING); const struct b_string_p *src_p = b_object_get_private(src, B_TYPE_STRING); return string_insert_s(dest_p, src_p, at); } enum b_status b_string_insert_cstr(b_string *dest, const char *src, size_t at) { struct b_string_p *dest_p = b_object_get_private(dest, B_TYPE_STRING); return string_insert_cstr(dest_p, src, strlen(src), at); } enum b_status b_string_insert_wstr(b_string *dest, const b_wchar *src, size_t at) { struct b_string_p *dest_p = b_object_get_private(dest, B_TYPE_STRING); return string_insert_wstr(dest_p, src, b_wstrlen(src), at); } enum b_status b_string_insert_cstrf( b_string *dest, size_t at, const char *format, ...) { struct b_string_p *dest_p = b_object_get_private(dest, B_TYPE_STRING); va_list arg; va_start(arg, format); enum b_status status = string_insertf(dest_p, at, format, arg); va_end(arg); return status; } enum b_status b_string_insert_cstrn( b_string *dest, const char *src, size_t len, size_t at) { B_CLASS_DISPATCH_STATIC( B_TYPE_STRING, string_insert_cstr, dest, src, len, at); } enum b_status b_string_append_cstrf(b_string *dest, const char *format, ...) { struct b_string_p *dest_p = b_object_get_private(dest, B_TYPE_STRING); va_list arg; va_start(arg, format); enum b_status status = string_insertf(dest_p, SIZE_MAX, format, arg); va_end(arg); return status; } enum b_status b_string_prepend_cstrf(b_string *dest, const char *format, ...) { struct b_string_p *dest_p = b_object_get_private(dest, B_TYPE_STRING); va_list arg; va_start(arg, format); enum b_status status = string_insertf(dest_p, 0, format, arg); va_end(arg); return status; } void b_string_clear(b_string *str) { B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_clear, str); } enum b_status b_string_tokenise( b_string *str, const char *delims[], size_t nr_delims, b_string_tokenise_flags flags, struct b_string_iterator *it) { B_CLASS_DISPATCH_STATIC( B_TYPE_STRING, string_tokenise, str, delims, nr_delims, flags, it); } size_t b_string_get_size(const b_string *str, b_strlen_flags flags) { B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_get_size, str, flags); } size_t b_string_get_capacity(const b_string *str) { B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_get_capacity, str); } bool b_string_compare(const b_string *a, const b_string *b) { struct b_string_p *ap = b_object_get_private(a, B_TYPE_STRING); struct b_string_p *bp = b_object_get_private(a, B_TYPE_STRING); return string_compare(ap, bp); } char b_string_front(const b_string *str) { B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_front, str); } char b_string_back(const b_string *str) { B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_back, str); } void b_string_pop_back(b_string *str) { B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_pop_back, str); } const char *b_string_ptr(const b_string *str) { B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_ptr, str); } b_string *b_string_substr(const b_string *str, size_t start, size_t len) { B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_substr, str, start, len); } uint64_t b_string_hash(const b_string *str) { B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_hash, str); } enum b_status b_string_open_stream(b_string *str, struct b_stream **out) { struct b_stream *stream = malloc(sizeof *stream); if (!stream) { return B_ERR_NO_MEMORY; } memset(stream, 0x0, sizeof *stream); stream->s_mode |= B_STREAM_READ | B_STREAM_WRITE; stream->s_ptr0 = b_string_ref(str); stream->s_ptr1 = b_object_get_private(str, B_TYPE_STRING); stream->s_close = stream_close; stream->s_getc = stream_getc; stream->s_read = stream_read; stream->s_write = stream_write; stream->s_seek = stream_seek; stream->s_reserve = stream_reserve; *out = stream; return B_SUCCESS; } /*** PUBLIC ALIAS FUNCTIONS ***************************************************/ enum b_status b_string_append_c(b_string *dest, char c) { return b_string_insert_c(dest, c, SIZE_MAX); } enum b_status b_string_append_wc(b_string *dest, b_wchar c) { return b_string_insert_wc(dest, c, SIZE_MAX); } enum b_status b_string_append_s(b_string *dest, const b_string *src) { return b_string_insert_s(dest, src, SIZE_MAX); } enum b_status b_string_append_cstr(b_string *dest, const char *src) { return b_string_insert_cstr(dest, src, SIZE_MAX); } enum b_status b_string_append_wstr(b_string *dest, const b_wchar *src) { return b_string_insert_wstr(dest, src, SIZE_MAX); } enum b_status b_string_prepend_c(b_string *dest, char c) { return b_string_insert_c(dest, c, 0); } enum b_status b_string_prepend_wc(b_string *dest, b_wchar c) { return b_string_insert_wc(dest, c, 0); } enum b_status b_string_prepend_s(b_string *dest, const b_string *src) { return b_string_insert_s(dest, src, 0); } enum b_status b_string_prepend_cstr(b_string *dest, const char *src) { return b_string_insert_cstr(dest, src, 0); } enum b_status b_string_prepend_wstr(b_string *dest, const b_wchar *src) { return b_string_insert_wstr(dest, src, 0); } /*** VIRTUAL FUNCTIONS ********************************************************/ static void string_init(b_object *obj, void *priv) { struct b_string_p *str = priv; str->s_len = 0; str->s_codepoints = 0; str->s_max = STRING_INLINE_CAPACITY; } static void string_fini(b_object *obj, void *priv) { struct b_string_p *str = priv; if (!string_is_inline(str)) { free(string_ptr(str)); } } static void string_to_string(const b_object *obj, struct b_stream *out) { struct b_string_p *str = b_object_get_private(obj, B_TYPE_STRING); const char *s = string_ptr(str); for (size_t i = 0; i < str->s_len; i++) { b_stream_write_char(out, s[i]); } } /*** CLASS DEFINITION *********************************************************/ B_TYPE_CLASS_DEFINITION_BEGIN(b_string) B_TYPE_CLASS_INTERFACE_BEGIN(b_object, B_TYPE_OBJECT) B_INTERFACE_ENTRY(to_string) = string_to_string; B_TYPE_CLASS_INTERFACE_END(b_object, B_TYPE_OBJECT) B_TYPE_CLASS_DEFINITION_END(b_string) B_TYPE_DEFINITION_BEGIN(b_string) B_TYPE_ID(0x200194f6, 0x0327, 0x4a82, 0xb9c9, 0xb62ddd038c33); B_TYPE_CLASS(b_string_class); B_TYPE_INSTANCE_PRIVATE(struct b_string_p); B_TYPE_INSTANCE_INIT(string_init); B_TYPE_INSTANCE_FINI(string_fini); B_TYPE_DEFINITION_END(b_string) /*** ITERATOR FUNCTIONS *******************************************************/ static bool string_iterator_next(struct b_iterator *it) { return b_string_iterator_next((struct b_string_iterator *)it); } static bool string_iterator_is_valid(const struct b_iterator *it) { return b_string_iterator_is_valid((struct b_string_iterator *)it); } static struct b_iterator_ops it_ops = { .it_next = string_iterator_next, .it_close = NULL, .it_is_valid = string_iterator_is_valid, }; static void iterator_cleanup(b_string_iterator *it) { if (it->_tmp) { b_string_unref(it->_tmp); } memset(it, 0x0, sizeof *it); } int b_string_iterator_begin(const b_string *string, b_string_iterator *it) { memset(it, 0x0, sizeof *it); struct b_string_p *p = b_object_get_private(string, B_TYPE_STRING); it->_base.it_ops = &it_ops; if (!p->s_len) { it->status = B_ERR_NO_DATA; return -1; } const char *s = string_ptr(it->_s_p); it->_m = ITERATOR_MODE_CHARS; it->_s_p = p; it->char_value = utf8_codepoint_decode(s); if (it->char_value == B_WCHAR_INVALID) { it->status = B_ERR_BAD_FORMAT; return -1; } return 0; } static bool chars_iterator_next(b_string_iterator *it) { if (!b_string_iterator_is_valid(it)) { return false; } size_t stride = utf8_codepoint_size(it->char_value); if (stride == 0) { iterator_cleanup(it); return false; } it->byte_index += stride; it->codepoint_index += 1; if (it->byte_index >= it->_s_p->s_len) { iterator_cleanup(it); it->_s_p = NULL; it->byte_index = 0; it->codepoint_index = 0; it->char_value = B_WCHAR_INVALID; it->status = B_ERR_NO_DATA; return false; } char *p = string_ptr(it->_s_p) + it->byte_index; it->char_value = utf8_codepoint_decode(p); if (it->char_value == B_WCHAR_INVALID) { iterator_cleanup(it); it->_s_p = NULL; it->byte_index = 0; it->codepoint_index = 0; it->char_value = B_WCHAR_INVALID; it->status = B_ERR_BAD_FORMAT; return false; } it->iteration_index++; return true; } static bool tokens_iterator_next(b_string_iterator *it) { if (!b_string_iterator_is_valid(it)) { return false; } enum b_status status = find_next_token(it); if (!B_OK(status)) { iterator_cleanup(it); return false; } it->string_value = string_ptr(it->_tmp_p); it->iteration_index++; return true; } bool b_string_iterator_next(b_string_iterator *it) { switch (it->_m) { case ITERATOR_MODE_CHARS: return chars_iterator_next(it); case ITERATOR_MODE_TOKENS: return tokens_iterator_next(it); default: return false; } } static bool chars_iterator_is_valid(const struct b_string_iterator *it) { if (!it->_s_p) { return false; } if (it->byte_index >= it->_s_p->s_len) { return false; } if (it->char_value == B_WCHAR_INVALID) { return false; } return true; } static bool tokens_iterator_is_valid(const struct b_string_iterator *it) { if (!it->_s_p) { return false; } if (it->byte_index >= it->_s_p->s_len) { return false; } if (!it->string_value) { return false; } return true; } bool b_string_iterator_is_valid(const struct b_string_iterator *it) { switch (it->_m) { case ITERATOR_MODE_CHARS: return chars_iterator_is_valid(it); case ITERATOR_MODE_TOKENS: return tokens_iterator_is_valid(it); default: return false; } } /*** MISC FUNCTIONS ***********************************************************/ char *b_strdup(const char *s) { size_t len = strlen(s); char *p = malloc(len + 1); if (!p) { return NULL; } memcpy(p, s, len); p[len] = '\0'; return p; } size_t b_strlen(const char *s, b_strlen_flags flags) { if (!(flags & (B_STRLEN_IGNORE_ESC | B_STRLEN_IGNORE_MOD))) { return strlen(s); } size_t out = 0; for (size_t i = 0; s[i]; i++) { if (s[i] == '\033' && (flags & B_STRLEN_IGNORE_ESC)) { while (!isalpha(s[i]) && s[i]) { i++; } continue; } if (s[i] == '[' && (flags & B_STRLEN_IGNORE_MOD)) { i++; if (s[i] == '[') { out++; continue; } while (s[i] != ']' && s[i]) { i++; } continue; } out++; } return out; } b_wchar *b_wstrdup(const b_wchar *s) { size_t len = b_wstrlen(s); b_wchar *buf = calloc(len + 1, sizeof(b_wchar)); if (!buf) { return NULL; } memcpy(buf, s, len * sizeof(b_wchar)); return buf; } size_t b_wstrlen(const b_wchar *s) { size_t len; for (len = 0; s[len] != 0; len++) ; return len; }