/* bluelib/ds/string.c */

#include <blue/core/stream.h>
#include <blue/core/stringstream.h>
#include <blue/ds/string.h>
#include <ctype.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* maximum length of string that can be stored inline, not including
* null-terminator */
#define STRING_INLINE_CAPACITY 15
/* evaluates to non-zero when x lies in 0x0000..0xD7FF or 0xE000..0x10FFFF,
 * i.e. any value up to 0x10FFFF outside the surrogate range 0xD800..0xDFFF.
 * note that x is evaluated more than once. */
#define IS_VALID_UTF8_SCALAR(x) \
(((x) >= 0x0000 && (x) <= 0xD7FF) || ((x) >= 0xE000 && (x) <= 0x10FFFF))
/* private tokeniser flag kept in the iterator's _f field: set when the most
 * recent token search stopped at a delimiter, cleared when it reached the end
 * of the string. */
#define STRING_TOK_F_FOUND_DELIM 0x80
/*** PRIVATE DATA *************************************************************/
/* what a string iterator is walking over; stored in the iterator's _m field */
enum iterator_mode {
/* iterator has not been set up */
ITERATOR_MODE_NONE = 0,
/* iterating over the codepoints of a string (see iterator_begin) */
ITERATOR_MODE_CHARS,
/* iterating over delimiter-separated tokens (see string_tokenise) */
ITERATOR_MODE_TOKENS,
};
/* private state of a b_string object */
struct b_string_p {
	/* length of string in bytes, not including null-terminator.
	 * a multi-byte utf-8 codepoint will be counted as multiple bytes here */
	unsigned int s_len;
	/* length of string in codepoints, not including null-terminator.
	 * a multi-byte utf-8 codepoint will be counted as one codepoint here */
	unsigned int s_codepoints;
	/* maximum length of string storable in the currently-allocated buffer
	 * in bytes, not including null terminator */
	unsigned int s_max;
	/* string storage. which member is active is decided by s_max:
	 * d_inline while s_max == STRING_INLINE_CAPACITY, d_external
	 * (a heap buffer of s_max + 1 bytes) otherwise. */
	union {
		char d_inline[STRING_INLINE_CAPACITY + 1];
		char *d_external;
	} s_data;
};
2025-10-29 14:35:36 +00:00
struct b_string_iterator_p {
int _m, _f;
b_string *_tmp;
struct b_string_p *_s_p, *_tmp_p;
const char **_d;
size_t _nd, _ds;
size_t iteration_index;
size_t byte_index;
size_t codepoint_index;
b_wchar char_value;
const char *string_value;
size_t string_length;
size_t string_codepoints;
};
/*** PRIVATE FUNCTIONS ********************************************************/
/* Tell whether `str` currently keeps its bytes in the inline buffer.
 *
 * Capacity is never allowed to drop below STRING_INLINE_CAPACITY, so a
 * capacity of exactly that value marks the inline representation. */
static bool string_is_inline(const struct b_string_p *str)
{
	const bool uses_inline_buffer = (str->s_max == STRING_INLINE_CAPACITY);
	return uses_inline_buffer;
}
/* Return a pointer to the first byte of the string's active buffer: the inline
 * array when the string is inline, the heap buffer otherwise. The const
 * qualifier of `str` is dropped from the returned pointer. */
static char *string_ptr(const struct b_string_p *str)
{
	return string_is_inline(str) ? (char *)str->s_data.d_inline
				     : str->s_data.d_external;
}
/* Translate a range expressed in codepoints into the matching byte range.
 *
 * str             string to measure
 * cp_start        index of the first codepoint of the range
 * cp_length       number of codepoints in the range
 * out_byte_start  optional; receives the byte offset of codepoint cp_start
 * out_byte_length optional; receives the byte size of the range
 *
 * Returns B_SUCCESS, B_ERR_OUT_OF_BOUNDS when the range runs past the end of
 * the string, or B_ERR_BAD_STATE when b_wchar_utf8_codepoint_stride() reports
 * a stride of zero for a byte in the string. */
static enum b_status convert_codepoint_range_to_byte_range(
	const struct b_string_p *str, size_t cp_start, size_t cp_length,
	size_t *out_byte_start, size_t *out_byte_length)
{
	const char *base = string_ptr(str);
	size_t start_bytes = 0;
	size_t span_bytes = 0;

	/* phase 1: walk over the codepoints that precede the range */
	size_t to_skip = cp_start;
	while (to_skip > 0) {
		const char *cursor = &base[start_bytes];
		if (!cursor || start_bytes >= str->s_len) {
			return B_ERR_OUT_OF_BOUNDS;
		}

		size_t width = b_wchar_utf8_codepoint_stride(cursor);
		if (width == 0) {
			return B_ERR_BAD_STATE;
		}

		start_bytes += width;
		to_skip--;
	}

	/* phase 2: measure the codepoints inside the range */
	size_t to_measure = cp_length;
	while (to_measure > 0) {
		size_t position = start_bytes + span_bytes;
		const char *cursor = &base[position];
		if (!cursor || position >= str->s_len) {
			return B_ERR_OUT_OF_BOUNDS;
		}

		size_t width = b_wchar_utf8_codepoint_stride(cursor);
		if (width == 0) {
			return B_ERR_BAD_STATE;
		}

		span_bytes += width;
		to_measure--;
	}

	if (out_byte_start != NULL) {
		*out_byte_start = start_bytes;
	}

	if (out_byte_length != NULL) {
		*out_byte_length = span_bytes;
	}

	return B_SUCCESS;
}
/* Return a pointer to the codepoint that follows the one starting at
 * `this_codepoint`, based solely on the lead byte found there.
 *
 * str            unused; kept for symmetry with get_previous_codepoint()
 * this_codepoint pointer to the first byte of a utf-8 sequence
 *
 * Returns this_codepoint advanced by 1..4 bytes, or NULL when the byte is not
 * a valid utf-8 lead byte (a continuation byte 10xxxxxx, or 0xF8..0xFF).
 * The continuation bytes themselves are not inspected. */
static char *get_next_codepoint(struct b_string_p *str, char *this_codepoint)
{
	(void)str;

	/* work on an unsigned copy so the bit patterns can be compared exactly */
	const unsigned char lead = (unsigned char)*this_codepoint;
	size_t len = 0;

	if ((lead & 0x80) == 0x00) {
		/* 0xxxxxxx */
		len = 1;
	} else if ((lead & 0xE0) == 0xC0) {
		/* 110xxxxx */
		len = 2;
	} else if ((lead & 0xF0) == 0xE0) {
		/* 1110xxxx */
		len = 3;
	} else if ((lead & 0xF8) == 0xF0) {
		/* 11110xxx */
		len = 4;
	} else {
		return NULL;
	}

	return this_codepoint + len;
}
/* Return a pointer to the start of the codepoint that precedes
 * `this_codepoint` inside `str`.
 *
 * The search steps backwards over utf-8 continuation bytes (10xxxxxx) and
 * stops at the first byte that is not one.
 *
 * Returns NULL when `this_codepoint` is at (or before) the start of the
 * buffer, when only continuation bytes precede it, or when the byte found is
 * 0xF8..0xFF, which can never begin a utf-8 sequence. */
static char *get_previous_codepoint(struct b_string_p *str, char *this_codepoint)
{
	char *start = string_ptr(str);
	char *p = this_codepoint;

	/* decrement inside the loop so p never moves below the buffer start */
	while (p > start) {
		p--;

		const unsigned char c = (unsigned char)*p;
		if ((c & 0xC0) == 0x80) {
			/* continuation byte, keep looking for the lead byte */
			continue;
		}

		if (c >= 0xF8) {
			/* not a legal lead byte */
			return NULL;
		}

		return p;
	}

	return NULL;
}
/* Locate the final codepoint of `str`.
 *
 * Returns NULL for an empty string; otherwise whatever
 * get_previous_codepoint() finds when searching backwards from the
 * one-past-the-end byte. */
static char *get_last_codepoint(struct b_string_p *str)
{
	if (!str->s_len) {
		return NULL;
	}

	char *one_past_end = string_ptr(str) + str->s_len;
	return get_previous_codepoint(str, one_past_end);
}
static int string_make_inline(struct b_string_p *str)
2024-10-24 19:24:54 +01:00
{
char *buffer = string_ptr(str);
memcpy(str->s_data.d_inline, buffer, sizeof str->s_data.d_inline);
str->s_data.d_inline[sizeof str->s_data.d_inline - 1] = '\0';
str->s_max = STRING_INLINE_CAPACITY;
if (str->s_len >= str->s_max) {
str->s_len = str->s_max;
}
free(buffer);
return 0;
}
static int string_resize_large(struct b_string_p *str, size_t capacity)
2024-10-24 19:24:54 +01:00
{
char *buffer = string_ptr(str);
char *new_buffer = realloc(buffer, capacity + 1);
if (!new_buffer) {
return -1;
}
str->s_max = capacity;
str->s_data.d_external = new_buffer;
2024-10-24 19:24:54 +01:00
return 0;
}
static int string_make_large(struct b_string_p *str, size_t capacity)
2024-10-24 19:24:54 +01:00
{
const char *old_buffer = string_ptr(str);
char *buffer = malloc(capacity + 1);
if (!buffer) {
return -1;
}
memcpy(buffer, old_buffer, sizeof str->s_data.d_inline);
buffer[str->s_len] = '\0';
str->s_max = capacity;
str->s_data.d_external = buffer;
2024-10-24 19:24:54 +01:00
return 0;
}
static int string_change_capacity(struct b_string_p *str, size_t capacity)
2024-10-24 19:24:54 +01:00
{
size_t old_capacity = str->s_max;
if (capacity < STRING_INLINE_CAPACITY) {
capacity = STRING_INLINE_CAPACITY;
}
bool was_inline = string_is_inline(str);
bool is_now_inline = capacity == STRING_INLINE_CAPACITY;
if (capacity == old_capacity) {
/* this also handles the case where the old and new capacity
* both fit into the inline buffer. */
2024-10-24 19:24:54 +01:00
return 0;
}
if (!was_inline && is_now_inline) {
/* string was large, is now small enough to fit inline. */
return string_make_inline(str);
}
if (!was_inline) {
/* string was large, and is still large. */
return string_resize_large(str, capacity);
}
if (!is_now_inline) {
/* string was inline, and now large enough to require a buffer. */
return string_make_large(str, capacity);
}
/* nothing to do */
return 0;
}
static b_string *string_duplicate(const struct b_string_p *str)
2024-10-24 19:24:54 +01:00
{
b_string *new_str = b_string_create();
2024-10-24 19:24:54 +01:00
if (!str) {
return NULL;
}
struct b_string_p *new_str_p
= b_object_get_private(new_str, B_TYPE_STRING);
2024-10-24 19:24:54 +01:00
string_change_capacity(new_str_p, str->s_len);
const char *src = string_ptr(str);
char *dst = string_ptr(new_str_p);
memcpy(dst, src, str->s_len);
new_str_p->s_len = str->s_len;
new_str_p->s_codepoints = str->s_codepoints;
return new_str;
}
static char *string_steal(struct b_string_p *str)
2024-10-24 19:24:54 +01:00
{
char *dest = NULL;
char *src = string_ptr(str);
if (string_is_inline(str)) {
dest = malloc(str->s_len + 1);
memcpy(dest, src, str->s_len);
dest[str->s_len] = 0;
src[0] = 0;
2024-10-24 19:24:54 +01:00
} else {
dest = src;
str->s_data.d_external = NULL;
str->s_max = STRING_INLINE_CAPACITY;
2024-10-24 19:24:54 +01:00
}
str->s_len = 0;
str->s_codepoints = 0;
2024-10-24 19:24:54 +01:00
return dest;
}
static b_status string_reserve(struct b_string_p *str, size_t capacity)
2024-10-24 19:24:54 +01:00
{
if (str->s_max >= capacity) {
return B_SUCCESS;
}
int err = string_change_capacity(str, capacity);
return err == 0 ? B_SUCCESS : B_ERR_NO_MEMORY;
}
/* Replace `length` bytes starting at byte `start` with `new_data`, for a
 * string whose byte length equals its codepoint count.
 *
 * `length` is clamped to the end of the string.
 *
 * Returns B_SUCCESS, B_ERR_INVALID_ARGUMENT when `new_data` is NULL or
 * `start` is not inside the string, or the failure status of
 * string_reserve(). */
static enum b_status replace_ansi(
	struct b_string_p *str, size_t start, size_t length, const char *new_data)
{
	if (!new_data || start >= str->s_len) {
		return B_ERR_INVALID_ARGUMENT;
	}

	/* compare against the remaining length instead of computing
	 * start + length, which can wrap for very large lengths */
	if (length > str->s_len - start) {
		length = str->s_len - start;
	}

	size_t new_data_len = strlen(new_data);

	/* the replacement may contain multi-byte codepoints. a count of zero
	 * is what replace_utf8() treats as "not valid utf-8"; in that case
	 * fall back to counting every byte as one codepoint. */
	size_t new_data_codepoints
		= b_wchar_utf8_codepoint_count(new_data, new_data_len);
	if (new_data_codepoints == 0) {
		new_data_codepoints = new_data_len;
	}

	size_t new_str_len = str->s_len - length + new_data_len;
	if (new_str_len > str->s_max) {
		b_status status = string_reserve(str, new_str_len);
		if (!B_OK(status)) {
			return status;
		}
	}

	char *s = string_ptr(str);
	char *substitution_start = s + start;
	char *excess_src = substitution_start + length;
	size_t excess_length = str->s_len - start - length;
	char *excess_dest = substitution_start + new_data_len;

	/* shift the tail first, then drop the replacement into the gap */
	memmove(excess_dest, excess_src, excess_length);
	memmove(substitution_start, new_data, new_data_len);
	s[new_str_len] = '\0';

	/* every removed byte was exactly one codepoint in this mode */
	str->s_codepoints -= length;
	str->s_codepoints += new_data_codepoints;
	str->s_len = new_str_len;
	return B_SUCCESS;
}
/* Replace `length` codepoints starting at codepoint `start` with `new_data`,
 * for a string that contains multi-byte codepoints.
 *
 * `length` is clamped to the end of the string. An empty `new_data` is
 * accepted and simply removes the range.
 *
 * Returns B_SUCCESS, B_ERR_INVALID_ARGUMENT when `new_data` is NULL, is
 * non-empty but has a codepoint count of zero, or `start` is outside the
 * string, or a failure status from convert_codepoint_range_to_byte_range()
 * or string_reserve(). */
static enum b_status replace_utf8(
	struct b_string_p *str, size_t start, size_t length, const char *new_data)
{
	if (!new_data || start >= str->s_codepoints) {
		return B_ERR_INVALID_ARGUMENT;
	}

	/* avoid start + length, which can wrap for very large lengths */
	if (length > str->s_codepoints - start) {
		length = str->s_codepoints - start;
	}

	size_t new_data_nr_bytes = strlen(new_data);
	size_t new_data_nr_codepoints
		= b_wchar_utf8_codepoint_count(new_data, new_data_nr_bytes);
	if (new_data_nr_bytes > 0 && new_data_nr_codepoints == 0) {
		/* new_data is not a valid utf-8 string */
		return B_ERR_INVALID_ARGUMENT;
	}

	size_t old_data_offset = 0, old_data_nr_bytes = 0;
	size_t old_data_nr_codepoints = length;
	enum b_status status = convert_codepoint_range_to_byte_range(
		str, start, length, &old_data_offset, &old_data_nr_bytes);
	if (!B_OK(status)) {
		return status;
	}

	size_t new_total_bytes = str->s_len - old_data_nr_bytes + new_data_nr_bytes;
	if (new_total_bytes > str->s_max) {
		status = string_reserve(str, new_total_bytes);
		if (!B_OK(status)) {
			return status;
		}
	}

	char *s = string_ptr(str);
	char *substitution_start = s + old_data_offset;
	char *excess_src = substitution_start + old_data_nr_bytes;
	size_t excess_length = str->s_len - old_data_offset - old_data_nr_bytes;
	char *excess_dest = substitution_start + new_data_nr_bytes;

	/* shift the tail first, then drop the replacement into the gap */
	memmove(excess_dest, excess_src, excess_length);
	memmove(substitution_start, new_data, new_data_nr_bytes);
	s[new_total_bytes] = '\0';

	str->s_len = new_total_bytes;
	str->s_codepoints -= old_data_nr_codepoints;
	str->s_codepoints += new_data_nr_codepoints;
	return B_SUCCESS;
}
/* Replace part of `str` with `new_data`, choosing the byte-indexed routine
 * when byte length and codepoint count agree and the codepoint-indexed
 * routine otherwise. Returns the status of the routine that ran. */
static b_status string_replace(
	struct b_string_p *str, size_t start, size_t length, const char *new_data)
{
	const bool one_byte_per_codepoint = (str->s_len == str->s_codepoints);

	return one_byte_per_codepoint
		? replace_ansi(str, start, length, new_data)
		: replace_utf8(str, start, length, new_data);
}
static b_status string_replace_all(struct b_string_p *str, const char *new_data)
{
size_t new_len = strlen(new_data);
string_reserve(str, new_len);
char *dest = string_ptr(str);
memcpy(dest, new_data, new_len);
dest[new_len] = '\0';
str->s_len = new_len;
return B_SUCCESS;
}
/* Replace the whole content of `str` with the bytes currently held by the
 * stringstream `new_data`.
 *
 * Returns B_SUCCESS, or the failure status of string_reserve() (the string
 * is unchanged then). */
static b_status string_replace_all_with_stringstream(
	struct b_string_p *str, const b_stringstream *new_data)
{
	size_t new_len = b_stringstream_get_length(new_data);

	/* do not write into the buffer unless it is known to be big enough */
	b_status status = string_reserve(str, new_len);
	if (!B_OK(status)) {
		return status;
	}

	char *dest = string_ptr(str);
	memcpy(dest, b_stringstream_ptr(new_data), new_len);
	dest[new_len] = '\0';

	str->s_len = new_len;
	str->s_codepoints = b_wchar_utf8_codepoint_count(dest, new_len);
	return B_SUCCESS;
}
/* Remove `length` bytes starting at byte `start`, for a string whose byte
 * length equals its codepoint count.
 *
 * `length` is clamped to the end of the string.
 *
 * Returns B_SUCCESS, or B_ERR_INVALID_ARGUMENT when `start` is not inside
 * the string. */
static enum b_status remove_ansi(struct b_string_p *str, size_t start, size_t length)
{
	if (start >= str->s_len) {
		return B_ERR_INVALID_ARGUMENT;
	}

	/* avoid start + length, which can wrap for very large lengths */
	if (length > str->s_len - start) {
		length = str->s_len - start;
	}

	size_t new_str_len = str->s_len - length;
	char *s = string_ptr(str);
	char *removal_start = s + start;
	char *excess_src = removal_start + length;
	size_t excess_length = str->s_len - start - length;

	memmove(removal_start, excess_src, excess_length);
	s[new_str_len] = '\0';

	str->s_len = new_str_len;
	/* one byte is one codepoint in this mode; keep both counters equal so
	 * the string is still recognised as single-byte afterwards */
	str->s_codepoints -= length;
	return B_SUCCESS;
}
/* Remove `length` codepoints starting at codepoint `start`, for a string
 * that contains multi-byte codepoints.
 *
 * `length` is clamped to the end of the string, as remove_ansi() does for
 * bytes. A `start` beyond the string is rejected by
 * convert_codepoint_range_to_byte_range().
 *
 * Returns B_SUCCESS or the failure status of
 * convert_codepoint_range_to_byte_range(). */
static enum b_status remove_utf8(struct b_string_p *str, size_t start, size_t length)
{
	if (start < str->s_codepoints && length > str->s_codepoints - start) {
		length = str->s_codepoints - start;
	}

	size_t remove_offset = 0, remove_nr_bytes = 0;
	enum b_status status = convert_codepoint_range_to_byte_range(
		str, start, length, &remove_offset, &remove_nr_bytes);
	if (!B_OK(status)) {
		return status;
	}

	size_t new_total_bytes = str->s_len - remove_nr_bytes;
	char *s = string_ptr(str);
	char *removal_start = s + remove_offset;
	char *excess_src = removal_start + remove_nr_bytes;
	size_t excess_length = str->s_len - remove_offset - remove_nr_bytes;

	memmove(removal_start, excess_src, excess_length);
	s[new_total_bytes] = '\0';

	str->s_len = new_total_bytes;
	str->s_codepoints -= length;
	return B_SUCCESS;
}
/* Remove part of `str`, choosing the byte-indexed routine when byte length
 * and codepoint count agree and the codepoint-indexed routine otherwise.
 * Returns the status of the routine that ran. */
static enum b_status string_remove(
	struct b_string_p *str, size_t start, size_t length)
{
	const bool one_byte_per_codepoint = (str->s_len == str->s_codepoints);

	return one_byte_per_codepoint ? remove_ansi(str, start, length)
				      : remove_utf8(str, start, length);
}
/* Apply `transformer` to every byte of `str` in place.
 *
 * Each byte is handed over as an unsigned char value (0..255), which is the
 * domain <ctype.h>-style functions such as toupper() are defined for; a
 * negative value other than EOF would be undefined behaviour there.
 * A result of 0 leaves the byte unchanged.
 *
 * Always returns B_SUCCESS. */
static b_status string_transform(struct b_string_p *str, int (*transformer)(int))
{
	char *s = string_ptr(str);

	for (size_t i = 0; i < str->s_len; i++) {
		int c = transformer((unsigned char)s[i]);
		if (c != 0) {
			s[i] = (char)c;
		}
	}

	return B_SUCCESS;
}
/* Strip leading and trailing whitespace (as classified by isspace()) from a
 * string whose byte length equals its codepoint count.
 *
 * The result is always null-terminated and both length counters are updated.
 *
 * Always returns B_SUCCESS. */
static enum b_status trim_ansi(struct b_string_p *str)
{
	char *s = string_ptr(str);

	/* isspace() requires an unsigned char value */
	size_t first = 0;
	while (first < str->s_len && isspace((unsigned char)s[first])) {
		first++;
	}

	size_t last = str->s_len;
	while (last > first && isspace((unsigned char)s[last - 1])) {
		last--;
	}

	size_t new_len = last - first;
	memmove(s, s + first, new_len);
	/* the shift leaves stale bytes behind the new end; terminate */
	s[new_len] = '\0';

	str->s_len = new_len;
	/* one byte is one codepoint in this mode */
	str->s_codepoints = new_len;
	return B_SUCCESS;
}
static enum b_status trim_utf8(struct b_string_p *str)
{
char *s = string_ptr(str);
size_t whitespace_end = 0;
size_t nr_whitespace_codepoints = 0;
for (size_t i = 0; i < str->s_len;) {
b_wchar c = b_wchar_utf8_codepoint_decode(&s[i]);
if (!b_wchar_is_space(s[i])) {
whitespace_end = i;
break;
}
nr_whitespace_codepoints++;
}
memmove(s, s + whitespace_end, str->s_len - whitespace_end);
str->s_len -= whitespace_end;
str->s_codepoints -= nr_whitespace_codepoints;
char *p = get_last_codepoint(str);
if (!p) {
return B_ERR_BAD_STATE;
}
for (long i = str->s_len - 1; i >= 0;) {
b_wchar c = b_wchar_utf8_codepoint_decode(p);
size_t c_size = b_wchar_utf8_codepoint_size(c);
if (b_wchar_is_space(c)) {
memset(p, 0, c_size);
str->s_len -= c_size;
str->s_codepoints--;
} else {
break;
}
p = get_previous_codepoint(str, p);
}
return B_SUCCESS;
}
/* Strip leading and trailing whitespace from `str`.
 *
 * An empty string is returned as-is; otherwise the byte-wise routine is used
 * when byte length and codepoint count agree, and the codepoint-wise routine
 * when they do not. Returns the status of the routine that ran. */
static b_status string_trim(struct b_string_p *str)
{
	if (!str->s_len) {
		return B_SUCCESS;
	}

	const bool one_byte_per_codepoint = (str->s_len == str->s_codepoints);
	return one_byte_per_codepoint ? trim_ansi(str) : trim_utf8(str);
}
static enum b_status string_insert_cstr_ansi(
struct b_string_p *dest, const char *src, size_t nr_bytes, size_t at)
{
if (at >= dest->s_len) {
at = dest->s_len;
}
size_t new_size = dest->s_len + nr_bytes;
if (dest->s_max < new_size) {
string_change_capacity(dest, new_size);
}
char *dest_buf = string_ptr(dest);
char *from = dest_buf + at;
char *to = dest_buf + at + nr_bytes;
memmove(to, from, dest->s_len - at);
memcpy(from, src, nr_bytes);
dest_buf[new_size] = '\0';
dest->s_len = new_size;
dest->s_codepoints += nr_bytes;
return B_SUCCESS;
}
static enum b_status string_insert_cstr_utf8(
struct b_string_p *dest, const char *src, size_t nr_bytes,
size_t codepoint_offset)
{
if (codepoint_offset >= dest->s_codepoints) {
codepoint_offset = dest->s_codepoints;
}
size_t byte_offset = 0;
enum b_status status = B_SUCCESS;
if (codepoint_offset == dest->s_codepoints) {
byte_offset = dest->s_len;
} else {
status = convert_codepoint_range_to_byte_range(
dest, 0, codepoint_offset, NULL, &byte_offset);
}
if (!B_OK(status)) {
return status;
}
size_t new_total_bytes = dest->s_len + nr_bytes;
if (dest->s_max < new_total_bytes) {
string_change_capacity(dest, new_total_bytes);
}
char *dest_buf = string_ptr(dest);
char *from = dest_buf + byte_offset;
char *to = dest_buf + byte_offset + nr_bytes;
memmove(to, from, dest->s_len - byte_offset);
memcpy(from, src, nr_bytes);
dest_buf[new_total_bytes] = '\0';
dest->s_len += nr_bytes;
dest->s_codepoints += b_wchar_utf8_codepoint_count(src, nr_bytes);
return B_SUCCESS;
}
static enum b_status string_insert_wstr_ansi(
struct b_string_p *dest, const b_wchar *src, size_t nr_codepoints, size_t at)
{
if (at >= dest->s_len) {
at = dest->s_len;
}
size_t utf8_encoded_size
= b_wchar_utf8_string_encoded_size(src, nr_codepoints);
if (utf8_encoded_size == 0) {
return B_ERR_INVALID_ARGUMENT;
}
size_t new_total_bytes = dest->s_len + utf8_encoded_size;
if (dest->s_max < new_total_bytes) {
string_change_capacity(dest, new_total_bytes);
}
char *dest_buf = string_ptr(dest);
char *from = dest_buf + at;
char *to = dest_buf + at + utf8_encoded_size;
memmove(to, from, dest->s_len - at);
char *ptr = dest_buf + at;
for (size_t i = 0; i < nr_codepoints; i++) {
char c[4];
size_t c_len = b_wchar_utf8_codepoint_encode(src[i], c);
if (c_len == 0) {
/* the input string was already checked by
* get_utf8_encoded_size, so this should never happen */
return B_ERR_INVALID_ARGUMENT;
}
memcpy(ptr, c, c_len);
ptr += c_len;
}
dest_buf[new_total_bytes] = '\0';
dest->s_len += utf8_encoded_size;
dest->s_codepoints += nr_codepoints;
return B_SUCCESS;
}
static enum b_status string_insert_wstr_utf8(
struct b_string_p *dest, const b_wchar *src, size_t nr_codepoints,
size_t codepoint_offset)
{
if (codepoint_offset >= dest->s_codepoints) {
codepoint_offset = dest->s_codepoints;
}
size_t utf8_encoded_size
= b_wchar_utf8_string_encoded_size(src, nr_codepoints);
if (utf8_encoded_size == 0) {
return B_ERR_INVALID_ARGUMENT;
}
size_t new_total_bytes = dest->s_len + utf8_encoded_size;
if (dest->s_max < new_total_bytes) {
string_change_capacity(dest, new_total_bytes);
}
size_t move_offset = 0;
enum b_status status = B_SUCCESS;
if (codepoint_offset == dest->s_codepoints) {
move_offset = dest->s_len;
} else {
status = convert_codepoint_range_to_byte_range(
dest, 0, codepoint_offset, NULL, &move_offset);
}
if (!B_OK(status)) {
return status;
}
char *dest_buf = string_ptr(dest);
char *from = dest_buf + move_offset;
char *to = dest_buf + move_offset + utf8_encoded_size;
memmove(to, from, dest->s_len - move_offset);
char *ptr = dest_buf + move_offset;
for (size_t i = 0; i < nr_codepoints; i++) {
char c[4];
size_t c_len = b_wchar_utf8_codepoint_encode(src[i], c);
if (c_len == 0) {
/* the input string was already checked by
* get_utf8_encoded_size, so this should never happen */
return B_ERR_INVALID_ARGUMENT;
}
memcpy(ptr, c, c_len);
ptr += c_len;
}
dest_buf[new_total_bytes] = '\0';
dest->s_len += utf8_encoded_size;
dest->s_codepoints += nr_codepoints;
return B_SUCCESS;
}
static enum b_status string_insert_cstr(
struct b_string_p *dest, const char *src, size_t nr_bytes, size_t at)
2024-10-24 19:24:54 +01:00
{
if (dest->s_len == dest->s_codepoints) {
return string_insert_cstr_ansi(dest, src, nr_bytes, at);
2024-10-24 19:24:54 +01:00
}
return string_insert_cstr_utf8(dest, src, nr_bytes, at);
}
2024-10-24 19:24:54 +01:00
static enum b_status string_insert_wstr(
struct b_string_p *dest, const b_wchar *src, size_t nr_codepoints, size_t at)
{
if (dest->s_len == dest->s_codepoints) {
return string_insert_wstr_ansi(dest, src, nr_codepoints, at);
}
2024-10-24 19:24:54 +01:00
return string_insert_wstr_utf8(dest, src, nr_codepoints, at);
2024-10-24 19:24:54 +01:00
}
static enum b_status string_insertf(
struct b_string_p *dest, size_t at, const char *format, va_list arg)
2024-10-24 19:24:54 +01:00
{
char buf[1024];
size_t len = vsnprintf(buf, sizeof buf, format, arg);
return string_insert_cstr(dest, buf, len, at);
}
/* Insert the single byte `c` into `dest` at position `at`; see
 * string_insert_cstr() for how `at` is interpreted and what is returned. */
static enum b_status string_insert_c(struct b_string_p *dest, char c, size_t at)
{
	return string_insert_cstr(dest, &c, 1, at);
}
/* Insert the single wide character `c` into `dest` at position `at`; see
 * string_insert_wstr() for how `at` is interpreted and what is returned. */
static enum b_status string_insert_wc(struct b_string_p *dest, b_wchar c, size_t at)
{
	return string_insert_wstr(dest, &c, 1, at);
}
static enum b_status string_insert_s(
struct b_string_p *dest, const struct b_string_p *src, size_t at)
2024-10-24 19:24:54 +01:00
{
return string_insert_cstr(dest, string_ptr(src), src->s_len, at);
2024-10-24 19:24:54 +01:00
}
static void string_clear(struct b_string_p *str)
2024-10-24 19:24:54 +01:00
{
if (str->s_len == 0) {
return;
}
char *s = string_ptr(str);
*s = '\0';
str->s_len = 0;
str->s_codepoints = 0;
}
/* Test whether the C string `s` begins with the whole of `prefix`.
 *
 * s          string to examine
 * prefix     prefix to look for
 * prefix_len receives the length of `prefix` in bytes when it matches; it
 *            is left untouched otherwise
 *
 * Returns true only for a complete, non-empty match. A string that ends
 * before the prefix does is not a match, and an empty prefix never matches
 * (a zero-length delimiter would make the tokeniser loop without
 * advancing). */
static bool has_prefix(const char *s, const char *prefix, size_t *prefix_len)
{
	size_t len = 0;

	while (prefix[len] != '\0') {
		/* this also fails when s ends first, since then s[len] is the
		 * terminator and prefix[len] is not */
		if (s[len] != prefix[len]) {
			return false;
		}

		len++;
	}

	if (len == 0) {
		return false;
	}

	*prefix_len = len;
	return true;
}
/* Test `s` against each of the `nr_prefixes` entries of `prefixes`, in order.
 *
 * Returns true as soon as has_prefix() accepts one of them (which then also
 * stores through selected_prefix_len), false when none is accepted. */
static bool has_prefixes(
	const char *s, const char **prefixes, size_t nr_prefixes,
	size_t *selected_prefix_len)
{
	size_t idx = 0;

	while (idx < nr_prefixes) {
		const char *candidate = prefixes[idx];
		if (has_prefix(s, candidate, selected_prefix_len)) {
			return true;
		}

		idx++;
	}

	return false;
}
2025-10-29 14:35:36 +00:00
static enum b_status find_next_token(struct b_string_iterator_p *it)
{
size_t offset = it->_ds;
size_t prefix_len = 0;
char *start = string_ptr(it->_s_p);
bool found_delim_last_time = (it->_f & STRING_TOK_F_FOUND_DELIM) != 0;
bool found_delim = false;
bool include_empty = (it->_f & B_STRING_TOK_F_INCLUDE_EMPTY_TOKENS);
bool found_null = false;
b_string_clear(it->_tmp);
while (1) {
char *s = start + offset;
if (*s == 0) {
it->_f &= ~STRING_TOK_F_FOUND_DELIM;
break;
}
found_delim = has_prefixes(s, it->_d, it->_nd, &prefix_len);
if (found_delim) {
if (it->_tmp_p->s_len == 0 && !include_empty) {
/* this token is empty, skip it */
offset += prefix_len;
found_delim = false;
continue;
}
it->_f |= STRING_TOK_F_FOUND_DELIM;
break;
}
b_wchar c = b_wchar_utf8_codepoint_decode(s);
if (c == B_WCHAR_INVALID) {
return B_ERR_BAD_STATE;
}
b_string_append_wc(it->_tmp, c);
offset += b_wchar_utf8_codepoint_size(c);
if (offset > it->_s_p->s_len) {
break;
}
}
bool end = !found_delim && it->_tmp_p->s_len == 0;
if (include_empty && found_delim_last_time) {
end = false;
}
if (end) {
it->string_value = NULL;
it->string_length = 0;
it->string_codepoints = 0;
return B_ERR_NO_DATA;
}
it->_ds = offset + prefix_len;
it->string_value = b_string_ptr(it->_tmp);
it->string_length = it->_tmp_p->s_len;
it->string_codepoints = it->_tmp_p->s_codepoints;
return B_SUCCESS;
}
2025-10-29 14:35:36 +00:00
static b_iterator *string_tokenise(
struct b_string_p *str, const char *delims[], size_t nr_delims,
2025-10-29 14:35:36 +00:00
b_string_tokenise_flags flags)
{
if (!nr_delims) {
2025-10-29 14:35:36 +00:00
return NULL;
}
b_string *tmp = b_string_create();
if (!tmp) {
2025-10-29 14:35:36 +00:00
return NULL;
}
2025-10-29 14:35:36 +00:00
b_string_iterator *it_obj = b_object_create(B_TYPE_STRING_ITERATOR);
struct b_string_iterator_p *it
= b_object_get_private(it_obj, B_TYPE_STRING_ITERATOR);
it->_m = ITERATOR_MODE_TOKENS;
it->_d = delims;
it->_nd = nr_delims;
it->_s_p = str;
it->_f = flags;
it->_tmp = tmp;
it->_tmp_p = b_object_get_private(tmp, B_TYPE_STRING);
enum b_status status = find_next_token(it);
if (!B_OK(status)) {
b_string_unref(tmp);
it->_tmp = NULL;
it->_tmp_p = NULL;
}
2025-10-29 14:35:36 +00:00
return it_obj;
2024-10-24 19:24:54 +01:00
}
/* Return the length of `str` as selected by `flags`.
 *
 * B_STRLEN_NORMAL yields the cached byte length and B_STRLEN_CODEPOINTS the
 * cached codepoint count; any other flag value is delegated to b_strlen() on
 * the string's buffer. */
static size_t string_get_size(const struct b_string_p *str, b_strlen_flags flags)
{
	switch (flags) {
	case B_STRLEN_NORMAL:
		return str->s_len;
	case B_STRLEN_CODEPOINTS:
		return str->s_codepoints;
	default:
		return b_strlen(string_ptr(str), flags);
	}
}
static size_t string_get_capacity(const struct b_string_p *str)
2024-10-24 19:24:54 +01:00
{
return str->s_max;
}
/* Byte-wise equality test.
 *
 * Returns true when `a` and `b` have the same byte length and identical
 * bytes (trivially so when they are the same object), false otherwise. */
static bool string_compare(const struct b_string_p *a, const struct b_string_p *b)
{
	if (a->s_len != b->s_len) {
		return false;
	}

	if (a == b) {
		return true;
	}

	const char *lhs = string_ptr(a);
	const char *rhs = string_ptr(b);
	size_t remaining = a->s_len;

	while (remaining > 0) {
		if (*lhs != *rhs) {
			return false;
		}

		lhs++;
		rhs++;
		remaining--;
	}

	return true;
}
static char string_front(const struct b_string_p *str)
{
if (str->s_len == 0) {
return 0;
}
const char *s = string_ptr(str);
return s[0];
}
/* Return the last byte of `str`, or 0 when the string is empty. */
static char string_back(const struct b_string_p *str)
{
	return str->s_len ? string_ptr(str)[str->s_len - 1] : 0;
}
/* Remove the last character of `str`; does nothing for an empty string.
 *
 * When the string contains multi-byte codepoints the whole final codepoint
 * is removed, so no partial utf-8 sequence is left behind. Both the byte
 * length and the codepoint count are updated. */
static void string_pop_back(struct b_string_p *str)
{
	if (str->s_len == 0) {
		return;
	}

	char *s = string_ptr(str);
	size_t new_len = str->s_len - 1;

	if (str->s_len != str->s_codepoints) {
		/* cut at the start of the last codepoint; if it cannot be
		 * located fall back to dropping a single byte */
		char *last = get_last_codepoint(str);
		if (last) {
			new_len = (size_t)(last - s);
		}
	}

	s[new_len] = '\0';
	str->s_len = new_len;

	if (str->s_codepoints > 0) {
		str->s_codepoints--;
	}
}
static b_string *string_substr(const struct b_string_p *str, size_t start, size_t len)
2024-10-24 19:24:54 +01:00
{
if (start > string_get_size(str, B_STRLEN_NORMAL)) {
2025-04-11 13:55:36 +01:00
return NULL;
}
if (start + len > string_get_size(str, B_STRLEN_NORMAL)) {
len = string_get_size(str, B_STRLEN_NORMAL) - start;
2025-04-11 13:55:36 +01:00
}
b_string *newstr = b_string_create();
struct b_string_p *newstr_p = b_object_get_private(newstr, B_TYPE_STRING);
string_reserve(newstr_p, len);
2025-04-11 13:55:36 +01:00
const char *src = string_ptr(str) + start;
char *dest = string_ptr(newstr_p);
2025-04-11 13:55:36 +01:00
memcpy(dest, src, len);
newstr_p->s_len = len;
2025-04-11 13:55:36 +01:00
return newstr;
}
/* Compute a 64-bit FNV-style hash of the bytes of `str`: for each byte the
 * hash is XORed with the byte and then multiplied by the prime.
 *
 * Bytes are read as unsigned values. Reading them as plain char would
 * sign-extend bytes >= 0x80 where char is signed, flipping the upper 56 bits
 * of the hash and making the result differ between platforms. */
static uint64_t string_hash(const struct b_string_p *str)
{
#define FNV1_OFFSET_BASIS 0xcbf29ce484222325
#define FNV1_PRIME 0x100000001b3
	uint64_t hash = FNV1_OFFSET_BASIS;
	const unsigned char *bytes = (const unsigned char *)string_ptr(str);

	for (size_t i = 0; i < str->s_len; i++) {
		hash ^= bytes[i];
		hash *= FNV1_PRIME;
	}

	return hash;
}
/*** PUBLIC FUNCTIONS *********************************************************/
/* Create a string object initialised with a copy of the C string `s`.
 *
 * A NULL `s` produces an empty string.
 *
 * Returns the new string, or NULL when the object or its buffer cannot be
 * allocated. */
b_string *b_string_create_from_cstr(const char *s)
{
	b_string *str = b_string_create();
	if (!str) {
		return NULL;
	}

	if (!s) {
		return str;
	}

	struct b_string_p *p = b_object_get_private(str, B_TYPE_STRING);
	size_t s_len = strlen(s);
	size_t s_codepoints = b_wchar_utf8_codepoint_count(s, s_len);

	/* copying without a big enough buffer would overrun the inline
	 * storage, so a failed reservation must abort the construction */
	if (!B_OK(b_string_reserve(str, s_len))) {
		b_string_unref(str);
		return NULL;
	}

	char *dest = string_ptr(p);
	memcpy(dest, s, s_len);
	dest[s_len] = '\0';

	p->s_len = s_len;
	p->s_codepoints = s_codepoints;
	return str;
}
/* Create a string object consisting of `count` copies of the byte `c`.
 *
 * The byte length and the codepoint count are both set to `count`.
 *
 * Returns the new, null-terminated string, or NULL when the object or its
 * buffer cannot be allocated. */
b_string *b_string_create_from_c(char c, size_t count)
{
	b_string *str = b_string_create();
	if (!str) {
		return NULL;
	}

	struct b_string_p *p = b_object_get_private(str, B_TYPE_STRING);
	if (string_change_capacity(p, count) != 0) {
		b_string_unref(str);
		return NULL;
	}

	char *s = string_ptr(p);
	memset(s, c, count);
	/* a freshly allocated heap buffer is not zeroed; terminate explicitly */
	s[count] = '\0';

	p->s_len = count;
	p->s_codepoints = count;
	return str;
}
/* Public entry point for string_duplicate(): create a copy of `str`.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC_0, which is defined outside
 * this file; presumably it resolves the B_TYPE_STRING private data of `str`
 * and returns the helper's result. */
b_string *b_string_duplicate(const b_string *str)
{
B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_duplicate, str);
}
/* Public entry point for string_steal(): detach the content of `str` as a
 * malloc()-allocated C string and leave `str` empty.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC_0 (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
char *b_string_steal(b_string *str)
{
B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_steal, str);
}
/* Public entry point for string_reserve(): ensure `str` can hold at least
 * `capacity` bytes.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
b_status b_string_reserve(b_string *str, size_t capacity)
{
B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_reserve, str, capacity);
}
/* Public entry point for string_replace(): replace `length` characters of
 * `str` starting at `start` with `new_data`.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
b_status b_string_replace(
b_string *str, size_t start, size_t length, const char *new_data)
{
B_CLASS_DISPATCH_STATIC(
B_TYPE_STRING, string_replace, str, start, length, new_data);
}
/* Public entry point for string_replace_all(): replace the whole content of
 * `str` with the C string `new_data`.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
b_status b_string_replace_all(b_string *str, const char *new_data)
{
B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_replace_all, str, new_data);
}
/* Public entry point for string_replace_all_with_stringstream(): replace the
 * whole content of `str` with the content of the stringstream `new_data`.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
b_status b_string_replace_all_with_stringstream(
b_string *str, const b_stringstream *new_data)
{
B_CLASS_DISPATCH_STATIC(
B_TYPE_STRING, string_replace_all_with_stringstream, str, new_data);
}
/* Public entry point for string_remove(): remove `length` characters of
 * `str` starting at `start`.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
enum b_status b_string_remove(b_string *str, size_t start, size_t length)
{
B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_remove, str, start, length);
}
/* Public entry point for string_transform(): apply `transformer` to every
 * byte of `str` in place.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
b_status b_string_transform(b_string *str, int (*transformer)(int))
{
B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_transform, str, transformer);
}
/* Public entry point for string_trim(): strip leading and trailing
 * whitespace from `str`.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC_0 (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
b_status b_string_trim(b_string *str)
{
B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_trim, str);
}
/* Public entry point for string_insert_c(): insert the byte `c` into `dest`
 * at position `at`.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `dest`). */
enum b_status b_string_insert_c(b_string *dest, char c, size_t at)
{
B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_insert_c, dest, c, at);
}
/* Public entry point for string_insert_wc(): insert the wide character `c`
 * into `dest` at position `at`.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `dest`). */
enum b_status b_string_insert_wc(b_string *dest, b_wchar c, size_t at)
{
B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_insert_wc, dest, c, at);
}
/* Insert the content of the string object `src` into `dest` at position
 * `at`. Looks up the private data of both objects and returns the status of
 * string_insert_s(). */
enum b_status b_string_insert_s(b_string *dest, const b_string *src, size_t at)
{
	struct b_string_p *target = b_object_get_private(dest, B_TYPE_STRING);
	const struct b_string_p *source
		= b_object_get_private(src, B_TYPE_STRING);

	enum b_status result = string_insert_s(target, source, at);
	return result;
}
/* Insert the C string `src` into `dest` at position `at`.
 *
 * Returns the status of string_insert_cstr(), or B_ERR_INVALID_ARGUMENT when
 * `src` is NULL (its length could not be taken). */
enum b_status b_string_insert_cstr(b_string *dest, const char *src, size_t at)
{
	if (!src) {
		return B_ERR_INVALID_ARGUMENT;
	}

	struct b_string_p *dest_p = b_object_get_private(dest, B_TYPE_STRING);
	return string_insert_cstr(dest_p, src, strlen(src), at);
}
/* Insert the wide-character string `src` into `dest` at position `at`. The
 * number of characters is taken with b_wstrlen().
 *
 * Returns the status of string_insert_wstr(), or B_ERR_INVALID_ARGUMENT when
 * `src` is NULL (its length could not be taken). */
enum b_status b_string_insert_wstr(b_string *dest, const b_wchar *src, size_t at)
{
	if (!src) {
		return B_ERR_INVALID_ARGUMENT;
	}

	struct b_string_p *dest_p = b_object_get_private(dest, B_TYPE_STRING);
	return string_insert_wstr(dest_p, src, b_wstrlen(src), at);
}
/* Format text printf-style and insert it into `dest` at position `at`.
 * Returns the status of string_insertf(). */
enum b_status b_string_insert_cstrf(
	b_string *dest, size_t at, const char *format, ...)
{
	struct b_string_p *target = b_object_get_private(dest, B_TYPE_STRING);
	enum b_status result;
	va_list args;

	va_start(args, format);
	result = string_insertf(target, at, format, args);
	va_end(args);

	return result;
}
/* Public entry point for string_insert_cstr(): insert `len` bytes of `src`
 * into `dest` at position `at`.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `dest`). */
enum b_status b_string_insert_cstrn(
b_string *dest, const char *src, size_t len, size_t at)
{
B_CLASS_DISPATCH_STATIC(
B_TYPE_STRING, string_insert_cstr, dest, src, len, at);
}
/* Format text printf-style and append it to `dest`. The insertion position
 * SIZE_MAX is clamped to the end of the string by the insert routines.
 * Returns the status of string_insertf(). */
enum b_status b_string_append_cstrf(b_string *dest, const char *format, ...)
{
	struct b_string_p *target = b_object_get_private(dest, B_TYPE_STRING);
	enum b_status result;
	va_list args;

	va_start(args, format);
	result = string_insertf(target, SIZE_MAX, format, args);
	va_end(args);

	return result;
}
/* Format text printf-style and insert it at the very beginning of `dest`.
 * Returns the status of string_insertf(). */
enum b_status b_string_prepend_cstrf(b_string *dest, const char *format, ...)
{
	struct b_string_p *target = b_object_get_private(dest, B_TYPE_STRING);
	enum b_status result;
	va_list args;

	va_start(args, format);
	result = string_insertf(target, 0, format, args);
	va_end(args);

	return result;
}
/* Public entry point for string_clear(): make `str` empty.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC_0 (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
void b_string_clear(b_string *str)
{
B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_clear, str);
}
2025-10-29 14:35:36 +00:00
b_iterator *b_string_tokenise(
b_string *str, const char *delims[], size_t nr_delims,
2025-10-29 14:35:36 +00:00
b_string_tokenise_flags flags)
{
B_CLASS_DISPATCH_STATIC(
2025-10-29 14:35:36 +00:00
B_TYPE_STRING, string_tokenise, str, delims, nr_delims, flags);
}
/* Public entry point for string_get_size(): return the length of `str` in
 * the unit selected by `flags`.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
size_t b_string_get_size(const b_string *str, b_strlen_flags flags)
{
B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_get_size, str, flags);
}
/* Public entry point for string_get_capacity(): return how many bytes the
 * current buffer of `str` can hold, terminator excluded.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC_0 (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
size_t b_string_get_capacity(const b_string *str)
{
B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_get_capacity, str);
}
/* Test two string objects for byte-wise equality.
 *
 * Returns true when `a` and `b` have the same length and content, false
 * otherwise. */
bool b_string_compare(const b_string *a, const b_string *b)
{
	struct b_string_p *ap = b_object_get_private(a, B_TYPE_STRING);
	/* the right-hand operand must come from `b`; reading it from `a`
	 * would compare the string with itself and always report equality */
	struct b_string_p *bp = b_object_get_private(b, B_TYPE_STRING);
	return string_compare(ap, bp);
}
/* Public entry point for string_front(): return the first byte of `str`, or
 * 0 when it is empty.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC_0 (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
char b_string_front(const b_string *str)
{
B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_front, str);
}
/* Public entry point for string_back(): return the last byte of `str`, or 0
 * when it is empty.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC_0 (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
char b_string_back(const b_string *str)
{
B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_back, str);
}
/* Public entry point for string_pop_back(): remove the last byte of `str`;
 * an empty string is left unchanged.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC_0 (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
void b_string_pop_back(b_string *str)
{
B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_pop_back, str);
}
/* Public entry point for string_ptr(): return a pointer to the string's
 * current buffer. The pointer refers to storage owned by `str`.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC_0 (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
const char *b_string_ptr(const b_string *str)
{
B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_ptr, str);
}
/* Public entry point for string_substr(): create a new string from `len`
 * bytes of `str` starting at byte `start`.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
b_string *b_string_substr(const b_string *str, size_t start, size_t len)
{
B_CLASS_DISPATCH_STATIC(B_TYPE_STRING, string_substr, str, start, len);
}
/* Public entry point for string_hash(): return a 64-bit hash of the bytes of
 * `str`.
 * Forwarding is done by B_CLASS_DISPATCH_STATIC_0 (defined outside this file;
 * presumably it passes the B_TYPE_STRING private data of `str`). */
uint64_t b_string_hash(const b_string *str)
{
B_CLASS_DISPATCH_STATIC_0(B_TYPE_STRING, string_hash, str);
}
/*** PUBLIC ALIAS FUNCTIONS ***************************************************/
/* Append the byte `c` to `dest`; shorthand for b_string_insert_c() with a
 * position past the end, which the insert routines clamp to the end. */
enum b_status b_string_append_c(b_string *dest, char c)
{
	const size_t end_of_string = SIZE_MAX;
	return b_string_insert_c(dest, c, end_of_string);
}
/* Append the wide character `c` to `dest`; shorthand for
 * b_string_insert_wc() with a position past the end, which the insert
 * routines clamp to the end. */
enum b_status b_string_append_wc(b_string *dest, b_wchar c)
{
	const size_t end_of_string = SIZE_MAX;
	return b_string_insert_wc(dest, c, end_of_string);
}
/* Append the content of `src` to `dest`; shorthand for b_string_insert_s()
 * with a position past the end, which the insert routines clamp to the end. */
enum b_status b_string_append_s(b_string *dest, const b_string *src)
{
	const size_t end_of_string = SIZE_MAX;
	return b_string_insert_s(dest, src, end_of_string);
}
/* Append the C string `src` to `dest`; shorthand for b_string_insert_cstr()
 * with a position past the end, which the insert routines clamp to the end. */
enum b_status b_string_append_cstr(b_string *dest, const char *src)
{
	const size_t end_of_string = SIZE_MAX;
	return b_string_insert_cstr(dest, src, end_of_string);
}
/* Append the wide-character string `src` to `dest`; shorthand for
 * b_string_insert_wstr() with a position past the end, which the insert
 * routines clamp to the end. */
enum b_status b_string_append_wstr(b_string *dest, const b_wchar *src)
{
	const size_t end_of_string = SIZE_MAX;
	return b_string_insert_wstr(dest, src, end_of_string);
}
/* Insert the byte `c` at the very beginning of `dest`; shorthand for
 * b_string_insert_c() at position 0. */
enum b_status b_string_prepend_c(b_string *dest, char c)
{
	const size_t start_of_string = 0;
	return b_string_insert_c(dest, c, start_of_string);
}
/* Insert the wide character `c` at the very beginning of `dest`; shorthand
 * for b_string_insert_wc() at position 0. */
enum b_status b_string_prepend_wc(b_string *dest, b_wchar c)
{
	const size_t start_of_string = 0;
	return b_string_insert_wc(dest, c, start_of_string);
}
/* Insert the content of `src` at the very beginning of `dest`; shorthand for
 * b_string_insert_s() at position 0. */
enum b_status b_string_prepend_s(b_string *dest, const b_string *src)
{
	const size_t start_of_string = 0;
	return b_string_insert_s(dest, src, start_of_string);
}
/* Insert the C string `src` at the very beginning of `dest`; shorthand for
 * b_string_insert_cstr() at position 0. */
enum b_status b_string_prepend_cstr(b_string *dest, const char *src)
{
	const size_t start_of_string = 0;
	return b_string_insert_cstr(dest, src, start_of_string);
}
/* Insert the wide-character string `src` at the very beginning of `dest`;
 * shorthand for b_string_insert_wstr() at position 0. */
enum b_status b_string_prepend_wstr(b_string *dest, const b_wchar *src)
{
	const size_t start_of_string = 0;
	return b_string_insert_wstr(dest, src, start_of_string);
}
/*** VIRTUAL FUNCTIONS ********************************************************/
/* Initialise the private data of a newly created string object: an empty,
 * null-terminated string stored in the inline buffer.
 *
 * obj  the object being initialised (unused)
 * priv the object's struct b_string_p */
static void string_init(b_object *obj, void *priv)
{
	(void)obj;

	struct b_string_p *str = priv;
	str->s_len = 0;
	str->s_codepoints = 0;
	str->s_max = STRING_INLINE_CAPACITY;
	/* do not rely on the private data arriving zeroed: an empty string
	 * must still read as "" through string_ptr() */
	str->s_data.d_inline[0] = '\0';
}
/* Release the resources of a string object's private data: the heap buffer
 * is freed when the string is not stored inline. */
static void string_fini(b_object *obj, void *priv)
{
	struct b_string_p *str = priv;

	if (string_is_inline(str)) {
		/* inline storage lives inside the private data itself */
		return;
	}

	free(string_ptr(str));
}
/* Write the bytes of the string object `obj` to the stream `out`, one
 * b_stream_write_char() call per byte. */
static void string_to_string(const b_object *obj, b_stream *out)
{
	struct b_string_p *str = b_object_get_private(obj, B_TYPE_STRING);
	const char *cursor = string_ptr(str);
	size_t remaining = str->s_len;

	while (remaining > 0) {
		b_stream_write_char(out, *cursor);
		cursor++;
		remaining--;
	}
}
/*** ITERATOR FUNCTIONS *******************************************************/
2025-10-29 14:35:36 +00:00
static void iterator_fini(b_iterator *obj)
{
2025-10-29 14:35:36 +00:00
struct b_string_iterator_p *it
= b_object_get_private(obj, B_TYPE_STRING_ITERATOR);
if (it->_tmp) {
b_string_unref(it->_tmp);
}
memset(it, 0x0, sizeof *it);
}
2025-10-29 14:35:36 +00:00
static b_iterator *iterator_begin(b_object *obj)
{
2025-10-29 14:35:36 +00:00
b_string_iterator *it_obj = b_object_create(B_TYPE_STRING_ITERATOR);
struct b_string_iterator_p *it
= b_object_get_private(it_obj, B_TYPE_STRING_ITERATOR);
struct b_string_p *p = b_object_get_private(obj, B_TYPE_STRING);
if (!p->s_len) {
b_iterator_set_status(it_obj, B_ERR_NO_DATA);
2025-10-29 14:35:36 +00:00
return it_obj;
}
const char *s = string_ptr(p);
it->_m = ITERATOR_MODE_CHARS;
it->_s_p = p;
it->char_value = b_wchar_utf8_codepoint_decode(s);
if (it->char_value == B_WCHAR_INVALID) {
b_iterator_set_status(it_obj, B_ERR_BAD_FORMAT);
}
2025-10-29 14:35:36 +00:00
return it_obj;
}
/* b_iterable::it_cbegin implementation for b_string: casts away the const on
 * `obj` and reuses iterator_begin(), returning its iterator as a pointer to
 * const. */
static const b_iterator *iterator_cbegin(const b_object *obj)
{
return iterator_begin((b_object *)obj);
}
2025-10-29 14:35:36 +00:00
static enum b_status chars_iterator_move_next(struct b_string_iterator_p *it)
{
2025-10-29 14:35:36 +00:00
if (!it->_s_p) {
return B_ERR_NO_DATA;
}
size_t stride = b_wchar_utf8_codepoint_size(it->char_value);
if (stride == 0) {
2025-10-29 14:35:36 +00:00
return B_ERR_NO_DATA;
}
it->byte_index += stride;
it->codepoint_index += 1;
if (it->byte_index >= it->_s_p->s_len) {
it->char_value = B_WCHAR_INVALID;
2025-10-29 14:35:36 +00:00
return B_ERR_NO_DATA;
}
char *p = string_ptr(it->_s_p) + it->byte_index;
it->char_value = b_wchar_utf8_codepoint_decode(p);
if (it->char_value == B_WCHAR_INVALID) {
2025-10-29 14:35:36 +00:00
return B_ERR_BAD_FORMAT;
}
it->iteration_index++;
2025-10-29 14:35:36 +00:00
return B_SUCCESS;
}
2025-10-29 14:35:36 +00:00
static enum b_status tokens_iterator_move_next(struct b_string_iterator_p *it)
{
2025-10-29 14:35:36 +00:00
if (!it->_s_p) {
return B_ERR_NO_DATA;
}
enum b_status status = find_next_token(it);
if (!B_OK(status)) {
2025-10-29 14:35:36 +00:00
return status;
}
it->string_value = string_ptr(it->_tmp_p);
it->iteration_index++;
2025-10-29 14:35:36 +00:00
return B_SUCCESS;
}
2025-10-29 14:35:36 +00:00
static enum b_status iterator_move_next(const b_iterator *obj)
{
2025-10-29 14:35:36 +00:00
struct b_string_iterator_p *it
= b_object_get_private(obj, B_TYPE_STRING_ITERATOR);
switch (it->_m) {
case ITERATOR_MODE_CHARS:
2025-10-29 14:35:36 +00:00
return chars_iterator_move_next(it);
case ITERATOR_MODE_TOKENS:
2025-10-29 14:35:36 +00:00
return tokens_iterator_move_next(it);
default:
2025-10-29 14:35:36 +00:00
return B_ERR_BAD_STATE;
}
}
2025-10-29 14:35:36 +00:00
static b_iterator_value chars_iterator_get_value(struct b_string_iterator_p *it)
{
2025-10-29 14:35:36 +00:00
return B_ITERATOR_VALUE_INT(it->char_value);
}
2025-10-29 14:35:36 +00:00
static b_iterator_value tokens_iterator_get_value(struct b_string_iterator_p *it)
{
2025-10-29 14:35:36 +00:00
return B_ITERATOR_VALUE_CPTR(it->string_value);
}
2025-10-29 14:35:36 +00:00
static b_iterator_value iterator_get_value(b_iterator *obj)
{
2025-10-29 14:35:36 +00:00
struct b_string_iterator_p *it
= b_object_get_private(obj, B_TYPE_STRING_ITERATOR);
switch (it->_m) {
case ITERATOR_MODE_CHARS:
2025-10-29 14:35:36 +00:00
return chars_iterator_get_value(it);
case ITERATOR_MODE_TOKENS:
2025-10-29 14:35:36 +00:00
return tokens_iterator_get_value(it);
default:
2025-10-29 14:35:36 +00:00
return B_ITERATOR_VALUE_NULL;
}
}
/* b_iterator::it_get_cvalue implementation: const-object counterpart of the
 * value getter. Picks the character or token accessor by iterator mode and
 * falls back to B_ITERATOR_VALUE_NULL for any other mode. */
static const b_iterator_value iterator_get_cvalue(const b_iterator *obj)
{
	struct b_string_iterator_p *state
		= b_object_get_private(obj, B_TYPE_STRING_ITERATOR);

	if (state->_m == ITERATOR_MODE_CHARS) {
		return chars_iterator_get_value(state);
	}

	if (state->_m == ITERATOR_MODE_TOKENS) {
		return tokens_iterator_get_value(state);
	}

	return B_ITERATOR_VALUE_NULL;
}
/*** CLASS DEFINITION *********************************************************/
// ---- b_string DEFINITION
/* Class definition for b_string: supplies the b_object to_string entry and
 * the b_iterable it_begin / it_cbegin entries. */
B_TYPE_CLASS_DEFINITION_BEGIN(b_string)
	B_TYPE_CLASS_INTERFACE_BEGIN(b_object, B_TYPE_OBJECT)
		B_INTERFACE_ENTRY(to_string) = string_to_string;
	B_TYPE_CLASS_INTERFACE_END(b_object, B_TYPE_OBJECT)

	B_TYPE_CLASS_INTERFACE_BEGIN(b_iterable, B_TYPE_ITERABLE)
		B_INTERFACE_ENTRY(it_begin) = iterator_begin;
		B_INTERFACE_ENTRY(it_cbegin) = iterator_cbegin;
	B_TYPE_CLASS_INTERFACE_END(b_iterable, B_TYPE_ITERABLE)
B_TYPE_CLASS_DEFINITION_END(b_string)
B_TYPE_DEFINITION_BEGIN(b_string)
B_TYPE_ID(0x200194f6, 0x0327, 0x4a82, 0xb9c9, 0xb62ddd038c33);
B_TYPE_IMPLEMENTS(B_TYPE_ITERABLE);
2025-10-29 14:35:36 +00:00
B_TYPE_CLASS(b_string_class);
B_TYPE_INSTANCE_PRIVATE(struct b_string_p);
B_TYPE_INSTANCE_INIT(string_init);
B_TYPE_INSTANCE_FINI(string_fini);
B_TYPE_DEFINITION_END(b_string)
// ---- b_string_iterator DEFINITION
/* Class definition for b_string_iterator: leaves b_object's to_string entry
 * NULL and fills in the b_iterator entries for stepping and value access.
 * it_erase is NULL, i.e. no erase callback is provided for string
 * iterators. */
B_TYPE_CLASS_DEFINITION_BEGIN(b_string_iterator)
	B_TYPE_CLASS_INTERFACE_BEGIN(b_object, B_TYPE_OBJECT)
		B_INTERFACE_ENTRY(to_string) = NULL;
	B_TYPE_CLASS_INTERFACE_END(b_object, B_TYPE_OBJECT)

	B_TYPE_CLASS_INTERFACE_BEGIN(b_iterator, B_TYPE_ITERATOR)
		B_INTERFACE_ENTRY(it_move_next) = iterator_move_next;
		B_INTERFACE_ENTRY(it_erase) = NULL;
		B_INTERFACE_ENTRY(it_get_value) = iterator_get_value;
		B_INTERFACE_ENTRY(it_get_cvalue) = iterator_get_cvalue;
	B_TYPE_CLASS_INTERFACE_END(b_iterator, B_TYPE_ITERATOR)
B_TYPE_CLASS_DEFINITION_END(b_string_iterator)
/* Type definition for b_string_iterator: declares its type id, that it
 * extends B_TYPE_ITERATOR, its class (b_string_iterator_class) and its
 * private instance data (struct b_string_iterator_p).
 *
 * NOTE(review): unlike b_string, no B_TYPE_INSTANCE_INIT / B_TYPE_INSTANCE_FINI
 * hook is registered here, and iterator_fini() is not referenced by this
 * definition -- confirm the iterator's `_tmp` reference is released
 * somewhere. */
B_TYPE_DEFINITION_BEGIN(b_string_iterator)
B_TYPE_ID(0xfc06cee1, 0xb63a, 0x4718, 0x9b8e, 0x3bd2eb7a8608);
B_TYPE_EXTENDS(B_TYPE_ITERATOR);
B_TYPE_CLASS(b_string_iterator_class);
B_TYPE_INSTANCE_PRIVATE(struct b_string_iterator_p);
B_TYPE_DEFINITION_END(b_string_iterator)
/*** MISC FUNCTIONS ***********************************************************/
/* Returns a freshly malloc()ed copy of the null-terminated string `s`, or
 * NULL if the allocation fails. The caller owns the result and releases it
 * with free(). `s` must not be NULL. */
char *b_strdup(const char *s)
{
	/* size of the copy including the terminator */
	size_t total = strlen(s) + 1;
	char *copy = malloc(total);

	if (copy != NULL) {
		memcpy(copy, s, total);
	}

	return copy;
}
/* Counts the bytes of the null-terminated string `s`, optionally leaving out
 * markup.
 *
 * With neither B_STRLEN_IGNORE_ESC nor B_STRLEN_IGNORE_MOD set this is plain
 * strlen(). Otherwise the string is scanned byte by byte:
 *  - B_STRLEN_IGNORE_ESC: an ESC byte (\033) and everything up to and
 *    including the next alphabetic byte is not counted.
 *  - B_STRLEN_IGNORE_MOD: `[` up to and including the matching `]` is not
 *    counted, while `[[` is counted as a single character.
 *
 * A sequence that is cut off by the end of the string ends the scan at the
 * terminator; the scan never reads beyond it. */
size_t b_strlen(const char *s, b_strlen_flags flags)
{
	if (!(flags & (B_STRLEN_IGNORE_ESC | B_STRLEN_IGNORE_MOD))) {
		return strlen(s);
	}

	size_t out = 0;
	size_t i = 0;

	while (s[i]) {
		if (s[i] == '\033' && (flags & B_STRLEN_IGNORE_ESC)) {
			/* <ctype.h> functions need an unsigned char value */
			while (s[i] && !isalpha((unsigned char)s[i])) {
				i++;
			}

			/* consume the final letter, but never step over the
			 * terminator of an unfinished sequence */
			if (s[i]) {
				i++;
			}

			continue;
		}

		if (s[i] == '[' && (flags & B_STRLEN_IGNORE_MOD)) {
			i++;

			if (s[i] == '[') {
				/* `[[` is an escaped literal bracket */
				out++;
				i++;
				continue;
			}

			while (s[i] && s[i] != ']') {
				i++;
			}

			/* consume the closing bracket if there is one */
			if (s[i]) {
				i++;
			}

			continue;
		}

		out++;
		i++;
	}

	return out;
}
/* Returns a newly allocated copy of the zero-terminated b_wchar string `s`,
 * or NULL if the allocation fails. The buffer comes from calloc(), so the
 * element after the copied characters is already the zero terminator. The
 * caller owns the result and releases it with free(). */
b_wchar *b_wstrdup(const b_wchar *s)
{
	size_t count = b_wstrlen(s);
	b_wchar *copy = calloc(count + 1, sizeof(b_wchar));

	if (copy == NULL) {
		return NULL;
	}

	memcpy(copy, s, count * sizeof(b_wchar));
	return copy;
}
/* Returns the number of b_wchar elements in `s` that come before the first
 * zero element (the terminator itself is not counted). */
size_t b_wstrlen(const b_wchar *s)
{
	const b_wchar *end = s;

	while (*end != 0) {
		end++;
	}

	return (size_t)(end - s);
}