bluelib/object/string.c

#include "string.h"

#include <blue/core/stream.h>
#include <blue/core/stringstream.h>
#include <blue/object/string.h>
#include <blue/object/type.h>
#include <ctype.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* evaluates to true if x is a unicode scalar value, i.e. it lies within
 * U+0000..U+10FFFF and outside the surrogate range U+D800..U+DFFF.
 * note that x is evaluated more than once. */
#define IS_VALID_UTF8_SCALAR(x)                                                \
	(((x) >= 0x0000 && (x) <= 0xD7FF) || ((x) >= 0xE000 && (x) <= 0x10FFFF))

/* internal tokeniser flag, stored in the iterator's _f field next to the
 * caller-supplied flags. find_next_token sets it when a token was ended by a
 * delimiter and clears it when the end of the string is reached. */
#define STRING_TOK_F_FOUND_DELIM 0x80

/* what a b_string_iterator is iterating over; stored in the iterator's _m
 * field and used to dispatch b_string_iterator_next. */
enum iterator_mode {
	/* zero value left behind when an iterator is memset to 0 */
	ITERATOR_MODE_NONE = 0,
	/* per-codepoint iteration, set up by b_string_iterator_begin */
	ITERATOR_MODE_CHARS,
	/* per-token iteration, set up by b_string_tokenise */
	ITERATOR_MODE_TOKENS,
};

/* forward declarations of the callbacks referenced by string_type below. */
static void string_release(struct b_object *obj);
static void string_to_string(const struct b_object *obj, struct b_stream *out);

/* type descriptor for string objects. b_string_create passes it to
 * b_object_type_instantiate to obtain a new struct b_string. */
static struct b_object_type string_type = {
	.t_name = "corelib::string",
	.t_flags = B_OBJECT_FUNDAMENTAL,
	.t_id = B_OBJECT_TYPE_STRING,
	.t_instance_size = sizeof(struct b_string),
	.t_release = string_release,
	.t_to_string = string_to_string,
};

/* returns the number of bytes (1 to 4) needed to encode codepoint c as utf-8,
 * or 0 if c is not a valid unicode scalar value. */
static size_t utf8_codepoint_size(b_wchar c)
{
	if (!IS_VALID_UTF8_SCALAR(c)) {
		return 0;
	}

	/* c is now known to be within 0..0x10FFFF, so start at one byte and
	 * add a byte for every encoding limit that c exceeds. */
	size_t nr_bytes = 1;

	if (c > 0x7F) {
		nr_bytes++;
	}

	if (c > 0x7FF) {
		nr_bytes++;
	}

	if (c > 0xFFFF) {
		nr_bytes++;
	}

	return nr_bytes;
}

/* extracts the 6 payload bits of a utf-8 trailer (continuation) byte.
 * returns -1 if c does not have the form 10xxxxxx. */
int32_t decode_utf8_trailer_byte(char c)
{
	/* the top two bits of a trailer byte must be exactly 1 and 0 */
	if ((c & 0xC0) != 0x80) {
		return -1;
	}

	return c & 0x3F;
}

static b_wchar utf8_codepoint_decode(const char *s)
{
	b_wchar result = 0;
	int len = 0;

	if (!(s[0] & 0x80)) {
		len = 1;
		result = s[0] & 0x7F;
	} else if (s[0] & 0xC0 && !(s[0] & 0x20)) {
		len = 2;
		result = s[0] & 0x1F;
		result <<= 6;
	} else if (s[0] & 0xE0 && !(s[0] & 0x10)) {
		len = 3;
		result = s[0] & 0x0F;
		result <<= 12;
	} else if (s[0] & 0xF0 && !(s[0] & 0x08)) {
		len = 4;
		result = s[0] & 0x07;
		result <<= 18;
	} else {
		return B_WCHAR_INVALID;
	}

	for (int i = 1; i < len; i++) {
		int32_t c = decode_utf8_trailer_byte(s[i]);
		if (c == -1) {
			return B_WCHAR_INVALID;
		}

		c <<= 6 * (len - i - 1);
		result |= c;
	}

	if (!IS_VALID_UTF8_SCALAR(result)) {
		return B_WCHAR_INVALID;
	}

	return result;
}

/* encodes codepoint c as utf-8 into s, which must have room for 4 bytes.
 * returns the number of bytes written, or 0 if c cannot be encoded (in which
 * case s is not modified). no NUL terminator is written. */
static size_t utf8_codepoint_encode(b_wchar c, char s[4])
{
	/* marker bits and payload mask of the lead byte, indexed by length */
	static const unsigned char lead_marker[] = {0x00, 0x00, 0xC0, 0xE0, 0xF0};
	static const unsigned char lead_mask[] = {0x00, 0x7F, 0x1F, 0x0F, 0x07};

	size_t len = utf8_codepoint_size(c);
	if (len == 0) {
		return 0;
	}

	/* fill the trailer bytes from last to first, 6 bits at a time */
	for (size_t i = len - 1; i > 0; i--) {
		s[i] = (c & 0x3F) | 0x80;
		c >>= 6;
	}

	/* whatever is left belongs in the lead byte */
	s[0] = (c & lead_mask[len]) | lead_marker[len];

	return len;
}

/* returns the length in bytes (1 to 4) of the utf-8 sequence introduced by
 * the lead byte at *s, or 0 if that byte cannot start a sequence (a stray
 * continuation byte 0x80..0xBF, or a byte of the form 11111xxx).
 *
 * only the lead byte is inspected; the trailer bytes are not validated. */
static size_t codepoint_stride(const char *s)
{
	unsigned char lead = (unsigned char)*s;

	if ((lead & 0x80) == 0x00) {
		/* 0xxxxxxx */
		return 1;
	}

	if ((lead & 0xE0) == 0xC0) {
		/* 110xxxxx */
		return 2;
	}

	if ((lead & 0xF0) == 0xE0) {
		/* 1110xxxx */
		return 3;
	}

	if ((lead & 0xF8) == 0xF0) {
		/* 11110xxx */
		return 4;
	}

	return 0;
}

/* counts the utf-8 codepoints in the first len bytes of s.
 *
 * counting stops early at a NUL byte. s does not need to be NUL-terminated:
 * no byte at or beyond s + len is ever read, so the function is safe to call
 * on length-bounded buffers (for example a single char).
 *
 * returns 0 if an invalid lead byte is encountered or if the final sequence
 * would extend past s + len. an empty input also yields 0. */
static size_t get_number_of_codepoints(const char *s, size_t len)
{
	size_t nr_codepoints = 0;
	const char *end = s + len;

	/* check the bound before dereferencing */
	while (s < end && *s) {
		size_t stride = codepoint_stride(s);
		if (stride == 0) {
			/* invalid codepoint */
			return 0;
		}

		if (stride > (size_t)(end - s)) {
			/* truncated multi-byte sequence */
			return 0;
		}

		nr_codepoints++;
		s += stride;
	}

	return nr_codepoints;
}

/* returns the total number of bytes needed to encode the nr_codepoints
 * codepoints at s as utf-8, or 0 if any of them cannot be encoded. */
static size_t get_utf8_encoded_size(const b_wchar *s, size_t nr_codepoints)
{
	const b_wchar *cursor = s;
	const b_wchar *stop = s + nr_codepoints;
	size_t total = 0;

	while (cursor != stop) {
		size_t encoded = utf8_codepoint_size(*cursor);
		if (!encoded) {
			/* invalid codepoint */
			return 0;
		}

		total += encoded;
		cursor++;
	}

	return total;
}

/* translates a range expressed in codepoints into the equivalent byte range
 * within str.
 *
 * cp_start/cp_length: first codepoint and number of codepoints in the range.
 * out_byte_start/out_byte_length: receive the byte offset and byte length of
 * the range; either may be NULL if the caller is not interested.
 *
 * returns B_ERR_OUT_OF_BOUNDS if the range runs past the end of the string,
 * B_ERR_BAD_STATE if an invalid lead byte is encountered, B_SUCCESS otherwise.
 * the outputs are only written on success. */
static enum b_status convert_codepoint_range_to_byte_range(
	const struct b_string *str, size_t cp_start, size_t cp_length,
	size_t *out_byte_start, size_t *out_byte_length)
{
	const char *base = b_string_ptr(str);
	size_t range_start = 0;
	size_t range_bytes = 0;

	/* walk over the codepoints that precede the range */
	size_t remaining = cp_start;
	while (remaining > 0) {
		const char *lead = &base[range_start];
		if (!lead || range_start >= str->s_len) {
			/* out of range */
			return B_ERR_OUT_OF_BOUNDS;
		}

		size_t step = codepoint_stride(lead);
		if (step == 0) {
			/* invalid codepoint */
			return B_ERR_BAD_STATE;
		}

		range_start += step;
		remaining--;
	}

	/* walk over the codepoints inside the range */
	remaining = cp_length;
	while (remaining > 0) {
		size_t position = range_start + range_bytes;
		const char *lead = &base[position];
		if (!lead || position >= str->s_len) {
			/* out of range */
			return B_ERR_OUT_OF_BOUNDS;
		}

		size_t step = codepoint_stride(lead);
		if (step == 0) {
			/* invalid codepoint */
			return B_ERR_BAD_STATE;
		}

		range_bytes += step;
		remaining--;
	}

	if (out_byte_start != NULL) {
		*out_byte_start = range_start;
	}

	if (out_byte_length != NULL) {
		*out_byte_length = range_bytes;
	}

	return B_SUCCESS;
}

/* creates a new, empty string that uses its inline buffer.
 * returns NULL if the object could not be instantiated. */
struct b_string *b_string_create(void)
{
	struct b_string *created
		= (struct b_string *)b_object_type_instantiate(&string_type);

	if (created != NULL) {
		created->s_max = STRING_INLINE_CAPACITY;
		created->s_codepoints = 0;
		created->s_len = 0;
	}

	return created;
}

/* tells whether str keeps its data in the inline buffer. a capacity of
 * exactly STRING_INLINE_CAPACITY is what marks a string as inline; strings
 * cannot go below that capacity. */
static bool string_is_inline(const struct b_string *str)
{
	const size_t capacity = str->s_max;
	return capacity == STRING_INLINE_CAPACITY;
}

/* returns a writable pointer to the character data of str: the inline buffer
 * for inline strings, the external buffer otherwise. */
static char *string_ptr(struct b_string *str)
{
	return string_is_inline(str) ? str->s_data.d_inline
				     : str->s_data.d_external;
}

/* returns a pointer to the codepoint that follows the one starting at
 * this_codepoint, or NULL if this_codepoint does not point at a valid utf-8
 * lead byte (for example when it points into the middle of a sequence).
 *
 * only the lead byte is inspected; the result is not checked against the end
 * of the string. str is currently unused. */
static char *get_next_codepoint(struct b_string *str, char *this_codepoint)
{
	unsigned char lead = (unsigned char)*this_codepoint;
	size_t len = 0;

	(void)str;

	if ((lead & 0x80) == 0x00) {
		/* 0xxxxxxx */
		len = 1;
	} else if ((lead & 0xE0) == 0xC0) {
		/* 110xxxxx */
		len = 2;
	} else if ((lead & 0xF0) == 0xE0) {
		/* 1110xxxx */
		len = 3;
	} else if ((lead & 0xF8) == 0xF0) {
		/* 11110xxx */
		len = 4;
	} else {
		return NULL;
	}

	return this_codepoint + len;
}

/* returns a pointer to the start of the codepoint that precedes
 * this_codepoint within str.
 *
 * the search walks backwards over continuation bytes (10xxxxxx) until it
 * finds a byte that can start a sequence. returns NULL if the start of the
 * buffer is reached first, or if a byte that is neither a continuation byte
 * nor a valid lead byte (11111xxx) is found. */
static char *get_previous_codepoint(struct b_string *str, char *this_codepoint)
{
	char *start = string_ptr(str);
	char *p = this_codepoint;

	while (p > start) {
		p--;

		unsigned char c = (unsigned char)*p;
		if ((c & 0xC0) == 0x80) {
			/* continuation byte, keep looking for the lead byte */
			continue;
		}

		if ((c & 0xF8) == 0xF8) {
			/* can never appear in utf-8; give up rather than loop */
			return NULL;
		}

		/* ascii byte or a 2-, 3- or 4-byte lead byte */
		return p;
	}

	return NULL;
}

/* returns a pointer to the start of the final codepoint of str, or NULL if
 * the string is empty or no codepoint start could be found. */
static char *get_last_codepoint(struct b_string *str)
{
	const size_t nr_bytes = str->s_len;
	if (!nr_bytes) {
		return NULL;
	}

	/* search backwards from the one-past-the-end position */
	char *one_past_end = string_ptr(str) + nr_bytes;
	return get_previous_codepoint(str, one_past_end);
}

/* converts a string that currently uses an external buffer back to inline
 * storage. the caller (string_change_capacity) only invokes this for strings
 * that are not inline. always returns 0.
 *
 * NOTE(review): if the string is longer than the inline capacity it is
 * truncated at a byte boundary; s_codepoints is not adjusted here. */
static int string_make_inline(struct b_string *str)
{
	/* the buffer pointer must be fetched before anything is written to
	 * d_inline or s_max changes. presumably d_inline and d_external share
	 * storage -- TODO confirm against the struct definition. */
	char *buffer = string_ptr(str);
	memcpy(str->s_data.d_inline, buffer, sizeof str->s_data.d_inline);
	/* make sure the inline buffer is terminated even if data was cut off */
	str->s_data.d_inline[sizeof str->s_data.d_inline - 1] = '\0';

	str->s_max = STRING_INLINE_CAPACITY;

	if (str->s_len >= str->s_max) {
		str->s_len = str->s_max;
	}

	/* the old external buffer is no longer referenced by the string */
	free(buffer);
	return 0;
}

/* changes the capacity of a string that already uses an external buffer.
 * one extra byte is allocated on top of capacity.
 * returns 0 on success. on allocation failure, returns -1 and leaves the
 * string untouched. */
static int string_resize_large(struct b_string *str, size_t capacity)
{
	char *resized = realloc(string_ptr(str), capacity + 1);
	if (resized == NULL) {
		return -1;
	}

	str->s_data.d_external = resized;
	str->s_max = capacity;
	return 0;
}

/* moves an inline string into a newly allocated external buffer of
 * capacity + 1 bytes.
 * returns 0 on success. on allocation failure, returns -1 and leaves the
 * string untouched. */
static int string_make_large(struct b_string *str, size_t capacity)
{
	char *external = malloc(capacity + 1);
	if (external == NULL) {
		return -1;
	}

	/* copy the whole inline buffer before the string is switched over to
	 * external storage, then terminate at the current length. */
	const char *inline_data = string_ptr(str);
	memcpy(external, inline_data, sizeof str->s_data.d_inline);
	external[str->s_len] = '\0';

	str->s_data.d_external = external;
	str->s_max = capacity;
	return 0;
}

/* sets the capacity of str, switching between inline and external storage as
 * required. requests below STRING_INLINE_CAPACITY are rounded up to it.
 * returns 0 on success or -1 if memory could not be allocated. */
static int string_change_capacity(struct b_string *str, size_t capacity)
{
	if (capacity < STRING_INLINE_CAPACITY) {
		capacity = STRING_INLINE_CAPACITY;
	}

	if (capacity == str->s_max) {
		/* nothing changes. this includes the case where both the old
		 * and the new capacity fit into the inline buffer. */
		return 0;
	}

	bool fits_inline = (capacity == STRING_INLINE_CAPACITY);

	if (string_is_inline(str)) {
		/* inline -> external, unless it still fits inline */
		return fits_inline ? 0 : string_make_large(str, capacity);
	}

	/* the string currently uses an external buffer: either shrink it back
	 * into the inline buffer, or resize the external buffer. */
	return fits_inline ? string_make_inline(str)
			   : string_resize_large(str, capacity);
}

/* creates a new string holding a copy of the NUL-terminated string s.
 * if s is NULL an empty string is returned.
 * returns NULL if the string object or its buffer could not be allocated. */
struct b_string *b_string_create_from_cstr(const char *s)
{
	struct b_string *str = b_string_create();
	if (!str) {
		return NULL;
	}

	if (!s) {
		return str;
	}

	size_t s_len = strlen(s);
	size_t s_codepoints = get_number_of_codepoints(s, s_len);

	if (!B_OK(b_string_reserve(str, s_len))) {
		/* without enough room the copy below would overflow the
		 * inline buffer */
		b_string_release(str);
		return NULL;
	}

	char *dest = string_ptr(str);
	memcpy(dest, s, s_len);
	dest[s_len] = 0;

	str->s_len = s_len;
	str->s_codepoints = s_codepoints;

	return str;
}

/* creates a new string consisting of count repetitions of the character c.
 * returns NULL if the string object or its buffer could not be allocated. */
struct b_string *b_string_create_from_c(char c, size_t count)
{
	struct b_string *str = b_string_create();
	if (!str) {
		return NULL;
	}

	if (string_change_capacity(str, count) != 0) {
		b_string_release(str);
		return NULL;
	}

	char *s = string_ptr(str);
	memset(s, c, count);
	/* a freshly allocated external buffer is not zero-filled, so the
	 * terminator has to be written explicitly */
	s[count] = '\0';

	str->s_len = count;
	str->s_codepoints = count;
	return str;
}

/* creates a new string with the same contents as str.
 * returns NULL if str is NULL or if memory could not be allocated. */
struct b_string *b_string_duplicate(const struct b_string *str)
{
	if (!str) {
		return NULL;
	}

	struct b_string *new_str = b_string_create();
	if (!new_str) {
		return NULL;
	}

	if (string_change_capacity(new_str, str->s_len) != 0) {
		b_string_release(new_str);
		return NULL;
	}

	const char *src = b_string_ptr(str);
	char *dst = string_ptr(new_str);

	memcpy(dst, src, str->s_len);
	dst[str->s_len] = '\0';

	new_str->s_len = str->s_len;
	new_str->s_codepoints = str->s_codepoints;

	return new_str;
}

/* takes the character data out of str and returns it as a NUL-terminated,
 * heap-allocated C string. str is left empty.
 *
 * for inline strings the data is copied into a new malloc'd buffer; for
 * external strings the existing buffer itself is handed over. the buffer is
 * no longer referenced by str after the call.
 *
 * returns NULL, leaving str unmodified, if the copy could not be allocated. */
char *b_string_steal(struct b_string *str)
{
	char *dest = NULL;
	char *src = string_ptr(str);

	if (string_is_inline(str)) {
		dest = malloc(str->s_len + 1);
		if (!dest) {
			return NULL;
		}

		memcpy(dest, src, str->s_len);
		dest[str->s_len] = 0;
		src[0] = 0;
	} else {
		dest = src;
		str->s_data.d_external = NULL;
		/* resetting the capacity turns str back into an inline string */
		str->s_max = STRING_INLINE_CAPACITY;
	}

	str->s_len = 0;
	str->s_codepoints = 0;
	return dest;
}

/* makes sure str can hold at least capacity bytes of character data.
 * the capacity is never reduced by this function.
 * returns B_ERR_NO_MEMORY if the buffer could not be grown. */
b_status b_string_reserve(struct b_string *str, size_t capacity)
{
	bool already_large_enough = str->s_max >= capacity;
	if (already_large_enough) {
		return B_SUCCESS;
	}

	if (string_change_capacity(str, capacity) != 0) {
		return B_ERR_NO_MEMORY;
	}

	return B_SUCCESS;
}

/* replaces length bytes of str, beginning at byte offset start, with the
 * NUL-terminated string new_data. used when every character of str occupies
 * exactly one byte (s_len == s_codepoints).
 *
 * a length that reaches past the end of the string is clamped.
 * returns B_ERR_INVALID_ARGUMENT if new_data is NULL or start is out of range,
 * or the error from b_string_reserve if the string could not be grown. */
static enum b_status replace_ansi(
	struct b_string *str, size_t start, size_t length, const char *new_data)
{
	b_status status = B_SUCCESS;

	if (!new_data) {
		return B_ERR_INVALID_ARGUMENT;
	}

	if (start >= str->s_len) {
		return B_ERR_INVALID_ARGUMENT;
	}

	/* written as a subtraction so that a huge length (e.g. SIZE_MAX)
	 * cannot wrap around */
	if (length > str->s_len - start) {
		length = str->s_len - start;
	}

	size_t new_data_len = strlen(new_data);

	/* count the codepoints in new_data: every byte that is not a utf-8
	 * continuation byte starts one */
	size_t new_data_codepoints = 0;
	for (size_t i = 0; i < new_data_len; i++) {
		if (((unsigned char)new_data[i] & 0xC0) != 0x80) {
			new_data_codepoints++;
		}
	}

	size_t new_str_len = str->s_len - length + new_data_len;
	if (new_str_len > str->s_max) {
		status = b_string_reserve(str, new_str_len);
	}

	if (!B_OK(status)) {
		return status;
	}

	char *s = string_ptr(str);

	char *substitution_start = s + start;
	char *excess_src = s + start + length;
	size_t excess_length = str->s_len - start - length;
	char *excess_dest = substitution_start + new_data_len;

	/* move the tail out of the way first, then drop in the new data */
	memmove(excess_dest, excess_src, excess_length);
	memmove(substitution_start, new_data, new_data_len);
	s[new_str_len] = '\0';

	str->s_len = new_str_len;
	/* each removed byte was one codepoint */
	str->s_codepoints = str->s_codepoints - length + new_data_codepoints;

	return B_SUCCESS;
}

/* replaces length codepoints of str, beginning at codepoint index start, with
 * the NUL-terminated utf-8 string new_data. used when str contains multi-byte
 * characters.
 *
 * a length that reaches past the end of the string is clamped. an empty
 * new_data is allowed and simply removes the range.
 * returns B_ERR_INVALID_ARGUMENT if new_data is NULL, start is out of range,
 * or new_data is not valid utf-8. errors from the range conversion and from
 * b_string_reserve are passed through. */
static enum b_status replace_utf8(
	struct b_string *str, size_t start, size_t length, const char *new_data)
{
	if (!new_data) {
		return B_ERR_INVALID_ARGUMENT;
	}

	if (start >= str->s_codepoints) {
		return B_ERR_INVALID_ARGUMENT;
	}

	/* written as a subtraction so that a huge length (e.g. SIZE_MAX)
	 * cannot wrap around */
	if (length > str->s_codepoints - start) {
		length = str->s_codepoints - start;
	}

	size_t new_data_nr_bytes = strlen(new_data);
	size_t new_data_nr_codepoints
		= get_number_of_codepoints(new_data, new_data_nr_bytes);
	if (new_data_nr_bytes > 0 && new_data_nr_codepoints == 0) {
		/* new_data is not a valid utf-8 string */
		return B_ERR_INVALID_ARGUMENT;
	}

	size_t old_data_offset = 0, old_data_nr_bytes = 0;
	size_t old_data_nr_codepoints = length;
	enum b_status status = convert_codepoint_range_to_byte_range(
		str, start, length, &old_data_offset, &old_data_nr_bytes);
	if (!B_OK(status)) {
		return status;
	}

	size_t new_total_bytes = str->s_len - old_data_nr_bytes + new_data_nr_bytes;
	if (new_total_bytes > str->s_max) {
		status = b_string_reserve(str, new_total_bytes);
	}

	if (!B_OK(status)) {
		return status;
	}

	char *s = string_ptr(str);

	char *substitution_start = s + old_data_offset;
	char *excess_src = s + old_data_offset + old_data_nr_bytes;
	size_t excess_length = str->s_len - old_data_offset - old_data_nr_bytes;
	char *excess_dest = substitution_start + new_data_nr_bytes;

	/* move the tail out of the way first, then drop in the new data */
	memmove(excess_dest, excess_src, excess_length);
	memmove(substitution_start, new_data, new_data_nr_bytes);
	s[new_total_bytes] = '\0';

	str->s_len = new_total_bytes;
	str->s_codepoints -= old_data_nr_codepoints;
	str->s_codepoints += new_data_nr_codepoints;

	return B_SUCCESS;
}

/* replaces the range [start, start + length) of str with new_data.
 * when every character of str is a single byte the byte-indexed
 * implementation is used, otherwise the codepoint-indexed one. */
b_status b_string_replace(
	struct b_string *str, size_t start, size_t length, const char *new_data)
{
	bool single_byte_only = (str->s_len == str->s_codepoints);

	return single_byte_only ? replace_ansi(str, start, length, new_data)
				: replace_utf8(str, start, length, new_data);
}

/* replaces the entire contents of str with the NUL-terminated string
 * new_data.
 * returns B_ERR_INVALID_ARGUMENT if new_data is NULL, or the error from
 * b_string_reserve if the string could not be grown; str is left unchanged
 * in both cases. */
b_status b_string_replace_all(b_string *str, const char *new_data)
{
	if (!new_data) {
		return B_ERR_INVALID_ARGUMENT;
	}

	size_t new_len = strlen(new_data);

	b_status status = b_string_reserve(str, new_len);
	if (!B_OK(status)) {
		/* copying without enough room would overflow the buffer */
		return status;
	}

	char *dest = string_ptr(str);
	memcpy(dest, new_data, new_len);
	dest[new_len] = '\0';

	str->s_len = new_len;
	/* keep the codepoint count in step with the new contents */
	str->s_codepoints = get_number_of_codepoints(new_data, new_len);

	return B_SUCCESS;
}

/* removes length bytes from str, beginning at byte offset start. used when
 * every character of str occupies exactly one byte (s_len == s_codepoints).
 *
 * a length that reaches past the end of the string is clamped.
 * returns B_ERR_INVALID_ARGUMENT if start is out of range. */
static enum b_status remove_ansi(struct b_string *str, size_t start, size_t length)
{
	if (start >= str->s_len) {
		return B_ERR_INVALID_ARGUMENT;
	}

	/* written as a subtraction so that a huge length (e.g. SIZE_MAX)
	 * cannot wrap around */
	if (length > str->s_len - start) {
		length = str->s_len - start;
	}

	size_t new_str_len = str->s_len - length;

	char *s = string_ptr(str);

	char *removal_start = s + start;
	char *excess_src = s + start + length;
	size_t excess_length = str->s_len - start - length;

	/* close the gap by pulling the tail forward */
	memmove(removal_start, excess_src, excess_length);
	s[new_str_len] = '\0';

	str->s_len = new_str_len;
	/* each removed byte was one codepoint */
	str->s_codepoints -= length;

	return B_SUCCESS;
}

/* removes length codepoints from str, beginning at codepoint index start.
 * used when str contains multi-byte characters.
 *
 * a length that reaches past the end of the string is clamped, matching the
 * behaviour of the single-byte implementation. errors from the range
 * conversion (start out of range, invalid utf-8) are passed through. */
static enum b_status remove_utf8(struct b_string *str, size_t start, size_t length)
{
	if (start < str->s_codepoints && length > str->s_codepoints - start) {
		length = str->s_codepoints - start;
	}

	size_t remove_offset = 0, remove_nr_bytes = 0;
	enum b_status status = convert_codepoint_range_to_byte_range(
		str, start, length, &remove_offset, &remove_nr_bytes);
	if (!B_OK(status)) {
		return status;
	}

	size_t new_total_bytes = str->s_len - remove_nr_bytes;

	char *s = string_ptr(str);

	char *removal_start = s + remove_offset;
	char *excess_src = s + remove_offset + remove_nr_bytes;
	size_t excess_length = str->s_len - remove_offset - remove_nr_bytes;

	/* close the gap by pulling the tail forward */
	memmove(removal_start, excess_src, excess_length);
	s[new_total_bytes] = '\0';

	str->s_len = new_total_bytes;
	str->s_codepoints -= length;

	return B_SUCCESS;
}

/* removes the range [start, start + length) from str.
 * when every character of str is a single byte the byte-indexed
 * implementation is used, otherwise the codepoint-indexed one. */
enum b_status b_string_remove(struct b_string *str, size_t start, size_t length)
{
	bool single_byte_only = (str->s_len == str->s_codepoints);

	return single_byte_only ? remove_ansi(str, start, length)
				: remove_utf8(str, start, length);
}

/* applies transformer to every byte of str, in place.
 *
 * each byte is passed as an unsigned char value (0..255), which is what the
 * <ctype.h> functions such as toupper/tolower require. a result of 0 leaves
 * the byte unchanged, so a transformer cannot introduce a NUL byte.
 * always returns B_SUCCESS. */
b_status b_string_transform(struct b_string *str, int (*transformer)(int))
{
	char *s = string_ptr(str);
	for (size_t i = 0; i < str->s_len; i++) {
		/* plain char may be signed; a negative argument is undefined
		 * behaviour for the ctype functions */
		int c = transformer((unsigned char)s[i]);

		if (c != 0) {
			s[i] = (char)c;
		}
	}

	return B_SUCCESS;
}

/* strips leading and trailing whitespace (as defined by isspace) from str.
 * used when every character of str occupies exactly one byte
 * (s_len == s_codepoints). the result is always NUL-terminated and both the
 * byte and codepoint counts are updated. always returns B_SUCCESS. */
static enum b_status trim_ansi(struct b_string *str)
{
	char *s = string_ptr(str);

	/* first byte that is not whitespace */
	size_t begin = 0;
	while (begin < str->s_len && isspace((unsigned char)s[begin])) {
		begin++;
	}

	/* one past the last byte that is not whitespace */
	size_t end = str->s_len;
	while (end > begin && isspace((unsigned char)s[end - 1])) {
		end--;
	}

	size_t new_len = end - begin;

	memmove(s, s + begin, new_len);
	s[new_len] = '\0';

	str->s_len = new_len;
	/* one byte per codepoint in this mode */
	str->s_codepoints = new_len;

	return B_SUCCESS;
}

static enum b_status trim_utf8(struct b_string *str)
{
	char *s = string_ptr(str);
	size_t whitespace_end = 0;
	size_t nr_whitespace_codepoints = 0;
	for (size_t i = 0; i < str->s_len;) {
		b_wchar c = utf8_codepoint_decode(&s[i]);

		if (!b_wchar_is_space(s[i])) {
			whitespace_end = i;
			break;
		}

		nr_whitespace_codepoints++;
	}

	memmove(s, s + whitespace_end, str->s_len - whitespace_end);
	str->s_len -= whitespace_end;
	str->s_codepoints -= nr_whitespace_codepoints;

	char *p = get_last_codepoint(str);
	if (!p) {
		return B_ERR_BAD_STATE;
	}

	for (long i = str->s_len - 1; i >= 0;) {
		b_wchar c = utf8_codepoint_decode(p);
		size_t c_size = utf8_codepoint_size(c);

		if (b_wchar_is_space(c)) {
			memset(p, 0, c_size);
			str->s_len -= c_size;
			str->s_codepoints--;
		} else {
			break;
		}

		p = get_previous_codepoint(str, p);
	}

	return B_SUCCESS;
}

/* removes leading and trailing whitespace from str.
 * empty strings are returned as-is; otherwise the single-byte or the utf-8
 * implementation is chosen depending on the contents of str. */
b_status b_string_trim(struct b_string *str)
{
	if (!str->s_len) {
		return B_SUCCESS;
	}

	bool single_byte_only = (str->s_len == str->s_codepoints);

	return single_byte_only ? trim_ansi(str) : trim_utf8(str);
}

static enum b_status string_insert_cstr_ansi(
	struct b_string *dest, const char *src, size_t nr_bytes, size_t at)
{
	if (at >= dest->s_len) {
		at = dest->s_len;
	}

	size_t new_size = dest->s_len + nr_bytes;
	if (dest->s_max < new_size) {
		string_change_capacity(dest, new_size);
	}

	char *dest_buf = string_ptr(dest);
	char *from = dest_buf + at;
	char *to = dest_buf + at + nr_bytes;

	memmove(to, from, dest->s_len - at);
	memcpy(from, src, nr_bytes);
	dest_buf[new_size] = '\0';

	dest->s_len = new_size;
	dest->s_codepoints += nr_bytes;
	return B_SUCCESS;
}

static enum b_status string_insert_cstr_utf8(
	struct b_string *dest, const char *src, size_t nr_bytes,
	size_t codepoint_offset)
{
	if (codepoint_offset >= dest->s_codepoints) {
		codepoint_offset = dest->s_codepoints;
	}

	size_t byte_offset = 0;
	enum b_status status = B_SUCCESS;

	if (codepoint_offset == dest->s_codepoints) {
		byte_offset = dest->s_len;
	} else {
		status = convert_codepoint_range_to_byte_range(
			dest, 0, codepoint_offset, NULL, &byte_offset);
	}

	if (!B_OK(status)) {
		return status;
	}

	size_t new_total_bytes = dest->s_len + nr_bytes;
	if (dest->s_max < new_total_bytes) {
		string_change_capacity(dest, new_total_bytes);
	}

	char *dest_buf = string_ptr(dest);
	char *from = dest_buf + byte_offset;
	char *to = dest_buf + byte_offset + nr_bytes;

	memmove(to, from, dest->s_len - byte_offset);
	memcpy(from, src, nr_bytes);
	dest_buf[new_total_bytes] = '\0';

	dest->s_len += nr_bytes;
	dest->s_codepoints += get_number_of_codepoints(src, nr_bytes);

	return B_SUCCESS;
}

static enum b_status string_insert_wstr_ansi(
	struct b_string *dest, const b_wchar *src, size_t nr_codepoints, size_t at)
{
	if (at >= dest->s_len) {
		at = dest->s_len;
	}

	size_t utf8_encoded_size = get_utf8_encoded_size(src, nr_codepoints);
	if (utf8_encoded_size == 0) {
		return B_ERR_INVALID_ARGUMENT;
	}

	size_t new_total_bytes = dest->s_len + utf8_encoded_size;
	if (dest->s_max < new_total_bytes) {
		string_change_capacity(dest, new_total_bytes);
	}

	char *dest_buf = string_ptr(dest);
	char *from = dest_buf + at;
	char *to = dest_buf + at + utf8_encoded_size;
	memmove(to, from, dest->s_len - at);

	char *ptr = dest_buf + at;
	for (size_t i = 0; i < nr_codepoints; i++) {
		char c[4];
		size_t c_len = utf8_codepoint_encode(src[i], c);
		if (c_len == 0) {
			/* the input string was already checked by
			 * get_utf8_encoded_size, so this should never happen */
			return B_ERR_INVALID_ARGUMENT;
		}

		memcpy(ptr, c, c_len);
		ptr += c_len;
	}

	dest_buf[new_total_bytes] = '\0';

	dest->s_len += utf8_encoded_size;
	dest->s_codepoints += nr_codepoints;

	return B_SUCCESS;
}

static enum b_status string_insert_wstr_utf8(
	struct b_string *dest, const b_wchar *src, size_t nr_codepoints,
	size_t codepoint_offset)
{
	if (codepoint_offset >= dest->s_codepoints) {
		codepoint_offset = dest->s_codepoints;
	}

	size_t utf8_encoded_size = get_utf8_encoded_size(src, nr_codepoints);
	if (utf8_encoded_size == 0) {
		return B_ERR_INVALID_ARGUMENT;
	}

	size_t new_total_bytes = dest->s_len + utf8_encoded_size;
	if (dest->s_max < new_total_bytes) {
		string_change_capacity(dest, new_total_bytes);
	}

	size_t move_offset = 0;
	enum b_status status = B_SUCCESS;

	if (codepoint_offset == dest->s_codepoints) {
		move_offset = dest->s_len;
	} else {
		status = convert_codepoint_range_to_byte_range(
			dest, 0, codepoint_offset, NULL, &move_offset);
	}

	if (!B_OK(status)) {
		return status;
	}

	char *dest_buf = string_ptr(dest);
	char *from = dest_buf + move_offset;
	char *to = dest_buf + move_offset + utf8_encoded_size;
	memmove(to, from, dest->s_len - move_offset);

	char *ptr = dest_buf + move_offset;
	for (size_t i = 0; i < nr_codepoints; i++) {
		char c[4];
		size_t c_len = utf8_codepoint_encode(src[i], c);
		if (c_len == 0) {
			/* the input string was already checked by
			 * get_utf8_encoded_size, so this should never happen */
			return B_ERR_INVALID_ARGUMENT;
		}

		memcpy(ptr, c, c_len);
		ptr += c_len;
	}

	dest_buf[new_total_bytes] = '\0';

	dest->s_len += utf8_encoded_size;
	dest->s_codepoints += nr_codepoints;

	return B_SUCCESS;
}

/* inserts nr_bytes bytes from src into dest at position at, picking the
 * byte-indexed implementation when every character of dest is a single byte
 * and the codepoint-indexed one otherwise. */
static enum b_status string_insert_cstr(
	struct b_string *dest, const char *src, size_t nr_bytes, size_t at)
{
	bool single_byte_only = (dest->s_len == dest->s_codepoints);

	return single_byte_only
		? string_insert_cstr_ansi(dest, src, nr_bytes, at)
		: string_insert_cstr_utf8(dest, src, nr_bytes, at);
}

/* inserts nr_codepoints codepoints from src into dest at position at,
 * picking the byte-indexed implementation when every character of dest is a
 * single byte and the codepoint-indexed one otherwise. */
static enum b_status string_insert_wstr(
	struct b_string *dest, const b_wchar *src, size_t nr_codepoints, size_t at)
{
	bool single_byte_only = (dest->s_len == dest->s_codepoints);

	return single_byte_only
		? string_insert_wstr_ansi(dest, src, nr_codepoints, at)
		: string_insert_wstr_utf8(dest, src, nr_codepoints, at);
}

static enum b_status string_insertf(
	struct b_string *dest, size_t at, const char *format, va_list arg)
{
	char buf[1024];
	size_t len = vsnprintf(buf, sizeof buf, format, arg);
	return string_insert_cstr(dest, buf, len, at);
}

/* inserts the single byte c into dest at position at. */
enum b_status b_string_insert_c(struct b_string *dest, char c, size_t at)
{
	const char single[1] = {c};
	return string_insert_cstr(dest, single, sizeof single, at);
}

/* inserts the single codepoint c into dest at position at. */
enum b_status b_string_insert_wc(struct b_string *dest, b_wchar c, size_t at)
{
	const b_wchar single[1] = {c};
	return string_insert_wstr(dest, single, 1, at);
}

/* inserts the contents of the string src into dest at position at. */
enum b_status b_string_insert_s(
	struct b_string *dest, const struct b_string *src, size_t at)
{
	const char *src_data = b_string_ptr(src);
	size_t src_bytes = src->s_len;

	return string_insert_cstr(dest, src_data, src_bytes, at);
}

/* inserts the NUL-terminated string src into dest at position at. */
enum b_status b_string_insert_cstr(struct b_string *dest, const char *src, size_t at)
{
	size_t src_bytes = strlen(src);
	return string_insert_cstr(dest, src, src_bytes, at);
}

/* inserts the codepoint string src, whose length is measured with
 * b_wstrlen, into dest at position at. */
enum b_status b_string_insert_wstr(
	struct b_string *dest, const b_wchar *src, size_t at)
{
	size_t src_codepoints = b_wstrlen(src);
	return string_insert_wstr(dest, src, src_codepoints, at);
}

enum b_status b_string_insert_cstrf(
	struct b_string *dest, size_t at, const char *format, ...)
{
	va_list arg;
	va_start(arg, format);
	enum b_status status = string_insertf(dest, at, format, arg);
	va_end(arg);

	return status;
}

/* inserts the first len bytes of src into dest at position at. */
enum b_status b_string_insert_cstrn(
	b_string *dest, const char *src, size_t len, size_t at)
{
	enum b_status result = string_insert_cstr(dest, src, len, at);
	return result;
}

/* appends the single byte c to dest. */
enum b_status b_string_append_c(struct b_string *dest, char c)
{
	/* any offset past the end is clamped to the end by the insert code */
	const size_t at_end = SIZE_MAX;
	return b_string_insert_c(dest, c, at_end);
}

/* appends the single codepoint c to dest. */
enum b_status b_string_append_wc(struct b_string *dest, b_wchar c)
{
	/* any offset past the end is clamped to the end by the insert code */
	const size_t at_end = SIZE_MAX;
	return b_string_insert_wc(dest, c, at_end);
}

/* appends the contents of the string src to dest. */
enum b_status b_string_append_s(struct b_string *dest, const struct b_string *src)
{
	/* any offset past the end is clamped to the end by the insert code */
	const size_t at_end = SIZE_MAX;
	return b_string_insert_s(dest, src, at_end);
}

/* appends the NUL-terminated string src to dest. */
enum b_status b_string_append_cstr(struct b_string *dest, const char *src)
{
	/* any offset past the end is clamped to the end by the insert code */
	const size_t at_end = SIZE_MAX;
	return b_string_insert_cstr(dest, src, at_end);
}

/* appends the codepoint string src to dest. */
enum b_status b_string_append_wstr(struct b_string *dest, const b_wchar *src)
{
	/* any offset past the end is clamped to the end by the insert code */
	const size_t at_end = SIZE_MAX;
	return b_string_insert_wstr(dest, src, at_end);
}

enum b_status b_string_append_cstrf(struct b_string *dest, const char *format, ...)
{
	va_list arg;
	va_start(arg, format);
	enum b_status status = string_insertf(dest, SIZE_MAX, format, arg);
	va_end(arg);

	return status;
}

/* inserts the single byte c at the very start of dest. */
enum b_status b_string_prepend_c(struct b_string *dest, char c)
{
	const size_t at_start = 0;
	return b_string_insert_c(dest, c, at_start);
}

/* inserts the single codepoint c at the very start of dest. */
enum b_status b_string_prepend_wc(struct b_string *dest, b_wchar c)
{
	const size_t at_start = 0;
	return b_string_insert_wc(dest, c, at_start);
}

/* inserts the contents of the string src at the very start of dest. */
enum b_status b_string_prepend_s(struct b_string *dest, const struct b_string *src)
{
	const size_t at_start = 0;
	return b_string_insert_s(dest, src, at_start);
}

/* inserts the NUL-terminated string src at the very start of dest. */
enum b_status b_string_prepend_cstr(struct b_string *dest, const char *src)
{
	const size_t at_start = 0;
	return b_string_insert_cstr(dest, src, at_start);
}

/* inserts the codepoint string src at the very start of dest. */
enum b_status b_string_prepend_wstr(struct b_string *dest, const b_wchar *src)
{
	const size_t at_start = 0;
	return b_string_insert_wstr(dest, src, at_start);
}

enum b_status b_string_prepend_cstrf(struct b_string *dest, const char *format, ...)
{
	va_list arg;
	va_start(arg, format);
	enum b_status status = string_insertf(dest, 0, format, arg);
	va_end(arg);

	return status;
}

void b_string_clear(struct b_string *str)
{
	if (str->s_len == 0) {
		return;
	}

	char *s = string_ptr(str);
	*s = '\0';
	str->s_len = 0;
	str->s_codepoints = 0;
}

/* tentative definition so that b_string_tokenise can refer to the iterator
 * operations before they are initialised further down in this file. */
static struct b_iterator_ops it_ops;

/* checks whether the NUL-terminated string s begins with prefix.
 *
 * on a match, returns true and stores the length of prefix in *prefix_len.
 * otherwise returns false and leaves *prefix_len untouched; this includes
 * the case where s ends before the whole prefix has been matched.
 * an empty prefix matches any string with a length of 0. */
static bool has_prefix(const char *s, const char *prefix, size_t *prefix_len)
{
	size_t len = 0;

	while (prefix[len] != 0) {
		/* also fails when s[len] is the terminator, because
		 * prefix[len] is known to be non-zero here */
		if (s[len] != prefix[len]) {
			return false;
		}

		len++;
	}

	*prefix_len = len;
	return true;
}

/* checks whether s begins with any of the nr_prefixes strings in prefixes.
 *
 * the prefixes are tried in order; on the first match, its length is stored
 * in *selected_prefix_len and true is returned. NULL and empty prefixes are
 * skipped: they would match everywhere with a length of 0, and the tokeniser
 * would never advance past such a match. */
static bool has_prefixes(
	const char *s, const char **prefixes, size_t nr_prefixes,
	size_t *selected_prefix_len)
{
	for (size_t i = 0; i < nr_prefixes; i++) {
		const char *delim = prefixes[i];
		if (!delim || delim[0] == '\0') {
			continue;
		}

		if (has_prefix(s, delim, selected_prefix_len)) {
			return true;
		}
	}

	return false;
}

static enum b_status find_next_token(struct b_string_iterator *it)
{
	size_t offset = it->_ds;
	size_t prefix_len = 0;
	char *start = string_ptr(it->_s);
	bool found_delim_last_time = (it->_f & STRING_TOK_F_FOUND_DELIM) != 0;
	bool found_delim = false;
	bool include_empty = (it->_f & B_STRING_TOK_F_INCLUDE_EMPTY_TOKENS);
	bool found_null = false;
	b_string_clear(it->_tmp);

	while (1) {
		char *s = start + offset;
		if (*s == 0) {
			it->_f &= ~STRING_TOK_F_FOUND_DELIM;
			break;
		}

		found_delim = has_prefixes(s, it->_d, it->_nd, &prefix_len);
		if (found_delim) {
			if (it->_tmp->s_len == 0 && !include_empty) {
				/* this token is empty, skip it */
				offset += prefix_len;
				found_delim = false;
				continue;
			}

			it->_f |= STRING_TOK_F_FOUND_DELIM;
			break;
		}

		b_wchar c = utf8_codepoint_decode(s);
		if (c == B_WCHAR_INVALID) {
			return B_ERR_BAD_STATE;
		}

		b_string_append_wc(it->_tmp, c);
		offset += utf8_codepoint_size(c);

		if (offset > it->_s->s_len) {
			break;
		}
	}

	bool end = !found_delim && it->_tmp->s_len == 0;

	if (include_empty && found_delim_last_time) {
		end = false;
	}

	if (end) {
		it->string_value = NULL;
		it->string_length = 0;
		it->string_codepoints = 0;
		return B_ERR_NO_DATA;
	}

	it->_ds = offset + prefix_len;
	it->string_value = b_string_ptr(it->_tmp);
	it->string_length = it->_tmp->s_len;
	it->string_codepoints = it->_tmp->s_codepoints;
	return B_SUCCESS;
}

/* initialises it as a token iterator over str and positions it on the first
 * token.
 *
 * delims/nr_delims: the delimiter strings that separate tokens.
 * flags: tokeniser flags, stored in the iterator.
 *
 * it is always zeroed first. returns B_ERR_INVALID_ARGUMENT if nr_delims is
 * 0, B_ERR_NO_MEMORY if the scratch string cannot be created, or the status
 * of the search for the first token. if that search fails, the scratch
 * string is released again. */
enum b_status b_string_tokenise(
	struct b_string *str, const char *delims[], size_t nr_delims,
	b_string_tokenise_flags flags, struct b_string_iterator *it)
{
	memset(it, 0x0, sizeof *it);

	if (nr_delims == 0) {
		return B_ERR_INVALID_ARGUMENT;
	}

	struct b_string *scratch = b_string_create();
	if (scratch == NULL) {
		return B_ERR_NO_MEMORY;
	}

	it->_tmp = scratch;
	it->_s = str;
	it->_d = delims;
	it->_nd = nr_delims;
	it->_f = flags;
	it->_m = ITERATOR_MODE_TOKENS;
	it->_base.it_ops = &it_ops;

	enum b_status first = find_next_token(it);
	if (B_OK(first)) {
		return first;
	}

	/* no first token: the iterator must not keep the scratch string */
	b_string_release(scratch);
	it->_tmp = NULL;

	return first;
}

/* returns the size of str measured as requested by flags: the stored byte
 * count for B_STRLEN_NORMAL, the stored codepoint count for
 * B_STRLEN_CODEPOINTS, and whatever b_strlen computes for any other value. */
size_t b_string_get_size(const struct b_string *str, b_strlen_flags flags)
{
	if (flags == B_STRLEN_NORMAL) {
		return str->s_len;
	}

	if (flags == B_STRLEN_CODEPOINTS) {
		return str->s_codepoints;
	}

	return b_strlen(b_string_ptr(str), flags);
}

/* returns the current capacity of str as stored in s_max. */
size_t b_string_get_capacity(const struct b_string *str)
{
	const size_t capacity = str->s_max;
	return capacity;
}

/* returns true if a and b hold exactly the same bytes, false otherwise. */
bool b_string_compare(const struct b_string *a, const struct b_string *b)
{
	const size_t nr_bytes = a->s_len;

	if (nr_bytes != b->s_len) {
		return false;
	}

	if (a == b) {
		/* a string is always equal to itself */
		return true;
	}

	return memcmp(b_string_ptr(a), b_string_ptr(b), nr_bytes) == 0;
}

char b_string_front(const struct b_string *str)
{
	if (str->s_len == 0) {
		return 0;
	}

	const char *s = b_string_ptr(str);
	return s[0];
}

char b_string_back(const struct b_string *str)
{
	if (str->s_len == 0) {
		return 0;
	}

	const char *s = b_string_ptr(str);
	return s[str->s_len - 1];
}

/* removes the last byte of str. does nothing if the string is empty.
 *
 * the codepoint count is reduced when the removed byte started a codepoint
 * (i.e. it was not a utf-8 continuation byte), so that the count stays in
 * step with the byte length for single-byte strings.
 *
 * NOTE(review): removing one byte from the end of a multi-byte sequence
 * leaves a truncated sequence behind; callers working with non-ascii text
 * presumably need to pop repeatedly -- confirm intended behaviour. */
void b_string_pop_back(struct b_string *str)
{
	if (str->s_len == 0) {
		return;
	}

	char *s = string_ptr(str);
	size_t last = str->s_len - 1;

	bool starts_codepoint = ((unsigned char)s[last] & 0xC0) != 0x80;
	if (starts_codepoint && str->s_codepoints > 0) {
		str->s_codepoints--;
	}

	s[last] = '\0';
	str->s_len = last;
}

/* returns a read-only pointer to the character data of str: the inline
 * buffer for inline strings, the external buffer otherwise. */
const char *b_string_ptr(const struct b_string *str)
{
	return string_is_inline(str) ? str->s_data.d_inline
				     : str->s_data.d_external;
}

/* creates a new string from len bytes of str, starting at byte offset start.
 *
 * a len that reaches past the end of str is clamped. the new string is
 * NUL-terminated and its codepoint count reflects the copied bytes.
 * returns NULL if start lies beyond the end of str or if memory could not be
 * allocated. */
struct b_string *b_string_substr(const struct b_string *str, size_t start, size_t len)
{
	size_t total = b_string_get_size(str, B_STRLEN_NORMAL);

	if (start > total) {
		return NULL;
	}

	/* written as a subtraction so that a huge len (e.g. SIZE_MAX) cannot
	 * wrap around */
	if (len > total - start) {
		len = total - start;
	}

	struct b_string *newstr = b_string_create();
	if (!newstr) {
		return NULL;
	}

	if (!B_OK(b_string_reserve(newstr, len))) {
		b_string_release(newstr);
		return NULL;
	}

	const char *src = b_string_ptr(str) + start;
	char *dest = string_ptr(newstr);

	memcpy(dest, src, len);
	dest[len] = '\0';
	newstr->s_len = len;

	/* every byte that is not a continuation byte starts a codepoint */
	size_t nr_codepoints = 0;
	for (size_t i = 0; i < len; i++) {
		if (((unsigned char)src[i] & 0xC0) != 0x80) {
			nr_codepoints++;
		}
	}

	newstr->s_codepoints = nr_codepoints;

	return newstr;
}

/* stream close callback: releases the string stored in the stream's s_ptr
 * field. always returns B_SUCCESS. */
static enum b_status stream_close(struct b_stream *stream)
{
	struct b_string *backing = stream->s_ptr;

	b_string_release(backing);
	return B_SUCCESS;
}

/* stream getc callback: stores the byte at the cursor in *out and advances
 * the cursor by one. returns B_ERR_NO_DATA, leaving *out untouched, once the
 * cursor has reached the end of the string. */
static enum b_status stream_getc(struct b_stream *stream, int *out)
{
	struct b_string *backing = stream->s_ptr;

	bool at_end = stream->s_cursor >= backing->s_len;
	if (at_end) {
		return B_ERR_NO_DATA;
	}

	const char *data = string_ptr(backing);
	*out = data[stream->s_cursor++];

	return B_SUCCESS;
}

/* stream read callback: copies up to count bytes, starting at the cursor,
 * into buf and stores the number of bytes copied in *nr_read.
 *
 * the cursor is advanced past the bytes that were read, in the same way that
 * stream_getc and stream_write advance it; without this, consecutive reads
 * would return the same data again.
 * at the end of the string, *nr_read is 0. always returns B_SUCCESS. */
static enum b_status stream_read(
	struct b_stream *stream, unsigned char *buf, size_t count, size_t *nr_read)
{
	struct b_string *str = stream->s_ptr;
	if (stream->s_cursor >= str->s_len) {
		*nr_read = 0;
		return B_SUCCESS;
	}

	size_t available = str->s_len - stream->s_cursor;
	size_t to_read = b_min(size_t, count, available);

	char *s = string_ptr(str) + stream->s_cursor;

	memcpy(buf, s, to_read);
	stream->s_cursor += to_read;

	*nr_read = to_read;

	return B_SUCCESS;
}

/* stream write callback: inserts count bytes from buf into the string at the
 * cursor position and advances the cursor by count.
 *
 * on success, *nr_written is set to count. if the string cannot be grown or
 * the insertion fails, the error is returned and neither the cursor nor
 * *nr_written is modified. */
static enum b_status stream_write(
	struct b_stream *stream, const unsigned char *buf, size_t count,
	size_t *nr_written)
{
	struct b_string *str = stream->s_ptr;
	enum b_status status = B_SUCCESS;

	if (stream->s_cursor + count > str->s_max) {
		status = b_string_reserve(str, stream->s_cursor + count);
	}

	if (!B_OK(status)) {
		return status;
	}

	status = string_insert_cstr(
		str, (const char *)buf, count, stream->s_cursor);
	if (!B_OK(status)) {
		/* nothing was written, so do not report progress */
		return status;
	}

	stream->s_cursor += count;

	*nr_written = count;

	return B_SUCCESS;
}

/* stream seek callback: moves the cursor to offset, interpreted relative to
 * the start of the string, the current cursor, or the end of the string
 * depending on origin.
 *
 * returns B_ERR_INVALID_ARGUMENT, leaving the cursor unchanged, if origin is
 * unknown or if the resulting position would lie before the start of the
 * string. positions past the end of the string are accepted. */
static enum b_status stream_seek(
	struct b_stream *stream, long long offset, b_stream_seek_origin origin)
{
	struct b_string *str = stream->s_ptr;

	size_t base;
	switch (origin) {
	case B_STREAM_SEEK_START:
		base = 0;
		break;
	case B_STREAM_SEEK_CURRENT:
		base = stream->s_cursor;
		break;
	case B_STREAM_SEEK_END:
		base = str->s_len;
		break;
	default:
		return B_ERR_INVALID_ARGUMENT;
	}

	size_t abs_offset;
	if (offset < 0) {
		/* magnitude of offset, computed without overflowing for the
		 * most negative value */
		unsigned long long distance = 0ULL - (unsigned long long)offset;
		if (distance > base) {
			/* would wrap around to a huge unsigned position */
			return B_ERR_INVALID_ARGUMENT;
		}

		abs_offset = base - (size_t)distance;
	} else {
		abs_offset = base + (size_t)offset;
	}

	stream->s_cursor = abs_offset;

	return B_SUCCESS;
}

/* stream reserve callback: makes room for len more bytes on top of the
 * current length of the string. */
static enum b_status stream_reserve(struct b_stream *stream, size_t len)
{
	struct b_string *backing = stream->s_ptr;

	return b_string_reserve(backing, backing->s_len + len);
}

/* creates a readable and writable stream on top of str and stores it in
 * *out. the stream holds a reference to str, which stream_close releases.
 * returns B_ERR_NO_MEMORY, leaving *out untouched, if the stream cannot be
 * allocated. */
enum b_status b_string_open_stream(struct b_string *str, struct b_stream **out)
{
	/* calloc hands back zero-filled memory, so every field that is not
	 * assigned below starts out as 0 */
	struct b_stream *created = calloc(1, sizeof *created);
	if (created == NULL) {
		return B_ERR_NO_MEMORY;
	}

	created->s_mode |= B_STREAM_READ | B_STREAM_WRITE;
	created->s_ptr = b_string_retain(str);

	created->s_close = stream_close;
	created->s_getc = stream_getc;
	created->s_read = stream_read;
	created->s_write = stream_write;
	created->s_seek = stream_seek;
	created->s_reserve = stream_reserve;

	*out = created;

	return B_SUCCESS;
}

/* generic iterator callback: forwards to b_string_iterator_next. */
static bool string_iterator_next(struct b_iterator *it)
{
	struct b_string_iterator *string_it = (struct b_string_iterator *)it;
	return b_string_iterator_next(string_it);
}

/* generic iterator callback: forwards to b_string_iterator_is_valid. */
static bool string_iterator_is_valid(const struct b_iterator *it)
{
	struct b_string_iterator *string_it = (struct b_string_iterator *)it;
	return b_string_iterator_is_valid(string_it);
}

/* iterator operations installed in the _base.it_ops field of every string
 * iterator set up by this file. no close callback is provided. */
static struct b_iterator_ops it_ops = {
	.it_next = string_iterator_next,
	.it_close = NULL,
	.it_is_valid = string_iterator_is_valid,
};

/* releases the scratch string of it, if it has one, and resets every field
 * of the iterator to zero. */
static void iterator_cleanup(b_string_iterator *it)
{
	struct b_string *scratch = it->_tmp;
	if (scratch != NULL) {
		b_string_release(scratch);
	}

	memset(it, 0x0, sizeof *it);
}

/* initialises it as a codepoint iterator positioned on the first codepoint
 * of string.
 *
 * returns 0 on success. returns -1 with it->status set to B_ERR_NO_DATA if
 * the string is empty, or to B_ERR_BAD_FORMAT if the first codepoint cannot
 * be decoded. */
int b_string_iterator_begin(const struct b_string *string, b_string_iterator *it)
{
	memset(it, 0x0, sizeof *it);
	it->_base.it_ops = &it_ops;

	if (string->s_len == 0) {
		it->status = B_ERR_NO_DATA;
		return -1;
	}

	it->_s = B_STRING(string);
	it->_m = ITERATOR_MODE_CHARS;

	const char *first = b_string_ptr(string);
	it->char_value = utf8_codepoint_decode(first);

	if (it->char_value != B_WCHAR_INVALID) {
		return 0;
	}

	it->status = B_ERR_BAD_FORMAT;
	return -1;
}

/*
 * Advance a character-mode iterator to the next UTF-8 codepoint.
 *
 * Returns true when the iterator now holds a freshly decoded codepoint.
 * Returns false when the iterator was already invalid, when the current
 * codepoint has no valid encoded size, when the end of the string has been
 * reached (status B_ERR_NO_DATA), or when the next bytes do not decode
 * (status B_ERR_BAD_FORMAT). On every failure after the initial validity
 * check the iterator is reset via iterator_cleanup().
 */
static bool chars_iterator_next(b_string_iterator *it)
{
	if (!b_string_iterator_is_valid(it)) {
		return false;
	}

	/* Step over the bytes that make up the current codepoint. */
	const size_t width = utf8_codepoint_size(it->char_value);
	if (width == 0) {
		iterator_cleanup(it);
		return false;
	}

	it->byte_index += width;
	it->codepoint_index += 1;

	const bool exhausted = it->byte_index >= it->_s->s_len;
	if (!exhausted) {
		it->char_value = utf8_codepoint_decode(
			string_ptr(it->_s) + it->byte_index);
		if (it->char_value != B_WCHAR_INVALID) {
			it->iteration_index++;
			return true;
		}
	}

	/* Shared failure tail: only the reported status differs. */
	iterator_cleanup(it);
	it->_s = NULL;
	it->byte_index = 0;
	it->codepoint_index = 0;
	it->char_value = B_WCHAR_INVALID;
	it->status = exhausted ? B_ERR_NO_DATA : B_ERR_BAD_FORMAT;
	return false;
}

/*
 * Advance a token-mode iterator to the next token.
 *
 * Returns true when find_next_token() succeeds; string_value then points at
 * the contents of the iterator's temporary string. Returns false if the
 * iterator was already invalid, or if no further token could be produced,
 * in which case the iterator is reset via iterator_cleanup().
 */
static bool tokens_iterator_next(b_string_iterator *it)
{
	if (!b_string_iterator_is_valid(it)) {
		return false;
	}

	const enum b_status rc = find_next_token(it);
	if (B_OK(rc)) {
		it->string_value = string_ptr(it->_tmp);
		it->iteration_index++;
		return true;
	}

	iterator_cleanup(it);
	return false;
}

/*
 * Advance a string iterator according to the mode it was started in.
 *
 * Returns the result of the mode-specific step function, or false when the
 * iterator is in neither character nor token mode.
 */
bool b_string_iterator_next(b_string_iterator *it)
{
	if (it->_m == ITERATOR_MODE_CHARS) {
		return chars_iterator_next(it);
	}

	if (it->_m == ITERATOR_MODE_TOKENS) {
		return tokens_iterator_next(it);
	}

	return false;
}

/*
 * A character-mode iterator is valid when it is attached to a string, its
 * byte offset lies inside that string, and it holds a decoded codepoint.
 */
static bool chars_iterator_is_valid(const struct b_string_iterator *it)
{
	return it->_s != NULL
	       && it->byte_index < it->_s->s_len
	       && it->char_value != B_WCHAR_INVALID;
}

/*
 * A token-mode iterator is valid when it is attached to a string, its byte
 * offset lies inside that string, and it currently exposes a token.
 */
static bool tokens_iterator_is_valid(const struct b_string_iterator *it)
{
	return it->_s != NULL
	       && it->byte_index < it->_s->s_len
	       && it->string_value != NULL;
}

/*
 * Report whether a string iterator currently refers to a value.
 *
 * Delegates to the check for the iterator's mode; an iterator in neither
 * character nor token mode is never valid.
 */
bool b_string_iterator_is_valid(const struct b_string_iterator *it)
{
	if (it->_m == ITERATOR_MODE_CHARS) {
		return chars_iterator_is_valid(it);
	}

	if (it->_m == ITERATOR_MODE_TOKENS) {
		return tokens_iterator_is_valid(it);
	}

	return false;
}

/*
 * Type release hook for strings: frees the character buffer unless the
 * string reports that its storage is inline.
 */
static void string_release(struct b_object *obj)
{
	struct b_string *self = B_STRING(obj);

	if (string_is_inline(self)) {
		return;
	}

	free(string_ptr(self));
}

/*
 * Type to-string hook for strings: writes the string's contents to `out`
 * one byte at a time through b_stream_write_char().
 */
static void string_to_string(const struct b_object *obj, struct b_stream *out)
{
	b_string *self = B_STRING(obj);
	const char *bytes = b_string_ptr(self);
	size_t pos = 0;

	/* The length is re-read on every pass, exactly as the bound is checked. */
	while (pos < self->s_len) {
		b_stream_write_char(out, bytes[pos]);
		pos++;
	}
}

/*
 * Duplicate a NUL-terminated string into freshly malloc'd storage.
 *
 * Returns the copy, which the caller must free(), or NULL if `s` is NULL
 * or the allocation fails.
 */
char *b_strdup(const char *s)
{
	/* strlen(NULL) is undefined; treat a missing source as "no copy". */
	if (!s) {
		return NULL;
	}

	size_t len = strlen(s);
	char *p = malloc(len + 1);
	if (!p) {
		return NULL;
	}

	/* len + 1 so the terminator is copied along with the characters. */
	memcpy(p, s, len + 1);

	return p;
}

/*
 * Count the bytes of `s`, optionally leaving out markup.
 *
 * With neither B_STRLEN_IGNORE_ESC nor B_STRLEN_IGNORE_MOD set this is
 * plain strlen().
 *
 * B_STRLEN_IGNORE_ESC: a sequence that starts with ESC ('\033') and runs up
 * to and including the first alphabetic character is not counted.
 *
 * B_STRLEN_IGNORE_MOD: a "[...]" span (through the closing ']') is not
 * counted, while "[[" counts as a single character.
 *
 * A sequence that is still open when the string ends simply stops at the
 * terminating NUL; the scan never moves past it.
 */
size_t b_strlen(const char *s, b_strlen_flags flags)
{
	if (!(flags & (B_STRLEN_IGNORE_ESC | B_STRLEN_IGNORE_MOD))) {
		return strlen(s);
	}

	size_t out = 0;
	size_t i = 0;

	while (s[i]) {
		if (s[i] == '\033' && (flags & B_STRLEN_IGNORE_ESC)) {
			/* <ctype.h> functions need an unsigned char value. */
			while (s[i] && !isalpha((unsigned char)s[i])) {
				i++;
			}

			/* Consume the final letter, but never the NUL. */
			if (s[i]) {
				i++;
			}

			continue;
		}

		if (s[i] == '[' && (flags & B_STRLEN_IGNORE_MOD)) {
			i++;
			if (s[i] == '[') {
				/* "[[" is an escaped literal bracket. */
				out++;
				i++;
				continue;
			}

			while (s[i] && s[i] != ']') {
				i++;
			}

			/* Consume the closing ']', but never the NUL. */
			if (s[i]) {
				i++;
			}

			continue;
		}

		out++;
		i++;
	}

	return out;
}

/*
 * Duplicate a zero-terminated b_wchar string into freshly allocated storage.
 *
 * Returns the copy, which the caller must free(), or NULL if `s` is NULL
 * or the allocation fails.
 */
b_wchar *b_wstrdup(const b_wchar *s)
{
	/* b_wstrlen() dereferences its argument, so reject NULL up front. */
	if (!s) {
		return NULL;
	}

	size_t len = b_wstrlen(s);

	/* calloc zero-fills, which supplies the trailing terminator. */
	b_wchar *buf = calloc(len + 1, sizeof(b_wchar));
	if (!buf) {
		return NULL;
	}

	memcpy(buf, s, len * sizeof(b_wchar));

	return buf;
}

/*
 * Return the number of b_wchar elements in `s` before the first zero
 * element.
 */
size_t b_wstrlen(const b_wchar *s)
{
	const b_wchar *end = s;

	while (*end != 0) {
		end++;
	}

	return (size_t)(end - s);
}

/*
 * Compute a 64-bit hash over the bytes of `str`.
 *
 * Starting from the offset basis, each byte is XORed into the hash and the
 * hash is then multiplied by the prime (FNV-1a ordering).
 *
 * NOTE(review): bytes are read as plain `char`, so where `char` is signed
 * any byte >= 0x80 is sign-extended before the XOR. Hash values for
 * non-ASCII content therefore depend on the platform's char signedness.
 */
uint64_t b_string_hash(const struct b_string *str)
{
#define FNV1_OFFSET_BASIS 0xcbf29ce484222325
#define FNV1_PRIME        0x100000001b3
	const char *cursor = b_string_ptr(str);
	const char *const end = cursor + str->s_len;
	uint64_t digest = FNV1_OFFSET_BASIS;

	for (; cursor != end; cursor++) {
		digest ^= *cursor;
		digest *= FNV1_PRIME;
	}

	return digest;
}

/*
 * Return the type id of the string type, which is the address of the
 * file-local string_type descriptor cast to b_object_type_id.
 */
b_object_type_id b_string_type_id(void)
{
	const b_object_type_id id = (b_object_type_id)&string_type;

	return id;
}