/*
 * File: bluelib/object/string.c
 *
 * (Listing header as extracted: 1748 lines, 35 KiB, C.)
 */

#include "string.h"
#include <blue/core/stream.h>
#include <blue/core/stringstream.h>
#include <blue/object/string.h>
#include <blue/object/type.h>
#include <ctype.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* True when x lies in 0x0000-0xD7FF or 0xE000-0x10FFFF, i.e. any value up to
 * 0x10FFFF except the 0xD800-0xDFFF gap. Note that x is evaluated up to four
 * times, so it must not have side effects. */
#define IS_VALID_UTF8_SCALAR(x) \
(((x) >= 0x0000 && (x) <= 0xD7FF) || ((x) >= 0xE000 && (x) <= 0x10FFFF))
/* Private iterator flag: set by find_next_token() when the token it just
 * produced was ended by a delimiter, cleared when it hits the terminator. */
#define STRING_TOK_F_FOUND_DELIM 0x80
/* Selects which stepping routine a b_string_iterator uses (stored in its _m
 * field). */
enum iterator_mode {
/* zero-initialised iterator: b_string_iterator_next/is_valid return false */
ITERATOR_MODE_NONE = 0,
/* set by b_string_iterator_begin(): one decoded code point per step */
ITERATOR_MODE_CHARS,
/* set by b_string_tokenise(): one delimiter-separated token per step */
ITERATOR_MODE_TOKENS,
};
/* Type callbacks, defined near the end of this file. */
static void string_release(struct b_object *obj);
static void string_to_string(const struct b_object *obj, struct b_stream *out);
/* Type descriptor passed to b_object_type_instantiate() by b_string_create();
 * its address is also what b_string_type_id() returns. */
static struct b_object_type string_type = {
.t_name = "corelib::string",
.t_flags = B_OBJECT_FUNDAMENTAL,
.t_id = B_OBJECT_TYPE_STRING,
.t_instance_size = sizeof(struct b_string),
/* frees the external buffer, if any */
.t_release = string_release,
/* writes the string's bytes to a stream */
.t_to_string = string_to_string,
};
/* Number of bytes needed to encode `c` in UTF-8 (1 to 4), or 0 when `c` is
 * rejected by IS_VALID_UTF8_SCALAR. */
static size_t utf8_codepoint_size(b_wchar c)
{
	/* largest value representable with 1, 2, 3 and 4 bytes respectively */
	static const unsigned long upper_bound[4] = {0x7F, 0x7FF, 0xFFFF, 0x10FFFF};

	if (!IS_VALID_UTF8_SCALAR(c)) {
		return 0;
	}

	/* c is known to be within 0..0x10FFFF here, so the widening is exact */
	unsigned long value = (unsigned long)c;
	for (size_t nr_bytes = 1; nr_bytes <= 4; nr_bytes++) {
		if (value <= upper_bound[nr_bytes - 1]) {
			return nr_bytes;
		}
	}

	return 0;
}
/* Extract the 6 payload bits of a UTF-8 continuation byte.
 *
 * Returns the payload (0..0x3F) when `c` has the bit pattern 10xxxxxx, and -1
 * for every other byte. */
int32_t decode_utf8_trailer_byte(char c)
{
	const unsigned char byte = (unsigned char)c;
	const bool is_continuation = (byte & 0xC0) == 0x80;

	return is_continuation ? (int32_t)(byte & 0x3F) : -1;
}
static b_wchar utf8_codepoint_decode(const char *s)
{
b_wchar result = 0;
int len = 0;
if (!(s[0] & 0x80)) {
len = 1;
result = s[0] & 0x7F;
} else if (s[0] & 0xC0 && !(s[0] & 0x20)) {
len = 2;
result = s[0] & 0x1F;
result <<= 6;
} else if (s[0] & 0xE0 && !(s[0] & 0x10)) {
len = 3;
result = s[0] & 0x0F;
result <<= 12;
} else if (s[0] & 0xF0 && !(s[0] & 0x08)) {
len = 4;
result = s[0] & 0x07;
result <<= 18;
} else {
return B_WCHAR_INVALID;
}
for (int i = 1; i < len; i++) {
int32_t c = decode_utf8_trailer_byte(s[i]);
if (c == -1) {
return B_WCHAR_INVALID;
}
c <<= 6 * (len - i - 1);
result |= c;
}
if (!IS_VALID_UTF8_SCALAR(result)) {
return B_WCHAR_INVALID;
}
return result;
}
/* Encode code point `c` as UTF-8 into `s` (room for up to 4 bytes; no
 * terminator is written).
 *
 * Returns the number of bytes stored, or 0 (with `s` untouched) when
 * utf8_codepoint_size() rejects `c`. */
static size_t utf8_codepoint_encode(b_wchar c, char s[4])
{
	/* indexed by sequence length: tag bits and payload mask of the lead byte */
	static const unsigned char lead_tag[5] = {0x00, 0x00, 0xC0, 0xE0, 0xF0};
	static const unsigned char lead_mask[5] = {0x00, 0x7F, 0x1F, 0x0F, 0x07};

	const size_t len = utf8_codepoint_size(c);
	if (len < 1 || len > 4) {
		return 0;
	}

	/* fill the continuation bytes back to front, 6 bits at a time */
	b_wchar remaining = c;
	for (size_t pos = len - 1; pos > 0; pos--) {
		s[pos] = (remaining & 0x3F) | 0x80;
		remaining >>= 6;
	}

	s[0] = (remaining & lead_mask[len]) | lead_tag[len];
	return len;
}
/* Length in bytes (1-4) of the UTF-8 sequence announced by the lead byte *s,
 * or 0 when *s cannot start a sequence.
 *
 * Only the first byte is examined; continuation bytes are not validated.
 *
 * The tag bits are compared exactly. The previous tests of the form
 * `(c & 0xC0) && !(c & 0x20)` were satisfied by any byte with bit 7 set, so
 * continuation bytes 0x80-0xB7 were reported as 2-, 3- or 4-byte lead bytes. */
static size_t codepoint_stride(const char *s)
{
	const unsigned char c = (unsigned char)*s;

	if ((c & 0x80) == 0x00) {
		return 1; /* 0xxxxxxx */
	}

	if ((c & 0xE0) == 0xC0) {
		return 2; /* 110xxxxx */
	}

	if ((c & 0xF0) == 0xE0) {
		return 3; /* 1110xxxx */
	}

	if ((c & 0xF8) == 0xF0) {
		return 4; /* 11110xxx */
	}

	/* continuation byte (10xxxxxx) or 0xF8-0xFF */
	return 0;
}
/* Count the code points in the first `len` bytes of `s`.
 *
 * Counting stops early at a NUL byte. Returns 0 when the range is empty, when
 * a byte that cannot start a sequence is met, or when the last sequence would
 * extend past `len` (truncated input). Callers therefore cannot tell "empty"
 * from "invalid" by the return value alone.
 *
 * Never reads at or beyond s + len. The previous version tested `*s` before
 * `s < end` and afterwards required s[len] to be NUL, which read past the
 * buffer for length-delimited input such as the single char passed by
 * b_string_insert_c(). */
static size_t get_number_of_codepoints(const char *s, size_t len)
{
	size_t nr_codepoints = 0;
	const char *end = s + len;

	while (s < end && *s) {
		size_t stride = codepoint_stride(s);
		if (stride == 0) {
			/* invalid codepoint */
			return 0;
		}

		if (stride > (size_t)(end - s)) {
			/* sequence is cut off by the end of the range */
			return 0;
		}

		nr_codepoints++;
		s += stride;
	}

	return nr_codepoints;
}
/* Total number of bytes needed to UTF-8 encode the first `nr_codepoints`
 * entries of `s`.
 *
 * Returns 0 if any entry is rejected by utf8_codepoint_size(); 0 is also the
 * result for an empty input. */
static size_t get_utf8_encoded_size(const b_wchar *s, size_t nr_codepoints)
{
	size_t total = 0;
	const b_wchar *cursor = s;
	const b_wchar *stop = s + nr_codepoints;

	while (cursor != stop) {
		const size_t width = utf8_codepoint_size(*cursor++);
		if (!width) {
			/* invalid codepoint */
			return 0;
		}

		total += width;
	}

	return total;
}
/* Translate a range expressed in code points into the matching byte range of
 * `str`.
 *
 * Walks the string from its start using codepoint_stride(): first over
 * `cp_start` code points, then over `cp_length` more.
 *
 * out_byte_start / out_byte_length may each be NULL; when non-NULL they
 * receive the byte offset of the range and its length in bytes.
 *
 * Returns B_ERR_OUT_OF_BOUNDS when the walk reaches str->s_len before all the
 * requested code points were seen, B_ERR_BAD_STATE when a byte that cannot
 * start a sequence is met, B_SUCCESS otherwise. Outputs are only written on
 * success. */
static enum b_status convert_codepoint_range_to_byte_range(
	const struct b_string *str, size_t cp_start, size_t cp_length,
	size_t *out_byte_start, size_t *out_byte_length)
{
	const char *base = b_string_ptr(str);
	size_t pos = 0;

	/* phase 1: skip the code points in front of the range */
	for (size_t skipped = 0; skipped != cp_start; skipped++) {
		const char *here = &base[pos];
		if (!here || pos >= str->s_len) {
			/* out of range */
			return B_ERR_OUT_OF_BOUNDS;
		}

		const size_t width = codepoint_stride(here);
		if (width == 0) {
			/* invalid codepoint */
			return B_ERR_BAD_STATE;
		}

		pos += width;
	}

	const size_t range_begin = pos;

	/* phase 2: measure the range itself */
	for (size_t taken = 0; taken != cp_length; taken++) {
		const char *here = &base[pos];
		if (!here || pos >= str->s_len) {
			/* out of range */
			return B_ERR_OUT_OF_BOUNDS;
		}

		const size_t width = codepoint_stride(here);
		if (width == 0) {
			/* invalid codepoint */
			return B_ERR_BAD_STATE;
		}

		pos += width;
	}

	if (out_byte_start != NULL) {
		*out_byte_start = range_begin;
	}

	if (out_byte_length != NULL) {
		*out_byte_length = pos - range_begin;
	}

	return B_SUCCESS;
}
/* Create an empty string object.
 *
 * The object is obtained from b_object_type_instantiate(&string_type); its
 * length and code point count are set to 0 and its capacity to
 * STRING_INLINE_CAPACITY, which is what string_is_inline() tests for.
 *
 * Returns NULL when instantiation fails. */
struct b_string *b_string_create(void)
{
	struct b_object_type *type = &string_type;
	struct b_string *fresh = (struct b_string *)b_object_type_instantiate(type);

	if (fresh != NULL) {
		fresh->s_max = STRING_INLINE_CAPACITY;
		fresh->s_codepoints = 0;
		fresh->s_len = 0;
	}

	return fresh;
}
/* Whether `str` currently keeps its bytes in the inline buffer.
 *
 * The capacity doubles as the marker: it equals STRING_INLINE_CAPACITY exactly
 * when the inline buffer is in use (string_change_capacity() never lets the
 * capacity drop below that value). */
static bool string_is_inline(const struct b_string *str)
{
	const bool uses_external_buffer = str->s_max != STRING_INLINE_CAPACITY;

	return !uses_external_buffer;
}
/* Writable pointer to the bytes of `str`: the inline buffer when
 * string_is_inline() holds, the external buffer otherwise. */
static char *string_ptr(struct b_string *str)
{
	return string_is_inline(str) ? str->s_data.d_inline
				     : str->s_data.d_external;
}
/* Pointer to the byte following the UTF-8 sequence that starts at
 * `this_codepoint`, or NULL when that byte cannot start a sequence.
 *
 * Only the lead byte is examined. `str` is not used; the parameter is kept so
 * the signature mirrors get_previous_codepoint().
 *
 * The lead-byte tag bits are compared exactly: the previous single-bit tests
 * treated continuation bytes (0x80-0xB7) as lead bytes. */
static char *get_next_codepoint(struct b_string *str, char *this_codepoint)
{
	(void)str;

	const unsigned char c = (unsigned char)*this_codepoint;
	size_t len = 0;

	if ((c & 0x80) == 0x00) {
		len = 1;
	} else if ((c & 0xE0) == 0xC0) {
		len = 2;
	} else if ((c & 0xF0) == 0xE0) {
		len = 3;
	} else if ((c & 0xF8) == 0xF0) {
		len = 4;
	} else {
		return NULL;
	}

	return this_codepoint + len;
}
/* Find the start of the code point that ends just before `this_codepoint`.
 *
 * Scans backwards from this_codepoint - 1 over continuation bytes (10xxxxxx)
 * and returns a pointer to the first byte that can start a sequence.
 *
 * Returns NULL when the start of the buffer is reached first, or when the
 * scan meets a byte that is neither a continuation byte nor a valid lead byte
 * (0xF8-0xFF). The previous version neither returned nor moved on for such a
 * byte and therefore looped forever. */
static char *get_previous_codepoint(struct b_string *str, char *this_codepoint)
{
	char *start = string_ptr(str);
	char *p = this_codepoint;

	/* decrement inside the loop so p never points before `start` */
	while (p > start) {
		p--;

		const unsigned char c = (unsigned char)*p;
		if ((c & 0xC0) == 0x80) {
			/* continuation byte, keep looking for the lead byte */
			continue;
		}

		if (c >= 0xF8) {
			/* can never appear in UTF-8 */
			return NULL;
		}

		/* ASCII byte, or a 2/3/4-byte lead byte */
		return p;
	}

	return NULL;
}
/* Pointer to the first byte of the final code point of `str`, found by
 * stepping back from one past the last byte. NULL for an empty string, or
 * whatever get_previous_codepoint() reports. */
static char *get_last_codepoint(struct b_string *str)
{
	const size_t nr_bytes = str->s_len;
	if (nr_bytes != 0) {
		char *one_past_last = string_ptr(str) + nr_bytes;
		return get_previous_codepoint(str, one_past_last);
	}

	return NULL;
}
/* Move an externally buffered string back into the inline buffer.
 *
 * Copies sizeof(d_inline) bytes from the external buffer, forces a terminator
 * into the last inline byte, clamps s_len to the inline capacity and frees
 * the external buffer. s_codepoints is left as it was. Always returns 0.
 *
 * The external pointer is saved before the copy because the copy writes into
 * s_data (presumably d_inline and d_external share storage - confirm against
 * the struct definition). */
static int string_make_inline(struct b_string *str)
{
	char *heap_buffer = string_ptr(str);
	const size_t inline_bytes = sizeof str->s_data.d_inline;

	memcpy(str->s_data.d_inline, heap_buffer, inline_bytes);
	str->s_data.d_inline[inline_bytes - 1] = '\0';

	str->s_max = STRING_INLINE_CAPACITY;
	if (!(str->s_len < str->s_max)) {
		/* contents were longer than the inline buffer: truncate */
		str->s_len = str->s_max;
	}

	free(heap_buffer);
	return 0;
}
/* Resize the external buffer of `str` to hold `capacity` bytes plus a
 * terminator.
 *
 * Returns 0 on success. On realloc() failure returns -1 and leaves the string
 * (buffer and capacity) unchanged. */
static int string_resize_large(struct b_string *str, size_t capacity)
{
	char *resized = realloc(string_ptr(str), capacity + 1);
	if (resized == NULL) {
		return -1;
	}

	str->s_data.d_external = resized;
	str->s_max = capacity;
	return 0;
}
/* Switch `str` from the inline buffer to a newly allocated external buffer of
 * `capacity` bytes plus a terminator.
 *
 * The whole inline buffer (sizeof(d_inline) bytes) is copied across and a
 * terminator is written at s_len. Returns 0 on success, -1 when malloc()
 * fails (the string is then unchanged). */
static int string_make_large(struct b_string *str, size_t capacity)
{
	const char *inline_bytes = string_ptr(str);
	char *heap_buffer = malloc(capacity + 1);

	if (heap_buffer == NULL) {
		return -1;
	}

	memcpy(heap_buffer, inline_bytes, sizeof str->s_data.d_inline);
	heap_buffer[str->s_len] = '\0';

	/* only switch over once the copy is complete */
	str->s_max = capacity;
	str->s_data.d_external = heap_buffer;
	return 0;
}
/* Set the capacity of `str` to `capacity`, switching between the inline and
 * external buffers as required.
 *
 * Requests below STRING_INLINE_CAPACITY are rounded up to it. Returns 0 on
 * success or when nothing has to change, otherwise the non-zero result of the
 * helper that failed to allocate. */
static int string_change_capacity(struct b_string *str, size_t capacity)
{
	const size_t wanted
		= capacity < STRING_INLINE_CAPACITY ? STRING_INLINE_CAPACITY
						    : capacity;
	const bool currently_inline = string_is_inline(str);
	const bool wants_inline = wanted == STRING_INLINE_CAPACITY;

	if (wanted == str->s_max) {
		/* same capacity; covers the inline -> inline case as well */
		return 0;
	}

	if (currently_inline) {
		if (wants_inline) {
			/* nothing to do */
			return 0;
		}

		/* inline -> external */
		return string_make_large(str, wanted);
	}

	if (wants_inline) {
		/* external -> inline */
		return string_make_inline(str);
	}

	/* external -> external of a different size */
	return string_resize_large(str, wanted);
}
/* Create a string holding a copy of the NUL-terminated string `s`.
 *
 * A NULL `s` yields an empty string. The code point count is taken from
 * get_number_of_codepoints(), so it is 0 when `s` is not valid UTF-8.
 *
 * Returns NULL when the object or its buffer cannot be allocated. The result
 * of b_string_reserve() is now checked: previously a failed reservation was
 * ignored and the copy overflowed the inline buffer. */
struct b_string *b_string_create_from_cstr(const char *s)
{
	struct b_string *str = b_string_create();
	if (!str) {
		return NULL;
	}

	if (!s) {
		return str;
	}

	size_t s_len = strlen(s);

	if (!B_OK(b_string_reserve(str, s_len))) {
		b_string_release(str);
		return NULL;
	}

	char *dest = string_ptr(str);
	memcpy(dest, s, s_len);
	dest[s_len] = '\0';

	str->s_len = s_len;
	str->s_codepoints = get_number_of_codepoints(s, s_len);

	return str;
}
/* Create a string consisting of `count` copies of the byte `c`.
 *
 * Length and code point count are both set to `count`, i.e. `c` is treated as
 * a one-byte character.
 *
 * Returns NULL when the object or its buffer cannot be allocated. Fixes two
 * defects: the capacity change was not checked (a failed allocation led to a
 * write past the inline buffer), and no terminator was written after the
 * fill. */
struct b_string *b_string_create_from_c(char c, size_t count)
{
	struct b_string *str = b_string_create();
	if (!str) {
		return NULL;
	}

	if (string_change_capacity(str, count) != 0) {
		b_string_release(str);
		return NULL;
	}

	char *s = string_ptr(str);
	memset(s, (unsigned char)c, count);
	s[count] = '\0';

	str->s_len = count;
	str->s_codepoints = count;

	return str;
}
/* Create an independent copy of `str`.
 *
 * Returns NULL when `str` is NULL or when allocation fails.
 *
 * Fixes: the new object used to be created before the NULL check on `str`
 * (and leaked on that path), a NULL result of b_string_create() was
 * dereferenced, the capacity change was unchecked, and the copy was not
 * NUL-terminated. */
struct b_string *b_string_duplicate(const struct b_string *str)
{
	if (!str) {
		return NULL;
	}

	struct b_string *new_str = b_string_create();
	if (!new_str) {
		return NULL;
	}

	if (string_change_capacity(new_str, str->s_len) != 0) {
		b_string_release(new_str);
		return NULL;
	}

	const char *src = b_string_ptr(str);
	char *dst = string_ptr(new_str);

	memcpy(dst, src, str->s_len);
	dst[str->s_len] = '\0';

	new_str->s_len = str->s_len;
	new_str->s_codepoints = str->s_codepoints;

	return new_str;
}
/* Take the contents of `str` as a heap-allocated, NUL-terminated C string and
 * leave `str` empty.
 *
 * For an inline string the bytes are copied into a new malloc() block; for an
 * external string the existing buffer is handed over and `str` goes back to
 * the inline buffer. The returned pointer comes from malloc()/realloc() in
 * this file, so the caller presumably releases it with free().
 *
 * Returns NULL, leaving `str` untouched, when the copy cannot be allocated
 * (the malloc() result used to be passed to memcpy() unchecked). */
char *b_string_steal(struct b_string *str)
{
	char *dest = NULL;
	char *src = string_ptr(str);

	if (string_is_inline(str)) {
		dest = malloc(str->s_len + 1);
		if (!dest) {
			return NULL;
		}

		memcpy(dest, src, str->s_len);
		dest[str->s_len] = 0;
		src[0] = 0;
	} else {
		dest = src;
		str->s_data.d_external = NULL;
		str->s_max = STRING_INLINE_CAPACITY;
	}

	str->s_len = 0;
	str->s_codepoints = 0;

	return dest;
}
/* Make sure `str` can hold at least `capacity` bytes (terminator excluded).
 *
 * Never shrinks: an already sufficient capacity is left alone. Returns
 * B_SUCCESS, or B_ERR_NO_MEMORY when the buffer cannot be grown. */
b_status b_string_reserve(struct b_string *str, size_t capacity)
{
	const bool already_big_enough = str->s_max >= capacity;

	if (already_big_enough || string_change_capacity(str, capacity) == 0) {
		return B_SUCCESS;
	}

	return B_ERR_NO_MEMORY;
}
/* Replace `length` bytes of `str`, starting at byte offset `start`, with the
 * NUL-terminated string `new_data`. Used by b_string_replace() when the string
 * has one byte per code point.
 *
 * `length` is clamped to the end of the string. Returns
 * B_ERR_INVALID_ARGUMENT when `start` is not inside the string or when a
 * non-empty `new_data` is not valid UTF-8 (same rule as replace_utf8()), and
 * the b_string_reserve() status when growing fails.
 *
 * Fixes:
 *  - `start + length` could wrap for large lengths (e.g. SIZE_MAX), skipping
 *    the clamp; the comparison is now done without the addition.
 *  - s_codepoints was never updated, so after a replacement s_len and
 *    s_codepoints disagreed and later calls took the UTF-8 code paths with a
 *    wrong count. */
static enum b_status replace_ansi(
	struct b_string *str, size_t start, size_t length, const char *new_data)
{
	if (start >= str->s_len) {
		return B_ERR_INVALID_ARGUMENT;
	}

	if (length > str->s_len - start) {
		length = str->s_len - start;
	}

	size_t new_data_len = strlen(new_data);
	size_t new_data_nr_codepoints = 0;
	if (new_data_len > 0) {
		new_data_nr_codepoints
			= get_number_of_codepoints(new_data, new_data_len);
		if (new_data_nr_codepoints == 0) {
			/* new_data is not a valid utf-8 string */
			return B_ERR_INVALID_ARGUMENT;
		}
	}

	size_t new_str_len = str->s_len - length + new_data_len;

	b_status status = b_string_reserve(str, new_str_len);
	if (!B_OK(status)) {
		return status;
	}

	char *s = string_ptr(str);

	char *substitution_start = s + start;
	char *excess_src = s + start + length;
	size_t excess_length = str->s_len - start - length;
	char *excess_dest = substitution_start + new_data_len;

	/* move the tail first, then drop the replacement into the gap */
	memmove(excess_dest, excess_src, excess_length);
	memmove(substitution_start, new_data, new_data_len);
	s[new_str_len] = '\0';

	str->s_len = new_str_len;
	/* one byte per code point in the removed range */
	str->s_codepoints = str->s_codepoints - length + new_data_nr_codepoints;

	return B_SUCCESS;
}
/* Replace `length` code points of `str`, starting at code point index
 * `start`, with the NUL-terminated UTF-8 string `new_data`.
 *
 * `length` is clamped to the end of the string. Returns
 * B_ERR_INVALID_ARGUMENT when `start` is out of range or a non-empty
 * `new_data` is not valid UTF-8; otherwise propagates errors from
 * convert_codepoint_range_to_byte_range() and b_string_reserve().
 *
 * Fixes:
 *  - `start + length` could wrap for large lengths, skipping the clamp.
 *  - An empty `new_data` (a plain deletion) was rejected because a code point
 *    count of 0 was taken to mean "invalid UTF-8". */
static enum b_status replace_utf8(
	struct b_string *str, size_t start, size_t length, const char *new_data)
{
	if (start >= str->s_codepoints) {
		return B_ERR_INVALID_ARGUMENT;
	}

	if (length > str->s_codepoints - start) {
		length = str->s_codepoints - start;
	}

	size_t new_data_nr_bytes = strlen(new_data);
	size_t new_data_nr_codepoints = 0;
	if (new_data_nr_bytes > 0) {
		new_data_nr_codepoints
			= get_number_of_codepoints(new_data, new_data_nr_bytes);
		if (new_data_nr_codepoints == 0) {
			/* new_data is not a valid utf-8 string */
			return B_ERR_INVALID_ARGUMENT;
		}
	}

	size_t old_data_offset = 0, old_data_nr_bytes = 0;
	size_t old_data_nr_codepoints = length;
	enum b_status status = convert_codepoint_range_to_byte_range(
		str, start, length, &old_data_offset, &old_data_nr_bytes);
	if (!B_OK(status)) {
		return status;
	}

	size_t new_total_bytes = str->s_len - old_data_nr_bytes + new_data_nr_bytes;

	status = b_string_reserve(str, new_total_bytes);
	if (!B_OK(status)) {
		return status;
	}

	char *s = string_ptr(str);

	char *substitution_start = s + old_data_offset;
	char *excess_src = s + old_data_offset + old_data_nr_bytes;
	size_t excess_length = str->s_len - old_data_offset - old_data_nr_bytes;
	char *excess_dest = substitution_start + new_data_nr_bytes;

	/* move the tail first, then drop the replacement into the gap */
	memmove(excess_dest, excess_src, excess_length);
	memmove(substitution_start, new_data, new_data_nr_bytes);
	s[new_total_bytes] = '\0';

	str->s_len = new_total_bytes;
	str->s_codepoints -= old_data_nr_codepoints;
	str->s_codepoints += new_data_nr_codepoints;

	return B_SUCCESS;
}
/* Replace a range of `str` with `new_data`.
 *
 * When the byte length equals the code point count the range is handled as
 * bytes by replace_ansi(); otherwise `start`/`length` count code points and
 * replace_utf8() does the work. */
b_status b_string_replace(
	struct b_string *str, size_t start, size_t length, const char *new_data)
{
	const bool one_byte_per_codepoint = str->s_len == str->s_codepoints;

	return one_byte_per_codepoint
		? replace_ansi(str, start, length, new_data)
		: replace_utf8(str, start, length, new_data);
}
/* Replace the entire contents of `str` with the NUL-terminated string
 * `new_data`.
 *
 * Returns B_SUCCESS, or the b_string_reserve() error when the buffer cannot
 * be grown (in which case `str` is unchanged).
 *
 * Fixes: the reservation result was ignored, so a failed allocation caused a
 * write past the buffer; s_codepoints kept the count of the old contents. It
 * is now recomputed as b_string_create_from_cstr() does (0 when `new_data` is
 * not valid UTF-8). */
b_status b_string_replace_all(b_string *str, const char *new_data)
{
	size_t new_len = strlen(new_data);

	b_status status = b_string_reserve(str, new_len);
	if (!B_OK(status)) {
		return status;
	}

	char *dest = string_ptr(str);
	/* memmove: new_data might point into the string's own buffer */
	memmove(dest, new_data, new_len);
	dest[new_len] = '\0';

	str->s_len = new_len;
	str->s_codepoints = get_number_of_codepoints(dest, new_len);

	return B_SUCCESS;
}
/* Remove `length` bytes of `str` starting at byte offset `start`. Used by
 * b_string_remove() when the string has one byte per code point.
 *
 * `length` is clamped to the end of the string. Returns
 * B_ERR_INVALID_ARGUMENT when `start` is not inside the string.
 *
 * Fixes:
 *  - `start + length` could wrap for large lengths, skipping the clamp.
 *  - s_codepoints was not reduced, so the string afterwards looked like a
 *    multi-byte string with a wrong code point count. */
static enum b_status remove_ansi(struct b_string *str, size_t start, size_t length)
{
	if (start >= str->s_len) {
		return B_ERR_INVALID_ARGUMENT;
	}

	if (length > str->s_len - start) {
		length = str->s_len - start;
	}

	size_t new_str_len = str->s_len - length;

	char *s = string_ptr(str);

	char *removal_start = s + start;
	char *excess_src = s + start + length;
	size_t excess_length = str->s_len - start - length;

	memmove(removal_start, excess_src, excess_length);
	s[new_str_len] = '\0';

	str->s_len = new_str_len;
	/* one byte per code point in this mode */
	str->s_codepoints -= length;

	return B_SUCCESS;
}
/* Remove `length` code points of `str` starting at code point index `start`.
 *
 * `length` is clamped to the end of the string, matching remove_ansi().
 * Previously an over-long range made the byte-range conversion fail with
 * B_ERR_OUT_OF_BOUNDS. An out-of-range `start` is still reported by
 * convert_codepoint_range_to_byte_range(), so those error codes are
 * unchanged. */
static enum b_status remove_utf8(struct b_string *str, size_t start, size_t length)
{
	if (start < str->s_codepoints && length > str->s_codepoints - start) {
		length = str->s_codepoints - start;
	}

	size_t remove_offset = 0, remove_nr_bytes = 0;
	enum b_status status = convert_codepoint_range_to_byte_range(
		str, start, length, &remove_offset, &remove_nr_bytes);
	if (!B_OK(status)) {
		return status;
	}

	size_t new_total_bytes = str->s_len - remove_nr_bytes;

	char *s = string_ptr(str);

	char *removal_start = s + remove_offset;
	char *excess_src = s + remove_offset + remove_nr_bytes;
	size_t excess_length = str->s_len - remove_offset - remove_nr_bytes;

	memmove(removal_start, excess_src, excess_length);
	s[new_total_bytes] = '\0';

	str->s_len = new_total_bytes;
	str->s_codepoints -= length;

	return B_SUCCESS;
}
/* Remove a range from `str`.
 *
 * The range is in bytes (remove_ansi) when the byte length equals the code
 * point count, and in code points (remove_utf8) otherwise. */
enum b_status b_string_remove(struct b_string *str, size_t start, size_t length)
{
	const bool multibyte = str->s_len != str->s_codepoints;

	if (multibyte) {
		return remove_utf8(str, start, length);
	}

	return remove_ansi(str, start, length);
}
/* Apply `transformer` to every byte of `str` in place.
 *
 * A result of 0 leaves the byte unchanged, so the string is never shortened.
 * Always returns B_SUCCESS.
 *
 * Each byte is passed as an unsigned char value (0..255). The signature
 * matches the <ctype.h> functions such as toupper()/tolower(), whose argument
 * must be representable as unsigned char or equal EOF; passing a plain char
 * made bytes >= 0x80 negative where char is signed. */
b_status b_string_transform(struct b_string *str, int (*transformer)(int))
{
	char *s = string_ptr(str);

	for (size_t i = 0; i < str->s_len; i++) {
		int c = transformer((unsigned char)s[i]);
		if (c != 0) {
			s[i] = (char)c;
		}
	}

	return B_SUCCESS;
}
/* Strip leading and trailing whitespace (as defined by isspace()) from a
 * string with one byte per code point. Always returns B_SUCCESS.
 *
 * Fixes:
 *  - After shifting the contents left no terminator was written unless
 *    trailing whitespace happened to be trimmed as well, leaving stale bytes
 *    after the new end.
 *  - s_codepoints was not reduced, so the string stopped looking like a
 *    one-byte-per-code-point string.
 *  - isspace() received a plain char; bytes >= 0x80 are negative where char
 *    is signed, which isspace() does not accept. */
static enum b_status trim_ansi(struct b_string *str)
{
	char *s = string_ptr(str);
	size_t first = 0;
	size_t end = str->s_len;

	while (first < end && isspace((unsigned char)s[first])) {
		first++;
	}

	while (end > first && isspace((unsigned char)s[end - 1])) {
		end--;
	}

	size_t new_len = end - first;
	size_t removed = str->s_len - new_len;

	memmove(s, s + first, new_len);
	s[new_len] = '\0';

	str->s_len = new_len;
	/* every removed byte was one code point */
	str->s_codepoints -= removed;

	return B_SUCCESS;
}
static enum b_status trim_utf8(struct b_string *str)
{
char *s = string_ptr(str);
size_t whitespace_end = 0;
size_t nr_whitespace_codepoints = 0;
for (size_t i = 0; i < str->s_len;) {
b_wchar c = utf8_codepoint_decode(&s[i]);
if (!b_wchar_is_space(s[i])) {
whitespace_end = i;
break;
}
nr_whitespace_codepoints++;
}
memmove(s, s + whitespace_end, str->s_len - whitespace_end);
str->s_len -= whitespace_end;
str->s_codepoints -= nr_whitespace_codepoints;
char *p = get_last_codepoint(str);
if (!p) {
return B_ERR_BAD_STATE;
}
for (long i = str->s_len - 1; i >= 0;) {
b_wchar c = utf8_codepoint_decode(p);
size_t c_size = utf8_codepoint_size(c);
if (b_wchar_is_space(c)) {
memset(p, 0, c_size);
str->s_len -= c_size;
str->s_codepoints--;
} else {
break;
}
p = get_previous_codepoint(str, p);
}
return B_SUCCESS;
}
/* Strip leading and trailing whitespace from `str`.
 *
 * An empty string is returned as-is. Otherwise trim_ansi() is used when the
 * byte length equals the code point count and trim_utf8() when it does
 * not. */
b_status b_string_trim(struct b_string *str)
{
	const size_t nr_bytes = str->s_len;

	if (nr_bytes == 0) {
		return B_SUCCESS;
	}

	return nr_bytes == str->s_codepoints ? trim_ansi(str) : trim_utf8(str);
}
static enum b_status string_insert_cstr_ansi(
struct b_string *dest, const char *src, size_t nr_bytes, size_t at)
{
if (at >= dest->s_len) {
at = dest->s_len;
}
size_t new_size = dest->s_len + nr_bytes;
if (dest->s_max < new_size) {
string_change_capacity(dest, new_size);
}
char *dest_buf = string_ptr(dest);
char *from = dest_buf + at;
char *to = dest_buf + at + nr_bytes;
memmove(to, from, dest->s_len - at);
memcpy(from, src, nr_bytes);
dest_buf[new_size] = '\0';
dest->s_len = new_size;
dest->s_codepoints += nr_bytes;
return B_SUCCESS;
}
static enum b_status string_insert_cstr_utf8(
struct b_string *dest, const char *src, size_t nr_bytes,
size_t codepoint_offset)
{
if (codepoint_offset >= dest->s_codepoints) {
codepoint_offset = dest->s_codepoints;
}
size_t byte_offset = 0;
enum b_status status = B_SUCCESS;
if (codepoint_offset == dest->s_codepoints) {
byte_offset = dest->s_len;
} else {
status = convert_codepoint_range_to_byte_range(
dest, 0, codepoint_offset, NULL, &byte_offset);
}
if (!B_OK(status)) {
return status;
}
size_t new_total_bytes = dest->s_len + nr_bytes;
if (dest->s_max < new_total_bytes) {
string_change_capacity(dest, new_total_bytes);
}
char *dest_buf = string_ptr(dest);
char *from = dest_buf + byte_offset;
char *to = dest_buf + byte_offset + nr_bytes;
memmove(to, from, dest->s_len - byte_offset);
memcpy(from, src, nr_bytes);
dest_buf[new_total_bytes] = '\0';
dest->s_len += nr_bytes;
dest->s_codepoints += get_number_of_codepoints(src, nr_bytes);
return B_SUCCESS;
}
static enum b_status string_insert_wstr_ansi(
struct b_string *dest, const b_wchar *src, size_t nr_codepoints, size_t at)
{
if (at >= dest->s_len) {
at = dest->s_len;
}
size_t utf8_encoded_size = get_utf8_encoded_size(src, nr_codepoints);
if (utf8_encoded_size == 0) {
return B_ERR_INVALID_ARGUMENT;
}
size_t new_total_bytes = dest->s_len + utf8_encoded_size;
if (dest->s_max < new_total_bytes) {
string_change_capacity(dest, new_total_bytes);
}
char *dest_buf = string_ptr(dest);
char *from = dest_buf + at;
char *to = dest_buf + at + utf8_encoded_size;
memmove(to, from, dest->s_len - at);
char *ptr = dest_buf + at;
for (size_t i = 0; i < nr_codepoints; i++) {
char c[4];
size_t c_len = utf8_codepoint_encode(src[i], c);
if (c_len == 0) {
/* the input string was already checked by
* get_utf8_encoded_size, so this should never happen */
return B_ERR_INVALID_ARGUMENT;
}
memcpy(ptr, c, c_len);
ptr += c_len;
}
dest_buf[new_total_bytes] = '\0';
dest->s_len += utf8_encoded_size;
dest->s_codepoints += nr_codepoints;
return B_SUCCESS;
}
static enum b_status string_insert_wstr_utf8(
struct b_string *dest, const b_wchar *src, size_t nr_codepoints,
size_t codepoint_offset)
{
if (codepoint_offset >= dest->s_codepoints) {
codepoint_offset = dest->s_codepoints;
}
size_t utf8_encoded_size = get_utf8_encoded_size(src, nr_codepoints);
if (utf8_encoded_size == 0) {
return B_ERR_INVALID_ARGUMENT;
}
size_t new_total_bytes = dest->s_len + utf8_encoded_size;
if (dest->s_max < new_total_bytes) {
string_change_capacity(dest, new_total_bytes);
}
size_t move_offset = 0;
enum b_status status = B_SUCCESS;
if (codepoint_offset == dest->s_codepoints) {
move_offset = dest->s_len;
} else {
status = convert_codepoint_range_to_byte_range(
dest, 0, codepoint_offset, NULL, &move_offset);
}
if (!B_OK(status)) {
return status;
}
char *dest_buf = string_ptr(dest);
char *from = dest_buf + move_offset;
char *to = dest_buf + move_offset + utf8_encoded_size;
memmove(to, from, dest->s_len - move_offset);
char *ptr = dest_buf + move_offset;
for (size_t i = 0; i < nr_codepoints; i++) {
char c[4];
size_t c_len = utf8_codepoint_encode(src[i], c);
if (c_len == 0) {
/* the input string was already checked by
* get_utf8_encoded_size, so this should never happen */
return B_ERR_INVALID_ARGUMENT;
}
memcpy(ptr, c, c_len);
ptr += c_len;
}
dest_buf[new_total_bytes] = '\0';
dest->s_len += utf8_encoded_size;
dest->s_codepoints += nr_codepoints;
return B_SUCCESS;
}
/* Insert `nr_bytes` bytes of `src` into `dest` at position `at`.
 *
 * `at` is a byte offset when dest has one byte per code point (byte length
 * equals code point count) and a code point index otherwise. */
static enum b_status string_insert_cstr(
	struct b_string *dest, const char *src, size_t nr_bytes, size_t at)
{
	const bool multibyte = dest->s_len != dest->s_codepoints;

	return multibyte ? string_insert_cstr_utf8(dest, src, nr_bytes, at)
			 : string_insert_cstr_ansi(dest, src, nr_bytes, at);
}
/* Insert `nr_codepoints` code points of `src` into `dest` at position `at`.
 *
 * `at` is a byte offset when dest has one byte per code point (byte length
 * equals code point count) and a code point index otherwise. */
static enum b_status string_insert_wstr(
	struct b_string *dest, const b_wchar *src, size_t nr_codepoints, size_t at)
{
	const bool multibyte = dest->s_len != dest->s_codepoints;

	return multibyte ? string_insert_wstr_utf8(dest, src, nr_codepoints, at)
			 : string_insert_wstr_ansi(dest, src, nr_codepoints, at);
}
static enum b_status string_insertf(
struct b_string *dest, size_t at, const char *format, va_list arg)
{
char buf[1024];
size_t len = vsnprintf(buf, sizeof buf, format, arg);
return string_insert_cstr(dest, buf, len, at);
}
/* Insert the single byte `c` into `dest` at position `at` (see
 * string_insert_cstr() for how `at` is interpreted). */
enum b_status b_string_insert_c(struct b_string *dest, char c, size_t at)
{
	const char *single_byte = &c;

	return string_insert_cstr(dest, single_byte, 1, at);
}
/* Insert the single code point `c` into `dest` at position `at` (see
 * string_insert_wstr() for how `at` is interpreted). */
enum b_status b_string_insert_wc(struct b_string *dest, b_wchar c, size_t at)
{
	const b_wchar *single_codepoint = &c;

	return string_insert_wstr(dest, single_codepoint, 1, at);
}
/* Insert all bytes of the string object `src` into `dest` at position
 * `at`. */
enum b_status b_string_insert_s(
	struct b_string *dest, const struct b_string *src, size_t at)
{
	const char *bytes = b_string_ptr(src);
	const size_t nr_bytes = src->s_len;

	return string_insert_cstr(dest, bytes, nr_bytes, at);
}
/* Insert the NUL-terminated string `src` (measured with strlen()) into `dest`
 * at position `at`. */
enum b_status b_string_insert_cstr(struct b_string *dest, const char *src, size_t at)
{
	const size_t nr_bytes = strlen(src);

	return string_insert_cstr(dest, src, nr_bytes, at);
}
/* Insert the zero-terminated code point array `src` (measured with
 * b_wstrlen()) into `dest` at position `at`. */
enum b_status b_string_insert_wstr(
	struct b_string *dest, const b_wchar *src, size_t at)
{
	const size_t nr_codepoints = b_wstrlen(src);

	return string_insert_wstr(dest, src, nr_codepoints, at);
}
enum b_status b_string_insert_cstrf(
struct b_string *dest, size_t at, const char *format, ...)
{
va_list arg;
va_start(arg, format);
enum b_status status = string_insertf(dest, at, format, arg);
va_end(arg);
return status;
}
/* Insert exactly `len` bytes starting at `src` into `dest` at position
 * `at`. */
enum b_status b_string_insert_cstrn(
	b_string *dest, const char *src, size_t len, size_t at)
{
	enum b_status result = string_insert_cstr(dest, src, len, at);

	return result;
}
/* Append the byte `c` to `dest`. SIZE_MAX is passed as the position, which
 * the insert routines clamp to the end of the string. */
enum b_status b_string_append_c(struct b_string *dest, char c)
{
	const size_t at_end = SIZE_MAX;

	return b_string_insert_c(dest, c, at_end);
}
/* Append the code point `c` to `dest`. SIZE_MAX is passed as the position,
 * which the insert routines clamp to the end of the string. */
enum b_status b_string_append_wc(struct b_string *dest, b_wchar c)
{
	const size_t at_end = SIZE_MAX;

	return b_string_insert_wc(dest, c, at_end);
}
/* Append the contents of the string object `src` to `dest`. */
enum b_status b_string_append_s(struct b_string *dest, const struct b_string *src)
{
	const size_t at_end = SIZE_MAX;

	return b_string_insert_s(dest, src, at_end);
}
/* Append the NUL-terminated string `src` to `dest`. */
enum b_status b_string_append_cstr(struct b_string *dest, const char *src)
{
	const size_t at_end = SIZE_MAX;

	return b_string_insert_cstr(dest, src, at_end);
}
/* Append the zero-terminated code point array `src` to `dest`. */
enum b_status b_string_append_wstr(struct b_string *dest, const b_wchar *src)
{
	const size_t at_end = SIZE_MAX;

	return b_string_insert_wstr(dest, src, at_end);
}
enum b_status b_string_append_cstrf(struct b_string *dest, const char *format, ...)
{
va_list arg;
va_start(arg, format);
enum b_status status = string_insertf(dest, SIZE_MAX, format, arg);
va_end(arg);
return status;
}
/* Insert the byte `c` at the very beginning of `dest`. */
enum b_status b_string_prepend_c(struct b_string *dest, char c)
{
	const size_t at_front = 0;

	return b_string_insert_c(dest, c, at_front);
}
/* Insert the code point `c` at the very beginning of `dest`. */
enum b_status b_string_prepend_wc(struct b_string *dest, b_wchar c)
{
	const size_t at_front = 0;

	return b_string_insert_wc(dest, c, at_front);
}
/* Insert the contents of the string object `src` at the very beginning of
 * `dest`. */
enum b_status b_string_prepend_s(struct b_string *dest, const struct b_string *src)
{
	const size_t at_front = 0;

	return b_string_insert_s(dest, src, at_front);
}
/* Insert the NUL-terminated string `src` at the very beginning of `dest`. */
enum b_status b_string_prepend_cstr(struct b_string *dest, const char *src)
{
	const size_t at_front = 0;

	return b_string_insert_cstr(dest, src, at_front);
}
/* Insert the zero-terminated code point array `src` at the very beginning of
 * `dest`. */
enum b_status b_string_prepend_wstr(struct b_string *dest, const b_wchar *src)
{
	const size_t at_front = 0;

	return b_string_insert_wstr(dest, src, at_front);
}
enum b_status b_string_prepend_cstrf(struct b_string *dest, const char *format, ...)
{
va_list arg;
va_start(arg, format);
enum b_status status = string_insertf(dest, 0, format, arg);
va_end(arg);
return status;
}
void b_string_clear(struct b_string *str)
{
if (str->s_len == 0) {
return;
}
char *s = string_ptr(str);
*s = '\0';
str->s_len = 0;
str->s_codepoints = 0;
}
/* Tentative definition so that b_string_tokenise() can take the address of
 * it_ops; the initialised definition follows further down in this file. */
static struct b_iterator_ops it_ops;
/* Test whether `s` begins with the whole of `prefix`.
 *
 * On a match returns true and stores the prefix length in bytes in
 * *prefix_len. On a mismatch returns false and leaves *prefix_len untouched.
 *
 * Fixes:
 *  - The comparison stopped as soon as either string ended, so a prefix that
 *    was longer than the rest of `s` was reported as a match on the part that
 *    happened to agree (e.g. s = "a", prefix = "ab").
 *  - An empty prefix matched with length 0. The tokeniser advances by the
 *    matched length, so a zero-length delimiter match can never make
 *    progress; an empty prefix now never matches. */
static bool has_prefix(const char *s, const char *prefix, size_t *prefix_len)
{
	size_t len = 0;

	while (prefix[len] != 0) {
		/* also covers `s` ending early: its terminator differs from
		 * the non-zero prefix byte */
		if (s[len] != prefix[len]) {
			return false;
		}

		len++;
	}

	if (len == 0) {
		return false;
	}

	*prefix_len = len;
	return true;
}
/* Test `s` against each of the `nr_prefixes` entries of `prefixes`, in array
 * order.
 *
 * Returns true at the first entry for which has_prefix() succeeds (its length
 * is then in *selected_prefix_len), false when no entry matches. */
static bool has_prefixes(
	const char *s, const char **prefixes, size_t nr_prefixes,
	size_t *selected_prefix_len)
{
	const char **candidate = prefixes;
	const char **stop = prefixes + nr_prefixes;

	while (candidate != stop) {
		if (has_prefix(s, *candidate, selected_prefix_len)) {
			return true;
		}

		candidate++;
	}

	return false;
}
static enum b_status find_next_token(struct b_string_iterator *it)
{
size_t offset = it->_ds;
size_t prefix_len = 0;
char *start = string_ptr(it->_s);
bool found_delim_last_time = (it->_f & STRING_TOK_F_FOUND_DELIM) != 0;
bool found_delim = false;
bool include_empty = (it->_f & B_STRING_TOK_F_INCLUDE_EMPTY_TOKENS);
bool found_null = false;
b_string_clear(it->_tmp);
while (1) {
char *s = start + offset;
if (*s == 0) {
it->_f &= ~STRING_TOK_F_FOUND_DELIM;
break;
}
found_delim = has_prefixes(s, it->_d, it->_nd, &prefix_len);
if (found_delim) {
if (it->_tmp->s_len == 0 && !include_empty) {
/* this token is empty, skip it */
offset += prefix_len;
found_delim = false;
continue;
}
it->_f |= STRING_TOK_F_FOUND_DELIM;
break;
}
b_wchar c = utf8_codepoint_decode(s);
if (c == B_WCHAR_INVALID) {
return B_ERR_BAD_STATE;
}
b_string_append_wc(it->_tmp, c);
offset += utf8_codepoint_size(c);
if (offset > it->_s->s_len) {
break;
}
}
bool end = !found_delim && it->_tmp->s_len == 0;
if (include_empty && found_delim_last_time) {
end = false;
}
if (end) {
it->string_value = NULL;
it->string_length = 0;
it->string_codepoints = 0;
return B_ERR_NO_DATA;
}
it->_ds = offset + prefix_len;
it->string_value = b_string_ptr(it->_tmp);
it->string_length = it->_tmp->s_len;
it->string_codepoints = it->_tmp->s_codepoints;
return B_SUCCESS;
}
/* Start splitting `str` at any of the `nr_delims` delimiter strings in
 * `delims`, and position `it` on the first token.
 *
 * `it` is zeroed first. `str` and `delims` are stored in the iterator by
 * pointer (nothing is copied or retained here), and a scratch string is
 * created to hold the current token.
 *
 * Returns B_ERR_INVALID_ARGUMENT when nr_delims is 0, B_ERR_NO_MEMORY when the
 * scratch string cannot be created, otherwise the status of the first
 * find_next_token() call. When that call fails the scratch string is released
 * again. */
enum b_status b_string_tokenise(
	struct b_string *str, const char *delims[], size_t nr_delims,
	b_string_tokenise_flags flags, struct b_string_iterator *it)
{
	memset(it, 0x0, sizeof *it);

	if (nr_delims == 0) {
		return B_ERR_INVALID_ARGUMENT;
	}

	struct b_string *scratch = b_string_create();
	if (scratch == NULL) {
		return B_ERR_NO_MEMORY;
	}

	it->_base.it_ops = &it_ops;
	it->_m = ITERATOR_MODE_TOKENS;
	it->_s = str;
	it->_d = delims;
	it->_nd = nr_delims;
	it->_f = flags;
	it->_tmp = scratch;

	const enum b_status first = find_next_token(it);
	if (B_OK(first)) {
		return first;
	}

	b_string_release(scratch);
	it->_tmp = NULL;
	return first;
}
/* Size of `str` in the unit selected by `flags`.
 *
 * B_STRLEN_NORMAL returns the stored byte length and B_STRLEN_CODEPOINTS the
 * stored code point count; any other value is measured by b_strlen() over the
 * string's bytes. */
size_t b_string_get_size(const struct b_string *str, b_strlen_flags flags)
{
	if (flags == B_STRLEN_NORMAL) {
		return str->s_len;
	}

	if (flags == B_STRLEN_CODEPOINTS) {
		return str->s_codepoints;
	}

	return b_strlen(b_string_ptr(str), flags);
}
/* Current capacity of `str` (its s_max field). */
size_t b_string_get_capacity(const struct b_string *str)
{
	const size_t capacity = str->s_max;

	return capacity;
}
/* Equality test: true when `a` and `b` have the same byte length and the same
 * bytes (or are the same object), false otherwise. Despite the name no
 * ordering is reported. */
bool b_string_compare(const struct b_string *a, const struct b_string *b)
{
	const size_t nr_bytes = a->s_len;

	if (nr_bytes != b->s_len) {
		return false;
	}

	if (a == b) {
		return true;
	}

	return memcmp(b_string_ptr(a), b_string_ptr(b), nr_bytes) == 0;
}
char b_string_front(const struct b_string *str)
{
if (str->s_len == 0) {
return 0;
}
const char *s = b_string_ptr(str);
return s[0];
}
char b_string_back(const struct b_string *str)
{
if (str->s_len == 0) {
return 0;
}
const char *s = b_string_ptr(str);
return s[str->s_len - 1];
}
/* Remove the last character of `str`; does nothing for an empty string.
 *
 * For a string with one byte per code point a single byte is removed. For a
 * multi-byte string the whole final code point is removed, i.e. its lead byte
 * together with the continuation bytes (10xxxxxx) behind it.
 *
 * Fixes: only s_len was reduced, so s_codepoints no longer matched the
 * contents, and on multi-byte strings a single byte was cut off, leaving a
 * truncated sequence at the end. */
void b_string_pop_back(struct b_string *str)
{
	if (str->s_len == 0) {
		return;
	}

	char *s = string_ptr(str);
	size_t new_len = str->s_len - 1;

	if (str->s_len != str->s_codepoints) {
		/* step back over continuation bytes to the lead byte */
		while (new_len > 0 && ((unsigned char)s[new_len] & 0xC0) == 0x80) {
			new_len--;
		}
	}

	s[new_len] = '\0';
	str->s_len = new_len;

	if (str->s_codepoints > 0) {
		str->s_codepoints--;
	}
}
/* Read-only pointer to the bytes of `str`: the inline buffer when
 * string_is_inline() holds, the external buffer otherwise. */
const char *b_string_ptr(const struct b_string *str)
{
	return string_is_inline(str) ? str->s_data.d_inline
				     : str->s_data.d_external;
}
/* Create a new string from `len` bytes of `str` starting at byte offset
 * `start`.
 *
 * `len` is clamped to the end of the string. Returns NULL when `start` lies
 * beyond the end or when allocation fails.
 *
 * Offsets are in bytes, so a range that cuts through a multi-byte sequence
 * produces a string whose code point count is 0 (that is what
 * get_number_of_codepoints() reports for invalid data).
 *
 * Fixes: `start + len` could wrap for large lengths; the results of
 * b_string_create() and b_string_reserve() were not checked; the copy was not
 * NUL-terminated; s_codepoints of the new string was left at 0. */
struct b_string *b_string_substr(const struct b_string *str, size_t start, size_t len)
{
	size_t total = b_string_get_size(str, B_STRLEN_NORMAL);

	if (start > total) {
		return NULL;
	}

	if (len > total - start) {
		len = total - start;
	}

	struct b_string *newstr = b_string_create();
	if (!newstr) {
		return NULL;
	}

	if (!B_OK(b_string_reserve(newstr, len))) {
		b_string_release(newstr);
		return NULL;
	}

	const char *src = b_string_ptr(str) + start;
	char *dest = string_ptr(newstr);

	memcpy(dest, src, len);
	dest[len] = '\0';

	newstr->s_len = len;
	newstr->s_codepoints = get_number_of_codepoints(dest, len);

	return newstr;
}
/* Stream close callback: drops the string reference that
 * b_string_open_stream() stored in s_ptr. Always returns B_SUCCESS. */
static enum b_status stream_close(struct b_stream *stream)
{
	struct b_string *backing = stream->s_ptr;

	b_string_release(backing);

	return B_SUCCESS;
}
/* Stream getc callback: store the byte at the cursor in *out and advance the
 * cursor by one.
 *
 * Returns B_ERR_NO_DATA (leaving *out and the cursor alone) when the cursor
 * is at or beyond the end of the string. The byte is stored as a plain char
 * value. */
static enum b_status stream_getc(struct b_stream *stream, int *out)
{
	struct b_string *backing = stream->s_ptr;

	if (!(stream->s_cursor < backing->s_len)) {
		return B_ERR_NO_DATA;
	}

	const char *bytes = string_ptr(backing);
	*out = bytes[stream->s_cursor++];

	return B_SUCCESS;
}
/* Stream read callback: copy up to `count` bytes starting at the cursor into
 * `buf`, report the number copied in *nr_read and advance the cursor by that
 * amount.
 *
 * At or beyond the end of the string *nr_read is 0. Always returns
 * B_SUCCESS.
 *
 * Fix: the cursor was not advanced, unlike in stream_getc() and
 * stream_write(), so consecutive reads returned the same bytes again.
 * NOTE(review): this assumes the generic stream layer does not move s_cursor
 * itself after calling s_read - confirm. */
static enum b_status stream_read(
	struct b_stream *stream, unsigned char *buf, size_t count, size_t *nr_read)
{
	struct b_string *str = stream->s_ptr;

	if (stream->s_cursor >= str->s_len) {
		*nr_read = 0;
		return B_SUCCESS;
	}

	size_t available = str->s_len - stream->s_cursor;
	size_t to_read = b_min(size_t, count, available);

	char *s = string_ptr(str) + stream->s_cursor;

	memcpy(buf, s, to_read);

	stream->s_cursor += to_read;
	*nr_read = to_read;

	return B_SUCCESS;
}
/* Stream write callback: insert `count` bytes from `buf` into the string at
 * the cursor (existing contents are shifted, not overwritten) and advance the
 * cursor by `count`.
 *
 * On success *nr_written is `count`. On failure *nr_written is 0, the cursor
 * is unchanged and the error is returned.
 *
 * Fixes: the status of the insertion was ignored, so a failed write was
 * reported as a complete one; and the up-front reservation was sized from the
 * cursor rather than from the string length, which is too small whenever the
 * cursor is not at the end. */
static enum b_status stream_write(
	struct b_stream *stream, const unsigned char *buf, size_t count,
	size_t *nr_written)
{
	struct b_string *str = stream->s_ptr;

	/* an insertion always grows the string by `count` bytes */
	enum b_status status = b_string_reserve(str, str->s_len + count);
	if (!B_OK(status)) {
		*nr_written = 0;
		return status;
	}

	status = string_insert_cstr(str, (const char *)buf, count, stream->s_cursor);
	if (!B_OK(status)) {
		*nr_written = 0;
		return status;
	}

	stream->s_cursor += count;
	*nr_written = count;

	return B_SUCCESS;
}
/* Stream seek callback: move the cursor to `offset` relative to the start of
 * the string, the current cursor or the end of the string.
 *
 * Positions beyond the end are accepted (reads there report no data, writes
 * append). Returns B_ERR_INVALID_ARGUMENT for an unknown origin or when the
 * target would lie before the start of the string; the cursor is then left
 * unchanged.
 *
 * Fix: a target before the start used to wrap around to a huge unsigned
 * cursor value. */
static enum b_status stream_seek(
	struct b_stream *stream, long long offset, b_stream_seek_origin origin)
{
	struct b_string *str = stream->s_ptr;

	size_t base;
	switch (origin) {
	case B_STREAM_SEEK_START:
		base = 0;
		break;
	case B_STREAM_SEEK_CURRENT:
		base = stream->s_cursor;
		break;
	case B_STREAM_SEEK_END:
		base = str->s_len;
		break;
	default:
		return B_ERR_INVALID_ARGUMENT;
	}

	size_t abs_offset;
	if (offset < 0) {
		/* magnitude computed in unsigned arithmetic so that the most
		 * negative value does not overflow */
		unsigned long long back = 0ULL - (unsigned long long)offset;
		if (back > base) {
			return B_ERR_INVALID_ARGUMENT;
		}

		abs_offset = base - (size_t)back;
	} else {
		abs_offset = base + (size_t)offset;
	}

	stream->s_cursor = abs_offset;
	return B_SUCCESS;
}
/* Stream reserve callback: make room for `len` bytes on top of the string's
 * current byte length. Returns the b_string_reserve() status. */
static enum b_status stream_reserve(struct b_stream *stream, size_t len)
{
	struct b_string *backing = stream->s_ptr;

	return b_string_reserve(backing, backing->s_len + len);
}
/* Create a read/write stream on top of `str` and store it in *out.
 *
 * The stream object is heap-allocated and zero-initialised, keeps the result
 * of b_string_retain(str) in s_ptr (released again by stream_close()) and is
 * wired to the stream_* callbacks in this file.
 *
 * Returns B_ERR_NO_MEMORY (leaving *out alone) when the allocation fails. */
enum b_status b_string_open_stream(struct b_string *str, struct b_stream **out)
{
	/* calloc gives the zeroed object in one step */
	struct b_stream *created = calloc(1, sizeof *created);
	if (created == NULL) {
		return B_ERR_NO_MEMORY;
	}

	created->s_mode |= B_STREAM_READ | B_STREAM_WRITE;
	created->s_ptr = b_string_retain(str);

	created->s_close = stream_close;
	created->s_getc = stream_getc;
	created->s_read = stream_read;
	created->s_write = stream_write;
	created->s_seek = stream_seek;
	created->s_reserve = stream_reserve;

	*out = created;
	return B_SUCCESS;
}
/* Generic-iterator adapter: casts the base iterator to a string iterator and
 * forwards to b_string_iterator_next(). */
static bool string_iterator_next(struct b_iterator *it)
{
	struct b_string_iterator *string_it = (struct b_string_iterator *)it;

	return b_string_iterator_next(string_it);
}
/* Generic-iterator adapter: casts the base iterator to a string iterator and
 * forwards to b_string_iterator_is_valid(). */
static bool string_iterator_is_valid(const struct b_iterator *it)
{
	struct b_string_iterator *string_it = (struct b_string_iterator *)it;

	return b_string_iterator_is_valid(string_it);
}
/* Operations table installed in every string iterator by
 * b_string_iterator_begin() and b_string_tokenise(). */
static struct b_iterator_ops it_ops = {
.it_next = string_iterator_next,
/* no close hook is provided */
.it_close = NULL,
.it_is_valid = string_iterator_is_valid,
};
/* Release the iterator's scratch string, if it has one, and zero the whole
 * iterator. After this the iterator's mode is ITERATOR_MODE_NONE, so
 * b_string_iterator_next()/is_valid() return false for it. */
static void iterator_cleanup(b_string_iterator *it)
{
	struct b_string *scratch = it->_tmp;

	if (scratch != NULL) {
		b_string_release(scratch);
	}

	memset(it, 0x0, sizeof *it);
}
/* Start iterating over the code points of `string`, positioning `it` on the
 * first one.
 *
 * `it` is zeroed first and stores `string` by pointer (no reference is taken
 * here).
 *
 * Returns 0 on success. Returns -1 with it->status set to B_ERR_NO_DATA for
 * an empty string, or to B_ERR_BAD_FORMAT when the first sequence does not
 * decode. */
int b_string_iterator_begin(const struct b_string *string, b_string_iterator *it)
{
	memset(it, 0x0, sizeof *it);
	it->_base.it_ops = &it_ops;

	if (string->s_len == 0) {
		it->status = B_ERR_NO_DATA;
		return -1;
	}

	const char *first = b_string_ptr(string);

	it->_m = ITERATOR_MODE_CHARS;
	it->_s = B_STRING(string);
	it->char_value = utf8_codepoint_decode(first);

	if (it->char_value != B_WCHAR_INVALID) {
		return 0;
	}

	it->status = B_ERR_BAD_FORMAT;
	return -1;
}
/* Advance a code point iterator by one code point.
 *
 * The step width is the encoded size of the current code point. Returns true
 * when the iterator now rests on another code point. Returns false when the
 * iterator was not valid to begin with, or when it had to be shut down: the
 * iterator is then wiped by iterator_cleanup() and, for the end of the string
 * and for undecodable bytes, its status is set to B_ERR_NO_DATA or
 * B_ERR_BAD_FORMAT respectively. */
static bool chars_iterator_next(b_string_iterator *it)
{
	enum b_status final_status;

	if (!b_string_iterator_is_valid(it)) {
		return false;
	}

	const size_t width = utf8_codepoint_size(it->char_value);
	if (width == 0) {
		iterator_cleanup(it);
		return false;
	}

	it->byte_index += width;
	it->codepoint_index += 1;

	if (it->byte_index >= it->_s->s_len) {
		final_status = B_ERR_NO_DATA;
		goto finished;
	}

	it->char_value
		= utf8_codepoint_decode(string_ptr(it->_s) + it->byte_index);
	if (it->char_value == B_WCHAR_INVALID) {
		final_status = B_ERR_BAD_FORMAT;
		goto finished;
	}

	it->iteration_index++;
	return true;

finished:
	/* wipe the iterator, then record why it stopped */
	iterator_cleanup(it);
	it->_s = NULL;
	it->byte_index = 0;
	it->codepoint_index = 0;
	it->char_value = B_WCHAR_INVALID;
	it->status = final_status;
	return false;
}
/* Advance a tokenising iterator to the next token.
 *
 * Returns true when another token is available through it->string_value.
 * Returns false when the iterator was not valid, or when find_next_token()
 * reports an error or the end of the tokens; in the latter case the iterator
 * is wiped by iterator_cleanup(). */
static bool tokens_iterator_next(b_string_iterator *it)
{
	if (!b_string_iterator_is_valid(it)) {
		return false;
	}

	if (B_OK(find_next_token(it))) {
		it->string_value = string_ptr(it->_tmp);
		it->iteration_index++;
		return true;
	}

	iterator_cleanup(it);
	return false;
}
/* Advance `it` according to its mode (code points or tokens). Returns false
 * for an iterator in any other mode, e.g. one that has been wiped. */
bool b_string_iterator_next(b_string_iterator *it)
{
	if (it->_m == ITERATOR_MODE_CHARS) {
		return chars_iterator_next(it);
	}

	if (it->_m == ITERATOR_MODE_TOKENS) {
		return tokens_iterator_next(it);
	}

	return false;
}
/* A code point iterator is valid while it has a string, its byte index is
 * inside that string, and its current value is not B_WCHAR_INVALID. */
static bool chars_iterator_is_valid(const struct b_string_iterator *it)
{
	return it->_s != NULL
		&& it->byte_index < it->_s->s_len
		&& it->char_value != B_WCHAR_INVALID;
}
/* A tokenising iterator is valid while it has a string, its byte_index field
 * is inside that string, and it currently exposes a token (string_value is
 * non-NULL). */
static bool tokens_iterator_is_valid(const struct b_string_iterator *it)
{
	return it->_s != NULL
		&& it->byte_index < it->_s->s_len
		&& it->string_value != NULL;
}
/* Whether `it` currently rests on an element, judged by the rules of its mode
 * (code points or tokens). An iterator in any other mode is never valid. */
bool b_string_iterator_is_valid(const struct b_string_iterator *it)
{
	if (it->_m == ITERATOR_MODE_CHARS) {
		return chars_iterator_is_valid(it);
	}

	if (it->_m == ITERATOR_MODE_TOKENS) {
		return tokens_iterator_is_valid(it);
	}

	return false;
}
/* Type release callback (string_type.t_release): frees the external buffer.
 * Nothing is freed for a string that uses its inline buffer. */
static void string_release(struct b_object *obj)
{
	struct b_string *self = B_STRING(obj);

	if (string_is_inline(self)) {
		return;
	}

	free(string_ptr(self));
}
/* Type to-string callback (string_type.t_to_string): writes the bytes of the
 * string to `out` one at a time with b_stream_write_char(). */
static void string_to_string(const struct b_object *obj, struct b_stream *out)
{
	b_string *self = B_STRING(obj);
	const char *cursor = b_string_ptr(self);
	const char *stop = cursor + self->s_len;

	while (cursor != stop) {
		b_stream_write_char(out, *cursor);
		cursor++;
	}
}
/* Return a malloc()-allocated copy of the NUL-terminated string `s`, or NULL
 * when the allocation fails. The caller releases the copy with free(). */
char *b_strdup(const char *s)
{
	const size_t nr_bytes = strlen(s);
	char *copy = malloc(nr_bytes + 1);

	if (copy != NULL) {
		memcpy(copy, s, nr_bytes);
		copy[nr_bytes] = '\0';
	}

	return copy;
}
/* Length of the NUL-terminated string `s`, optionally ignoring markup.
 *
 * Without B_STRLEN_IGNORE_ESC and B_STRLEN_IGNORE_MOD this is strlen(s).
 * Otherwise bytes are counted, except that:
 *  - with B_STRLEN_IGNORE_ESC, everything from an ESC byte ('\033') up to and
 *    including the next alphabetic byte is skipped;
 *  - with B_STRLEN_IGNORE_MOD, "[...]" up to and including the closing ']' is
 *    skipped, while "[[" counts as a single byte.
 *
 * Fixes:
 *  - When an escape sequence or a "[" modifier was not terminated before the
 *    end of the string (including a lone trailing "["), the inner scan
 *    stopped on the terminator and the outer loop's increment then stepped
 *    over it, continuing to read past the end of the string.
 *  - isalpha() received a plain char, which is negative for bytes >= 0x80
 *    where char is signed. */
size_t b_strlen(const char *s, b_strlen_flags flags)
{
	if (!(flags & (B_STRLEN_IGNORE_ESC | B_STRLEN_IGNORE_MOD))) {
		return strlen(s);
	}

	size_t out = 0;
	size_t i = 0;

	while (s[i]) {
		if (s[i] == '\033' && (flags & B_STRLEN_IGNORE_ESC)) {
			while (s[i] && !isalpha((unsigned char)s[i])) {
				i++;
			}

			if (s[i]) {
				/* consume the letter that ends the sequence */
				i++;
			}

			continue;
		}

		if (s[i] == '[' && (flags & B_STRLEN_IGNORE_MOD)) {
			i++;

			if (s[i] == '[') {
				/* "[[" is an escaped, visible '[' */
				out++;
				i++;
				continue;
			}

			while (s[i] && s[i] != ']') {
				i++;
			}

			if (s[i]) {
				/* consume the closing ']' */
				i++;
			}

			continue;
		}

		out++;
		i++;
	}

	return out;
}
/* Return a heap-allocated copy of the zero-terminated code point array `s`,
 * or NULL when the allocation fails. The copy is allocated with calloc(), so
 * its terminating element is already zero. */
b_wchar *b_wstrdup(const b_wchar *s)
{
	const size_t nr_codepoints = b_wstrlen(s);
	b_wchar *copy = calloc(nr_codepoints + 1, sizeof(b_wchar));

	if (copy != NULL) {
		memcpy(copy, s, nr_codepoints * sizeof(b_wchar));
	}

	return copy;
}
/* Number of elements in the code point array `s` before the first zero
 * element. */
size_t b_wstrlen(const b_wchar *s)
{
	const b_wchar *cursor = s;

	while (*cursor != 0) {
		cursor++;
	}

	return (size_t)(cursor - s);
}
/* 64-bit hash of the bytes of `str`: starting from FNV1_OFFSET_BASIS, each
 * byte is XORed into the hash, which is then multiplied by FNV1_PRIME.
 *
 * Fix: bytes were XORed in as plain char. Where char is signed, a byte >=
 * 0x80 was sign-extended and flipped the upper 56 bits of the hash as well,
 * so non-ASCII strings hashed differently depending on the platform's char
 * signedness. Each byte is now taken as an unsigned value (0..255). Hashes of
 * pure ASCII strings are unchanged. */
uint64_t b_string_hash(const struct b_string *str)
{
#define FNV1_OFFSET_BASIS 0xcbf29ce484222325
#define FNV1_PRIME 0x100000001b3

	uint64_t hash = FNV1_OFFSET_BASIS;
	const char *s = b_string_ptr(str);

	for (size_t i = 0; i < str->s_len; i++) {
		hash ^= (uint64_t)(unsigned char)s[i];
		hash *= FNV1_PRIME;
	}

	return hash;
}
b_object_type_id b_string_type_id(void)
{
return (b_object_type_id)&string_type;
}