diff --git a/ds/string.c b/ds/string.c index 47f993a..e8f5f85 100644 --- a/ds/string.c +++ b/ds/string.c @@ -59,174 +59,6 @@ static char *string_ptr(const struct b_string_p *str) return str->s_data.d_external; } -static size_t utf8_codepoint_size(b_wchar c) -{ - if (!IS_VALID_UTF8_SCALAR(c)) { - return 0; - } - - if (c <= 0x7F) { - return 1; - } - - if (c <= 0x7FF) { - return 2; - } - - if (c <= 0xFFFF) { - return 3; - } - - if (c <= 0x10FFFF) { - return 4; - } - - return 0; -} - -static int32_t decode_utf8_trailer_byte(char c) -{ - if (!(c & 0x80) || (c & 0x40)) { - return -1; - } - - return c & 0x3F; -} - -static b_wchar utf8_codepoint_decode(const char *s) -{ - b_wchar result = 0; - int len = 0; - - if (!(s[0] & 0x80)) { - len = 1; - result = s[0] & 0x7F; - } else if (s[0] & 0xC0 && !(s[0] & 0x20)) { - len = 2; - result = s[0] & 0x1F; - result <<= 6; - } else if (s[0] & 0xE0 && !(s[0] & 0x10)) { - len = 3; - result = s[0] & 0x0F; - result <<= 12; - } else if (s[0] & 0xF0 && !(s[0] & 0x08)) { - len = 4; - result = s[0] & 0x07; - result <<= 18; - } else { - return B_WCHAR_INVALID; - } - - for (int i = 1; i < len; i++) { - int32_t c = decode_utf8_trailer_byte(s[i]); - if (c == -1) { - return B_WCHAR_INVALID; - } - - c <<= 6 * (len - i - 1); - result |= c; - } - - if (!IS_VALID_UTF8_SCALAR(result)) { - return B_WCHAR_INVALID; - } - - return result; -} - -static size_t utf8_codepoint_encode(b_wchar c, char s[4]) -{ - size_t len = utf8_codepoint_size(c); - - switch (len) { - case 1: - s[0] = c & 0x7F; - break; - case 2: - s[0] = ((c >> 6) & 0x1F) | 0xC0; - s[1] = (c & 0x3F) | 0x80; - break; - case 3: - s[0] = ((c >> 12) & 0x0F) | 0xE0; - s[1] = ((c >> 6) & 0x3F) | 0x80; - s[2] = (c & 0x3F) | 0x80; - break; - case 4: - s[0] = ((c >> 18) & 0x07) | 0xF0; - s[1] = ((c >> 12) & 0x3F) | 0x80; - s[2] = ((c >> 6) & 0x3F) | 0x80; - s[3] = (c & 0x3F) | 0x80; - break; - default: - return 0; - } - - return len; -} - -static size_t codepoint_stride(const char *s) -{ - char c = *s; - - if (!(c & 0x80)) { - return 1; - } - - if ((c & 0xC0) && !(c & 0x20)) { - return 2; - } - - if ((c & 0xE0) && !(c & 0x10)) { - return 3; - } - - if ((c & 0xF0) && !(c & 0x08)) { - return 4; - } - - return 0; -} - -static size_t get_number_of_codepoints(const char *s, size_t len) -{ - size_t nr_codepoints = 0; - const char *end = s + len; - - while (*s && s < end) { - size_t stride = codepoint_stride(s); - if (stride == 0) { - /* invalid codepoint */ - return 0; - } - - nr_codepoints++; - s += stride; - } - - if (*s != 0) { - /* string is not null-terminated */ - return 0; - } - - return nr_codepoints; -} - -static size_t get_utf8_encoded_size(const b_wchar *s, size_t nr_codepoints) -{ - size_t len = 0; - for (size_t i = 0; i < nr_codepoints; i++) { - size_t l = utf8_codepoint_size(s[i]); - if (l == 0) { - /* invalid codepoint */ - return 0; - } - - len += l; - } - - return len; -} - static enum b_status convert_codepoint_range_to_byte_range( const struct b_string_p *str, size_t cp_start, size_t cp_length, size_t *out_byte_start, size_t *out_byte_length) @@ -241,7 +73,7 @@ static enum b_status convert_codepoint_range_to_byte_range( return B_ERR_OUT_OF_BOUNDS; } - size_t stride = codepoint_stride(cp); + size_t stride = b_wchar_utf8_codepoint_stride(cp); if (!stride) { /* invalid codepoint */ return B_ERR_BAD_STATE; @@ -258,7 +90,7 @@ static enum b_status convert_codepoint_range_to_byte_range( return B_ERR_OUT_OF_BOUNDS; } - size_t stride = codepoint_stride(cp); + size_t stride = b_wchar_utf8_codepoint_stride(cp); if (!stride) { /* invalid codepoint */ return B_ERR_BAD_STATE; @@ -524,7 +356,7 @@ static enum b_status replace_utf8( size_t new_data_nr_bytes = strlen(new_data); size_t new_data_nr_codepoints - = get_number_of_codepoints(new_data, new_data_nr_bytes); + = b_wchar_utf8_codepoint_count(new_data, new_data_nr_bytes); if (new_data_nr_codepoints == 0) { /* new_data is not a valid utf-8 string */ return B_ERR_INVALID_ARGUMENT; @@ -697,7 +529,7 @@ static enum b_status trim_utf8(struct b_string_p *str) size_t whitespace_end = 0; size_t nr_whitespace_codepoints = 0; for (size_t i = 0; i < str->s_len;) { - b_wchar c = utf8_codepoint_decode(&s[i]); + b_wchar c = b_wchar_utf8_codepoint_decode(&s[i]); if (!b_wchar_is_space(s[i])) { whitespace_end = i; @@ -717,8 +549,8 @@ static enum b_status trim_utf8(struct b_string_p *str) } for (long i = str->s_len - 1; i >= 0;) { - b_wchar c = utf8_codepoint_decode(p); - size_t c_size = utf8_codepoint_size(c); + b_wchar c = b_wchar_utf8_codepoint_decode(p); + size_t c_size = b_wchar_utf8_codepoint_size(c); if (b_wchar_is_space(c)) { memset(p, 0, c_size); @@ -808,7 +640,7 @@ static enum b_status string_insert_cstr_utf8( dest_buf[new_total_bytes] = '\0'; dest->s_len += nr_bytes; - dest->s_codepoints += get_number_of_codepoints(src, nr_bytes); + dest->s_codepoints += b_wchar_utf8_codepoint_count(src, nr_bytes); return B_SUCCESS; } @@ -820,7 +652,8 @@ static enum b_status string_insert_wstr_ansi( at = dest->s_len; } - size_t utf8_encoded_size = get_utf8_encoded_size(src, nr_codepoints); + size_t utf8_encoded_size + = b_wchar_utf8_string_encoded_size(src, nr_codepoints); if (utf8_encoded_size == 0) { return B_ERR_INVALID_ARGUMENT; } @@ -838,7 +671,7 @@ static enum b_status string_insert_wstr_ansi( char *ptr = dest_buf + at; for (size_t i = 0; i < nr_codepoints; i++) { char c[4]; - size_t c_len = utf8_codepoint_encode(src[i], c); + size_t c_len = b_wchar_utf8_codepoint_encode(src[i], c); if (c_len == 0) { /* the input string was already checked by * get_utf8_encoded_size, so this should never happen */ @@ -865,7 +698,8 @@ static enum b_status string_insert_wstr_utf8( codepoint_offset = dest->s_codepoints; } - size_t utf8_encoded_size = get_utf8_encoded_size(src, nr_codepoints); + size_t utf8_encoded_size + = b_wchar_utf8_string_encoded_size(src, nr_codepoints); if (utf8_encoded_size == 0) { return B_ERR_INVALID_ARGUMENT; } @@ -897,7 +731,7 @@ static enum b_status string_insert_wstr_utf8( char *ptr = dest_buf + move_offset; for (size_t i = 0; i < nr_codepoints; i++) { char c[4]; - size_t c_len = utf8_codepoint_encode(src[i], c); + size_t c_len = b_wchar_utf8_codepoint_encode(src[i], c); if (c_len == 0) { /* the input string was already checked by * get_utf8_encoded_size, so this should never happen */ @@ -1036,13 +870,13 @@ static enum b_status find_next_token(struct b_string_iterator *it) break; } - b_wchar c = utf8_codepoint_decode(s); + b_wchar c = b_wchar_utf8_codepoint_decode(s); if (c == B_WCHAR_INVALID) { return B_ERR_BAD_STATE; } b_string_append_wc(it->_tmp, c); - offset += utf8_codepoint_size(c); + offset += b_wchar_utf8_codepoint_size(c); if (offset > it->_s_p->s_len) { break; @@ -1230,7 +1064,7 @@ b_string *b_string_create_from_cstr(const char *s) struct b_string_p *p = b_object_get_private(str, B_TYPE_STRING); size_t s_len = strlen(s); - size_t s_codepoints = get_number_of_codepoints(s, s_len); + size_t s_codepoints = b_wchar_utf8_codepoint_count(s, s_len); b_string_reserve(str, s_len); char *dest = string_ptr(p); @@ -1577,7 +1411,7 @@ int b_string_iterator_begin(const b_string *string, b_string_iterator *it) const char *s = string_ptr(p); it->_m = ITERATOR_MODE_CHARS; it->_s_p = p; - it->char_value = utf8_codepoint_decode(s); + it->char_value = b_wchar_utf8_codepoint_decode(s); if (it->char_value == B_WCHAR_INVALID) { it->status = B_ERR_BAD_FORMAT; @@ -1593,7 +1427,7 @@ static bool chars_iterator_next(b_string_iterator *it) return false; } - size_t stride = utf8_codepoint_size(it->char_value); + size_t stride = b_wchar_utf8_codepoint_size(it->char_value); if (stride == 0) { iterator_cleanup(it); return false; @@ -1613,7 +1447,7 @@ static bool chars_iterator_next(b_string_iterator *it) } char *p = string_ptr(it->_s_p) + it->byte_index; - it->char_value = utf8_codepoint_decode(p); + it->char_value = b_wchar_utf8_codepoint_decode(p); if (it->char_value == B_WCHAR_INVALID) { iterator_cleanup(it); it->_s_p = NULL;