#include #include #include #include #include #include #include #include #include #include #include #include #include #include #define IS_VALID_KEY_COMPONENT(tok) \ ((tok) && ((tok)->tok_type == TOK_WORD || (tok)->tok_type == TOK_STRING)) #define ENABLE_EXTENDED_LEXING(ctx) \ do { \ ctx->ctx_flags &= ~CTX_ENABLE_LONG_SYMBOLS; \ ctx->ctx_flags |= CTX_ENABLE_NUMBERS | CTX_ENABLE_TIMESTAMPS \ | CTX_ENABLE_BOOLS | CTX_ENABLE_MULTILINE_STRING; \ } while (0) #define DISABLE_EXTENDED_LEXING(ctx) \ do { \ ctx->ctx_flags |= CTX_ENABLE_LONG_SYMBOLS; \ ctx->ctx_flags \ &= ~(CTX_ENABLE_NUMBERS | CTX_ENABLE_TIMESTAMPS \ | CTX_ENABLE_BOOLS | CTX_ENABLE_MULTILINE_STRING); \ } while (0) enum object_flags { OBJECT_HEADER_MID_DEFINED = 0x01u, OBJECT_HEADER_END_DEFINED = 0x02u, OBJECT_KV_MID_DEFINED = 0x04u, OBJECT_KV_END_DEFINED = 0x08u, }; enum token_type { TOK_NONE = 0, TOK_WORD, TOK_STRING, TOK_INT, TOK_UINT, TOK_FLOAT, TOK_BOOL, TOK_TIMESTAMP, TOK_NEWLINE, TOK_EQUAL, TOK_DOT, TOK_COMMA, TOK_LEFT_BRACKET, TOK_RIGHT_BRACKET, TOK_DOUBLE_LEFT_BRACKET, TOK_DOUBLE_RIGHT_BRACKET, TOK_LEFT_BRACE, TOK_RIGHT_BRACE, }; enum ctx_flags { CTX_EOF = 0x01u, CTX_ENABLE_NUMBERS = 0x02u, CTX_ENABLE_TIMESTAMPS = 0x04u, CTX_ENABLE_BOOLS = 0x08u, CTX_ENABLE_EXTENDED_SYMBOLS = 0x10u, CTX_ENABLE_LONG_SYMBOLS = 0x20u, CTX_ENABLE_MULTILINE_STRING = 0x40u, }; enum ctx_state { CTX_STATE_NONE = 0, CTX_STATE_IN_TABLE, CTX_STATE_IN_ARRAY, }; struct timestamp { unsigned int ts_year, ts_month, ts_day; unsigned short ts_hour, ts_min, ts_sec; unsigned int ts_msec; unsigned short ts_zone_offset_hour, ts_zone_offset_minute; unsigned char ts_zone_offset_negative; }; struct token { enum token_type tok_type; struct b_queue_entry tok_entry; b_string *tok_str; union { struct { int64_t v; bool inf, nan; } i; struct { double v; bool inf, nan; } f; bool b; // struct timestamp time; b_datetime *time; } tok_value; }; struct ctx { enum ctx_flags ctx_flags; b_stream *ctx_src; b_string *ctx_wordbuf; b_string *ctx_linebuf; b_string_iterator ctx_linebuf_ptr; enum b_status ctx_status; b_hashmap *ctx_objects_flags; b_queue ctx_tokens; }; static void ctx_set_object_flags( struct ctx *ctx, b_object *obj, enum object_flags flags) { if (!obj) { return; } b_hashmap_key key = { .key_data = obj, .key_size = sizeof(b_object *), .key_flags = B_HASHMAP_KEY_F_INTVALUE, }; const b_hashmap_value *old_value = b_hashmap_get(ctx->ctx_objects_flags, &key); enum object_flags new_flags = 0; if (old_value) { new_flags = (enum object_flags)(uintptr_t)old_value->value_data; } new_flags |= flags; b_hashmap_value value = { .value_data = (void *)new_flags, .value_size = sizeof new_flags, }; b_hashmap_put(ctx->ctx_objects_flags, &key, &value); } static void ctx_clear_object_flags( struct ctx *ctx, b_object *obj, enum object_flags mask) { if (!obj) { return; } b_hashmap_key key = { .key_data = obj, .key_size = sizeof(b_object *), .key_flags = B_HASHMAP_KEY_F_INTVALUE, }; const b_hashmap_value *old_value = b_hashmap_get(ctx->ctx_objects_flags, &key); enum object_flags new_flags = 0; if (old_value) { new_flags = (enum object_flags)(uintptr_t)old_value->value_data; } new_flags &= ~mask; b_hashmap_value value = { .value_data = (void *)new_flags, .value_size = sizeof new_flags, }; b_hashmap_put(ctx->ctx_objects_flags, &key, &value); } static enum object_flags ctx_get_object_flags(struct ctx *ctx, b_object *obj) { if (!obj) { return 0; } b_hashmap_key key = { .key_data = obj, .key_size = sizeof(b_object *), .key_flags = B_HASHMAP_KEY_F_INTVALUE, }; const b_hashmap_value *value = b_hashmap_get(ctx->ctx_objects_flags, &key); if (value) { return (enum object_flags)(uintptr_t)value->value_data; } return 0; } static enum b_status data_available(struct ctx *ctx) { size_t len = b_string_get_size(ctx->ctx_linebuf, B_STRLEN_NORMAL); if (len == 0) { return B_ERR_NO_DATA; } if (!B_OK(ctx->ctx_linebuf_ptr.status)) { return ctx->ctx_linebuf_ptr.status; } return b_string_iterator_is_valid(&ctx->ctx_linebuf_ptr) ? B_SUCCESS : B_ERR_NO_DATA; } static enum b_status refill_linebuf(struct ctx *ctx) { b_string_clear(ctx->ctx_linebuf); b_stringstream *buf = b_stringstream_create(); enum b_status status = b_stream_read_line_s(ctx->ctx_src, buf); if (!B_OK(status)) { return status; } status = b_string_replace_all_with_stringstream(ctx->ctx_linebuf, buf); if (!B_OK(status)) { return status; } b_stringstream_unref(buf); b_string_iterator_begin(ctx->ctx_linebuf, &ctx->ctx_linebuf_ptr); return B_SUCCESS; } static b_string *get_wordbuf(struct ctx *ctx) { b_string_clear(ctx->ctx_wordbuf); return ctx->ctx_wordbuf; } static bool is_valid_char(b_wchar c) { if (c <= 0) { return false; } switch (c) { case '\0': case '\b': case 0x0C: case 0x1F: case 0x7F: case 0x10: return false; default: return true; } } static b_wchar advance_char(struct ctx *ctx) { enum b_status status = data_available(ctx); if (status == B_ERR_NO_DATA) { status = refill_linebuf(ctx); } if (!B_OK(status)) { ctx->ctx_status = status; return -1; } status = data_available(ctx); if (!B_OK(status)) { ctx->ctx_status = status; return -1; } const char *s = b_string_ptr(ctx->ctx_linebuf); if (!B_OK(ctx->ctx_linebuf_ptr.status)) { ctx->ctx_status = B_ERR_BAD_FORMAT; return -1; } b_wchar c = ctx->ctx_linebuf_ptr.char_value; if (!is_valid_char(c)) { ctx->ctx_status = B_ERR_BAD_FORMAT; return -1; } b_string_iterator_next(&ctx->ctx_linebuf_ptr); return c; } static b_wchar peek_char(struct ctx *ctx) { enum b_status status = data_available(ctx); if (status == B_ERR_NO_DATA) { status = refill_linebuf(ctx); } if (!B_OK(status)) { ctx->ctx_status = status; return -1; } status = data_available(ctx); if (!B_OK(status)) { ctx->ctx_status = status; return -1; } const char *s = b_string_ptr(ctx->ctx_linebuf); if (!B_OK(ctx->ctx_linebuf_ptr.status)) { ctx->ctx_status = B_ERR_BAD_FORMAT; return -1; } b_wchar c = ctx->ctx_linebuf_ptr.char_value; if (!is_valid_char(c)) { ctx->ctx_status = B_ERR_BAD_FORMAT; return -1; } return c; } #if 0 static int peek_char(struct ctx *ctx) { b_wchar c = __peek_char(ctx); if (c != '#') { return c; } c = __peek_char(ctx); while (c != '\n' && c != -1) { __advance_char(ctx); c = __peek_char(ctx); } return c; } static int advance_char(struct ctx *ctx) { b_wchar c = __advance_char(ctx); if (c != '#') { return c; } c = __peek_char(ctx); while (c != '\n' && c != -1) { __advance_char(ctx); c = __peek_char(ctx); } return c; } #endif static struct token *enqueue_token(struct ctx *ctx, enum token_type type) { struct token *tok = malloc(sizeof *tok); if (!tok) { return NULL; } memset(tok, 0x0, sizeof *tok); tok->tok_type = type; b_queue_push_back(&ctx->ctx_tokens, &tok->tok_entry); return tok; } static void discard_token(struct ctx *ctx) { struct b_queue_entry *entry = b_queue_pop_front(&ctx->ctx_tokens); if (!entry) { return; } struct token *tok = b_unbox(struct token, entry, tok_entry); if (tok->tok_str) { free(tok->tok_str); } free(tok); } static bool try_convert_word_to_timestamp(struct ctx *ctx, b_string *token_str) { b_datetime *dt = b_datetime_parse( B_DATETIME_FORMAT_RFC3339, b_string_ptr(token_str)); if (!dt) { return false; } struct token *tok = enqueue_token(ctx, TOK_TIMESTAMP); tok->tok_str = b_string_duplicate(token_str); tok->tok_value.time = dt; return true; } #if 0 static bool try_convert_word_to_timestamp(struct ctx *ctx, b_string *token_str) { const char *s = b_string_ptr(token_str); size_t len = b_string_get_size(token_str, B_STRLEN_NORMAL); size_t i = 0, c = 0; struct timestamp ts = {0}; bool has_date = false, has_time = false; if (len >= 10 && s[4] == '-' && s[7] == '-') { has_date = true; } if (len >= 8 && s[2] == ':' && s[5] == ':') { has_time = true; } if (len >= 19 && s[4] == '-' && s[7] == '-' && (s[10] == 'T' || s[10] == ' ') && s[13] == ':' && s[16] == ':') { has_date = true; has_time = true; } if (!has_date && !has_time) { return false; } if (has_date) { for (c = 0; c < 4; c++, i++) { if (!isdigit(s[i])) { return false; } ts.ts_year *= 10; ts.ts_year += (s[i] - '0'); } if (s[i++] != '-') { return false; } for (c = 0; c < 2; c++, i++) { if (!isdigit(s[i])) { return false; } ts.ts_month *= 10; ts.ts_month += (s[i] - '0'); } if (s[i++] != '-') { return false; } for (c = 0; c < 2; c++, i++) { if (!isdigit(s[i])) { return false; } ts.ts_day *= 10; ts.ts_day += (s[i] - '0'); } } if (has_date && has_time) { if (s[i] != 'T' && s[i] != ' ') { return false; } i++; } if (has_time) { for (c = 0; c < 2; c++, i++) { if (!isdigit(s[i])) { return false; } ts.ts_hour *= 10; ts.ts_hour += (s[i] - '0'); } if (s[i++] != ':') { return false; } for (c = 0; c < 2; c++, i++) { if (!isdigit(s[i])) { return false; } ts.ts_min *= 10; ts.ts_min += (s[i] - '0'); } if (s[i++] != ':') { return false; } for (c = 0; c < 2; c++, i++) { if (!isdigit(s[i])) { return false; } ts.ts_sec *= 10; ts.ts_sec += (s[i] - '0'); } } if (s[i] == '.') { i++; for (c = 0; s[i]; c++, i++) { if (!isdigit(s[i])) { break; } ts.ts_msec *= 10; ts.ts_msec += (s[i] - '0'); } if (c == 0) { return false; } } if (s[i] == '+' || s[i] == '-') { ts.ts_zone_offset_negative = s[i] == '-'; i++; for (c = 0; c < 2; c++, i++) { if (!isdigit(s[i])) { return false; } ts.ts_zone_offset_hour *= 10; ts.ts_zone_offset_hour += (s[i] - '0'); } if (s[i++] != ':') { return false; } for (c = 0; c < 2; c++, i++) { if (!isdigit(s[i])) { return false; } ts.ts_zone_offset_minute *= 10; ts.ts_zone_offset_minute += (s[i] - '0'); } } else if (s[i] == 'Z') { i++; } if (s[i] != 0) { return false; } struct token *tok = enqueue_token(ctx, TOK_TIMESTAMP); tok->tok_str = b_string_steal(token_str); tok->tok_value.time = ts; return true; } #endif static bool is_valid_digit(b_wchar c, int base) { switch (base) { case 2: return b_wchar_is_bin_digit(c); case 8: return b_wchar_is_oct_digit(c); case 10: return b_wchar_is_number(c); case 16: return b_wchar_is_hex_digit(c); default: return false; } } static bool has_trailing_zero(const char *s) { int nr_zero = 0; for (size_t i = 0; s[i]; i++) { char c = s[i]; switch (c) { case '0': nr_zero++; break; case '.': case 'e': return false; default: return nr_zero > 0; } } return false; } static bool try_convert_word_to_number(struct ctx *ctx, b_string *token_str) { size_t len = b_string_get_size(token_str, B_STRLEN_NORMAL); b_string *str = b_string_duplicate(token_str); struct token *tok = NULL; const char *s = b_string_ptr(str); if (len == 0) { return false; } size_t validation_offset = 0; bool is_decimal = false; bool has_sign = false; int base = 10; switch (*s) { case '+': case '-': validation_offset++; has_sign = true; break; case '0': switch (*(s + 1)) { case 'b': base = 2; s += 2; break; case 'o': base = 8; s += 2; break; case 'x': base = 16; s += 2; break; case '.': case 'e': case 'E': break; case '\0': tok = enqueue_token(ctx, is_decimal ? TOK_FLOAT : TOK_INT); tok->tok_value.i.v = 0; return true; default: return false; } break; default: break; } if (!strcmp(s + validation_offset, "inf")) { tok = enqueue_token(ctx, TOK_FLOAT); tok->tok_value.f.v = (*s == '-') ? -1 : 0; tok->tok_value.f.inf = true; return true; } else if (!strcmp(s + validation_offset, "nan")) { tok = enqueue_token(ctx, TOK_FLOAT); tok->tok_value.f.v = (*s == '-') ? -1 : 0; tok->tok_value.f.nan = true; return true; } char previous = 0; for (size_t i = validation_offset; s[i]; i++) { char c = s[i]; if (c == '_') { if (!is_valid_digit(previous, base)) { return false; } size_t to_remove = (s - b_string_ptr(str)) + i; b_string_remove(str, to_remove, 1); i--; previous = c; continue; } if (c == '.') { if (base != 10) { return false; } if (is_decimal) { return false; } if (!is_valid_digit(previous, base)) { return false; } is_decimal = true; previous = c; continue; } if (c == 'e' || c == 'E') { if (base == 16) { previous = c; continue; } if (base != 10) { return false; } if (!is_valid_digit(previous, base)) { return false; } is_decimal = true; previous = c; continue; } if (c == '+' || c == '-') { if (base != 10) { return false; } if (previous != 'e' && previous != 'E') { return false; } previous = c; continue; } if (!is_valid_digit(c, base)) { return false; } switch (previous) { case 0: case 'e': case 'E': case '_': case '+': case '-': case '.': break; default: if (!is_valid_digit(previous, base)) { return false; } break; } previous = c; } switch (previous) { case 'e': case 'E': if (base == 16) { break; } case '.': case '_': case 0: return false; default: break; } if (has_trailing_zero(s + validation_offset) && base == 10) { return false; } long long i; double d; bool is_valid; if (is_decimal) { int r = 0; int len = strlen(s); // d = strtold(s + offset, &ep) * mul; int ret = sscanf(s, "%lf%n", &d, &r); is_valid = (ret == 1) && r == len; } else { char *ep; i = strtoll(s, &ep, base); is_valid = ((*ep) == 0); } if (!is_valid) { return false; } tok = enqueue_token(ctx, is_decimal ? TOK_FLOAT : TOK_INT); if (is_decimal) { tok->tok_value.f.v = d; } else { tok->tok_value.i.v = i; } return true; } static bool try_convert_word_to_bool(struct ctx *ctx, b_string *token_str) { const char *s = b_string_ptr(token_str); struct token *tok = NULL; if (!strcmp(s, "true")) { tok = enqueue_token(ctx, TOK_BOOL); tok->tok_str = b_string_duplicate(token_str); tok->tok_value.b = true; } else if (!strcmp(s, "false")) { tok = enqueue_token(ctx, TOK_BOOL); tok->tok_str = b_string_duplicate(token_str); tok->tok_value.b = false; } else { return false; } return true; } static void split_word(struct ctx *ctx, b_string *wordbuf) { #if 0 long len = b_string_get_size(wordbuf, B_STRLEN_NORMAL); if (!len) { return; } char *s = b_string_steal(wordbuf); int trailing_dots = 0; char prev = 0; for (long i = 0; i < len; i++) { if (prev == '.' && s[i] == '.') { ctx->ctx_status = B_ERR_BAD_FORMAT; break; } prev = s[i]; } if (!B_OK(ctx->ctx_status)) { free(s); return; } for (; len > 0; len--) { if (s[len - 1] == '.') { trailing_dots++; } else { break; } } char *ep; char *tok = strtok_r(s, ".", &ep); unsigned int i = 0; while (tok) { if (*tok == 0) { ctx->ctx_status = B_ERR_BAD_FORMAT; break; } if (i > 0) { enqueue_token(ctx, TOK_DOT); } struct token *word = enqueue_token(ctx, TOK_WORD); word->tok_str = b_strdup(tok); i++; tok = strtok_r(NULL, ".", &ep); } for (long i = 0; i < trailing_dots; i++) { enqueue_token(ctx, TOK_DOT); } free(s); #endif const char *delims[] = {"."}; size_t nr_delims = sizeof delims / sizeof delims[0]; b_string_iterator it; b_string_tokenise( wordbuf, delims, nr_delims, B_STRING_TOK_F_INCLUDE_EMPTY_TOKENS, &it); while (b_string_iterator_is_valid(&it)) { if (it.iteration_index > 0) { enqueue_token(ctx, TOK_DOT); } if (it.string_length > 0) { struct token *word = enqueue_token(ctx, TOK_WORD); word->tok_str = b_string_create_from_cstr(it.string_value); } b_string_iterator_next(&it); } } static void read_number(struct ctx *ctx) { b_wchar c = 0; b_string *wordbuf = get_wordbuf(ctx); while (1) { c = peek_char(ctx); if (c == -1 || !B_OK(ctx->ctx_status)) { break; } bool ok = b_wchar_is_alnum(c) || c == '_' || c == '-' || c == '.' || c == '+'; if (!ok) { break; } b_string_append_wc(wordbuf, c); advance_char(ctx); } bool is_number = try_convert_word_to_number(ctx, wordbuf); if (!is_number) { ctx->ctx_status = B_ERR_BAD_FORMAT; } } static void read_word(struct ctx *ctx) { b_wchar c = 0; b_string *wordbuf = get_wordbuf(ctx); while (1) { c = peek_char(ctx); if (c == -1 || !B_OK(ctx->ctx_status)) { break; } bool ok = b_wchar_is_alnum(c) || c == '_' || c == '-' || c == '.'; if (ctx->ctx_flags & CTX_ENABLE_TIMESTAMPS) { ok = ok || c == ':' || c == ' ' || c == '+'; } if (ctx->ctx_flags & CTX_ENABLE_NUMBERS) { ok = ok || c == '+'; } if (!ok) { break; } b_string_append_wc(wordbuf, c); advance_char(ctx); } bool parsed = false; b_string_trim(wordbuf); if (b_string_get_size(wordbuf, B_STRLEN_NORMAL) == 0) { ctx->ctx_status = B_ERR_BAD_FORMAT; return; } if (ctx->ctx_flags & CTX_ENABLE_BOOLS) { parsed = try_convert_word_to_bool(ctx, wordbuf); } if (!parsed && (ctx->ctx_flags & CTX_ENABLE_TIMESTAMPS)) { parsed = try_convert_word_to_timestamp(ctx, wordbuf); } if (!parsed && (ctx->ctx_flags & CTX_ENABLE_NUMBERS)) { parsed = try_convert_word_to_number(ctx, wordbuf); } if (parsed) { return; } b_string_iterator it; b_string_foreach(&it, wordbuf) { /* only allow ASCII numbers/letters here */ bool ok = isalnum(it.char_value) || it.char_value == '_' || it.char_value == '-' || it.char_value == '.'; if (!ok) { ctx->ctx_status = B_ERR_BAD_FORMAT; return; } } split_word(ctx, wordbuf); } static b_wchar read_unicode_sequence(struct ctx *ctx) { b_wchar c = peek_char(ctx); int expected_len = 0; switch (c) { case 'u': expected_len = 4; break; case 'U': expected_len = 8; break; default: return B_WCHAR_INVALID; } advance_char(ctx); char s[9] = {0}; int len = 0; while (1) { if (len >= expected_len) { break; } b_wchar c = peek_char(ctx); if (c == -1 || c == B_WCHAR_INVALID) { break; } if (!b_wchar_is_hex_digit(c)) { break; } s[len++] = (char)c; s[len] = 0; advance_char(ctx); } if (len != expected_len) { return B_WCHAR_INVALID; } char *ep; c = strtoul(s, &ep, 16); if (*ep != 0) { return B_WCHAR_INVALID; } return c; } static void read_string(struct ctx *ctx, bool squote) { advance_char(ctx); char term = '"'; if (squote) { term = '\''; } bool multiline = false; struct token *tok = enqueue_token(ctx, TOK_STRING); b_string *str = get_wordbuf(ctx); b_wchar c = peek_char(ctx); if (c == term) { advance_char(ctx); c = peek_char(ctx); if (c == term) { advance_char(ctx); c = peek_char(ctx); multiline = true; } else { tok->tok_str = b_string_duplicate(str); return; } if (c == '\n') { advance_char(ctx); } } if (multiline && !(ctx->ctx_flags & CTX_ENABLE_MULTILINE_STRING)) { ctx->ctx_status = B_ERR_BAD_FORMAT; return; } bool fail = false; bool esc = false; bool cr = false; tok->tok_type = TOK_STRING; while (!fail) { c = peek_char(ctx); if (c == -1) { ctx->ctx_status = B_ERR_BAD_FORMAT; fail = true; break; } if (c == '\r') { if (!cr) { advance_char(ctx); cr = true; continue; } else { ctx->ctx_status = B_ERR_BAD_FORMAT; fail = true; break; } } if (esc) { if (c == '\n') { while (c != -1 && isspace(c)) { advance_char(ctx); c = peek_char(ctx); } cr = false; esc = false; continue; } if (isspace(c)) { while (c != -1 && isspace(c) && c != '\n') { advance_char(ctx); c = peek_char(ctx); } if (c != '\n') { ctx->ctx_status = B_ERR_BAD_FORMAT; fail = true; break; } while (c != -1 && isspace(c)) { advance_char(ctx); c = peek_char(ctx); } cr = false; esc = false; continue; } switch (c) { case '"': case '\\': b_string_append_wc(str, c); advance_char(ctx); break; case 'b': b_string_append_c(str, '\b'); advance_char(ctx); break; case 't': b_string_append_c(str, '\t'); advance_char(ctx); break; case 'n': b_string_append_c(str, '\n'); advance_char(ctx); break; case 'r': b_string_append_c(str, '\r'); advance_char(ctx); break; case 'f': b_string_append_c(str, '\f'); advance_char(ctx); break; case 'u': case 'U': c = read_unicode_sequence(ctx); if (c == B_WCHAR_INVALID) { ctx->ctx_status = B_ERR_BAD_FORMAT; fail = true; break; } ctx->ctx_status = B_OK(b_string_append_wc(str, c)) ? B_SUCCESS : B_ERR_BAD_FORMAT; fail = !B_OK(ctx->ctx_status); break; default: ctx->ctx_status = B_ERR_BAD_FORMAT; fail = true; break; } esc = false; continue; } else if (c == '\\' && !squote) { esc = true; } else if (c == '\n') { if (!multiline) { fail = true; ctx->ctx_status = B_ERR_BAD_FORMAT; break; } if (cr) { b_string_append_wc(str, '\r'); cr = false; } b_string_append_wc(str, c); } else if (c == term) { advance_char(ctx); if (!multiline) { break; } c = peek_char(ctx); if (c != term) { b_string_append_wc(str, term); continue; } advance_char(ctx); c = peek_char(ctx); if (c != term) { b_string_append_wc(str, term); b_string_append_wc(str, term); continue; } advance_char(ctx); c = peek_char(ctx); break; } else { b_string_append_wc(str, c); } advance_char(ctx); } if (cr) { fail = true; } if (fail) { discard_token(ctx); return; } if (!multiline) { goto done; } c = peek_char(ctx); if (c == term) { b_string_append_wc(str, c); advance_char(ctx); } c = peek_char(ctx); if (c == term) { b_string_append_wc(str, c); advance_char(ctx); } done: tok->tok_str = b_string_duplicate(str); } static void read_symbol(struct ctx *ctx) { b_wchar c = peek_char(ctx); advance_char(ctx); struct token *tok = enqueue_token(ctx, TOK_NONE); char s[] = {c, 0}; switch (c) { case '=': tok->tok_type = TOK_EQUAL; break; case '.': tok->tok_type = TOK_DOT; break; case ',': tok->tok_type = TOK_COMMA; break; case '[': if (!(ctx->ctx_flags & CTX_ENABLE_LONG_SYMBOLS)) { tok->tok_type = TOK_LEFT_BRACKET; break; } c = peek_char(ctx); switch (c) { case '[': tok->tok_type = TOK_DOUBLE_LEFT_BRACKET; advance_char(ctx); break; default: tok->tok_type = TOK_LEFT_BRACKET; break; } break; case ']': if (!(ctx->ctx_flags & CTX_ENABLE_LONG_SYMBOLS)) { /* if we're parsing more complex values, don't generate double-symbol tokens */ tok->tok_type = TOK_RIGHT_BRACKET; break; } c = peek_char(ctx); switch (c) { case ']': tok->tok_type = TOK_DOUBLE_RIGHT_BRACKET; advance_char(ctx); break; default: tok->tok_type = TOK_RIGHT_BRACKET; break; } break; case '{': tok->tok_type = TOK_LEFT_BRACE; break; case '}': tok->tok_type = TOK_RIGHT_BRACE; break; default: discard_token(ctx); ctx->ctx_status = B_ERR_BAD_FORMAT; break; } } static void read_newline(struct ctx *ctx) { b_wchar c = peek_char(ctx); while (c == '\n') { advance_char(ctx); c = peek_char(ctx); } enqueue_token(ctx, TOK_NEWLINE); ctx->ctx_status = B_SUCCESS; } static void read_comment(struct ctx *ctx) { b_wchar c = peek_char(ctx); bool cr = false; while (1) { if (c == '\n') { cr = false; break; } if (c == -1) { break; } if (cr) { ctx->ctx_status = B_ERR_BAD_FORMAT; break; } if (c == '\r') { cr = true; } advance_char(ctx); c = peek_char(ctx); } if (cr) { ctx->ctx_status = B_ERR_BAD_FORMAT; } if (!B_OK(ctx->ctx_status)) { return; } advance_char(ctx); enqueue_token(ctx, TOK_NEWLINE); } static bool is_symbol(b_wchar c) { switch (c) { case '=': case '.': case ',': case '[': case ']': case '{': case '}': return true; default: return false; } } static enum b_status advance_token(struct ctx *ctx) { b_wchar c = B_WCHAR_INVALID; discard_token(ctx); if (!b_queue_empty(&ctx->ctx_tokens)) { return B_SUCCESS; } start: c = peek_char(ctx); while (isspace(c) && c != '\n' && c != '\r') { advance_char(ctx); c = peek_char(ctx); } if (c == -1) { ctx->ctx_flags |= CTX_EOF; return B_ERR_NO_DATA; } #if 1 if (c == '#') { read_comment(ctx); goto start; } #endif if (!B_OK(ctx->ctx_status)) { return ctx->ctx_status; } if (c == '\r') { advance_char(ctx); c = peek_char(ctx); if (c != '\n') { ctx->ctx_status = B_ERR_BAD_FORMAT; return ctx->ctx_status; } } if (c == '"') { read_string(ctx, false); } else if (c == '\'') { read_string(ctx, true); } else if ((c == '+' || c == '-') && ctx->ctx_flags & CTX_ENABLE_NUMBERS) { read_number(ctx); } else if (is_symbol(c)) { read_symbol(ctx); } else if (c == '\n') { read_newline(ctx); } else { read_word(ctx); } return ctx->ctx_status; } static struct token *peek_token(struct ctx *ctx) { struct b_queue_entry *entry = b_queue_first(&ctx->ctx_tokens); if (!entry) { return NULL; } return b_unbox(struct token, entry, tok_entry); } static void ctx_cleanup(struct ctx *ctx) { if (ctx->ctx_linebuf) { b_string_unref(ctx->ctx_linebuf); ctx->ctx_linebuf = NULL; } if (ctx->ctx_wordbuf) { b_string_unref(ctx->ctx_wordbuf); ctx->ctx_wordbuf = NULL; } if (ctx->ctx_objects_flags) { b_hashmap_unref(ctx->ctx_objects_flags); ctx->ctx_objects_flags = NULL; } } static enum b_status ctx_init(struct ctx *ctx) { memset(ctx, 0x0, sizeof *ctx); ctx->ctx_linebuf = b_string_create(); ctx->ctx_wordbuf = b_string_create(); ctx->ctx_objects_flags = b_hashmap_create(NULL, NULL); return B_SUCCESS; } static enum b_status toml_serialise( b_serial_ctx *serial, b_object *src, b_stream *dest, enum b_serial_flags flags) { return B_ERR_NOT_SUPPORTED; } static void print_token(struct token *tok) { switch (tok->tok_type) { case TOK_NONE: printf("TOK_NONE\n"); break; case TOK_WORD: printf("TOK_WORD %s\n", b_string_ptr(tok->tok_str)); break; case TOK_STRING: printf("TOK_STRING %s\n", b_string_ptr(tok->tok_str)); break; case TOK_TIMESTAMP: printf("TOK_TIMESTAMP %04ld-%02ld-%02ld " "%02ld:%02ld:%02ld.%04ld %c" "%02ld:%02ld\n", b_datetime_year(tok->tok_value.time), b_datetime_month(tok->tok_value.time), b_datetime_day(tok->tok_value.time), b_datetime_hour(tok->tok_value.time), b_datetime_minute(tok->tok_value.time), b_datetime_second(tok->tok_value.time), b_datetime_subsecond(tok->tok_value.time), b_datetime_zone_offset_is_negative(tok->tok_value.time) ? '-' : '+', b_datetime_zone_offset_hour(tok->tok_value.time), b_datetime_zone_offset_minute(tok->tok_value.time)); break; case TOK_INT: printf("TOK_INT "); if (tok->tok_value.i.nan) { printf("NaN"); } else { printf("%lld", tok->tok_value.i.v); } printf("\n"); break; case TOK_FLOAT: printf("TOK_FLOAT "); if (tok->tok_value.f.nan) { printf("NaN"); } else { printf("%lf", tok->tok_value.f.v); } printf("\n"); break; case TOK_BOOL: printf("TOK_BOOL %s\n", tok->tok_value.b ? "true" : "false"); break; case TOK_NEWLINE: printf("TOK_NEWLINE\n"); break; case TOK_EQUAL: printf("TOK_EQUAL\n"); break; case TOK_DOT: printf("TOK_DOT\n"); break; case TOK_COMMA: printf("TOK_COMMA\n"); break; case TOK_LEFT_BRACKET: printf("TOK_LEFT_BRACKET\n"); break; case TOK_RIGHT_BRACKET: printf("TOK_RIGHT_BRACKET\n"); break; case TOK_DOUBLE_LEFT_BRACKET: printf("TOK_DOUBLE_LEFT_BRACKET\n"); break; case TOK_DOUBLE_RIGHT_BRACKET: printf("TOK_DOUBLE_RIGHT_BRACKET\n"); break; case TOK_LEFT_BRACE: printf("TOK_LEFT_BRACE\n"); break; case TOK_RIGHT_BRACE: printf("TOK_RIGHT_BRACE\n"); break; default: break; } } static enum b_status parse_value(struct ctx *ctx, b_object **result); static enum b_status parse_key_value_pair(struct ctx *ctx, b_dict *container); static enum b_status parse_timestamp(struct ctx *ctx, b_object **result) { struct token *tok = peek_token(ctx); b_datetime *dt = tok->tok_value.time; tok->tok_value.time = NULL; *result = (dt); return B_SUCCESS; } static enum b_status parse_string(struct ctx *ctx, b_object **result) { struct token *tok = peek_token(ctx); b_string *str = b_string_duplicate(tok->tok_str); if (!str) { return B_ERR_NO_MEMORY; } *result = (str); return B_SUCCESS; } static enum b_status parse_int(struct ctx *ctx, b_object **result) { struct token *tok = peek_token(ctx); b_number *val = B_LONGLONG(tok->tok_value.i.v); if (!val) { return B_ERR_NO_MEMORY; } if (tok->tok_value.i.inf) { if (tok->tok_value.i.v >= 0) { b_number_set_inf_positive(val, true); } else { b_number_set_inf_negative(val, true); } } else if (tok->tok_value.i.nan) { if (tok->tok_value.i.v >= 0) { b_number_set_nan_positive(val, true); } else { b_number_set_nan_negative(val, true); } } *result = (val); return B_SUCCESS; } static enum b_status parse_float(struct ctx *ctx, b_object **result) { struct token *tok = peek_token(ctx); b_number *val = B_DOUBLE(tok->tok_value.f.v); if (!val) { return B_ERR_NO_MEMORY; } if (tok->tok_value.f.inf) { if (tok->tok_value.f.v >= 0) { b_number_set_inf_positive(val, true); } else { b_number_set_inf_negative(val, true); } } else if (tok->tok_value.f.nan) { if (tok->tok_value.f.v >= 0) { b_number_set_nan_positive(val, true); } else { b_number_set_nan_negative(val, true); } } *result = (val); return B_SUCCESS; } static enum b_status parse_bool(struct ctx *ctx, b_object **result) { struct token *tok = peek_token(ctx); b_number *val = B_INT8(tok->tok_value.b); if (!val) { return B_ERR_NO_MEMORY; } *result = (val); return B_SUCCESS; } static enum b_status parse_table_inline(struct ctx *ctx, b_object **result) { DISABLE_EXTENDED_LEXING(ctx); advance_token(ctx); b_dict *table = b_dict_create(); if (!table) { return B_ERR_NO_MEMORY; } struct token *tok = peek_token(ctx); if (tok && tok->tok_type == TOK_RIGHT_BRACE) { *result = (table); return B_SUCCESS; } bool done = false; while (!done) { b_object *value; enum b_status status = parse_key_value_pair(ctx, table); if (!B_OK(status)) { b_dict_unref(table); return status; } tok = peek_token(ctx); if (!tok) { b_dict_unref(table); return status; } switch (tok->tok_type) { case TOK_RIGHT_BRACE: done = true; break; case TOK_COMMA: advance_token(ctx); break; default: b_dict_unref(table); return B_ERR_BAD_FORMAT; } } *result = (table); return B_SUCCESS; } static void skip_newlines(struct ctx *ctx) { struct token *tok = peek_token(ctx); while (tok && tok->tok_type == TOK_NEWLINE) { advance_token(ctx); tok = peek_token(ctx); } } static enum b_status parse_array_inline(struct ctx *ctx, b_object **result) { bool done = false; ENABLE_EXTENDED_LEXING(ctx); advance_token(ctx); b_array *array = b_array_create(); if (!array) { return B_ERR_NO_MEMORY; } struct token *tok = peek_token(ctx); if (!tok) { b_array_unref(array); return B_ERR_BAD_FORMAT; } if (tok->tok_type == TOK_RIGHT_BRACKET) { done = true; } while (!done) { skip_newlines(ctx); tok = peek_token(ctx); if (!tok) { b_array_unref(array); return B_ERR_BAD_FORMAT; } if (tok->tok_type == TOK_RIGHT_BRACKET) { done = true; break; } b_object *value; enum b_status status = parse_value(ctx, &value); if (!B_OK(status)) { b_array_unref(array); return status; } b_array_append(array, B_RV(value)); ENABLE_EXTENDED_LEXING(ctx); advance_token(ctx); skip_newlines(ctx); tok = peek_token(ctx); if (tok && tok->tok_type == TOK_RIGHT_BRACKET) { done = true; break; } if (!tok || tok->tok_type != TOK_COMMA) { b_array_unref(array); return B_ERR_BAD_FORMAT; } ENABLE_EXTENDED_LEXING(ctx); advance_token(ctx); } DISABLE_EXTENDED_LEXING(ctx); *result = (array); return B_SUCCESS; } static enum b_status parse_value(struct ctx *ctx, b_object **result) { struct token *tok = peek_token(ctx); if (!tok) { return B_ERR_BAD_FORMAT; } switch (tok->tok_type) { case TOK_STRING: return parse_string(ctx, result); case TOK_INT: return parse_int(ctx, result); case TOK_FLOAT: return parse_float(ctx, result); case TOK_BOOL: return parse_bool(ctx, result); case TOK_TIMESTAMP: return parse_timestamp(ctx, result); case TOK_LEFT_BRACKET: return parse_array_inline(ctx, result); case TOK_LEFT_BRACE: return parse_table_inline(ctx, result); default: return B_ERR_BAD_FORMAT; } } static enum b_status parse_key_value_pair(struct ctx *ctx, b_dict *container) { struct token *tok = peek_token(ctx); if (!IS_VALID_KEY_COMPONENT(tok)) { return B_ERR_BAD_FORMAT; } b_string *key = b_string_duplicate(tok->tok_str); if (!key) { return B_ERR_NO_MEMORY; } advance_token(ctx); tok = peek_token(ctx); if (!tok) { return B_ERR_BAD_FORMAT; } while (tok && tok->tok_type == TOK_DOT) { b_object *sub_dict = b_dict_at_sk(container, key); if (!sub_dict) { sub_dict = (b_dict_create()); b_dict_put_sk(container, key, B_RV(sub_dict)); } else if (sub_dict && !b_object_is_type(sub_dict, B_TYPE_DICT)) { free(key); return B_ERR_BAD_FORMAT; } #if 1 enum object_flags flags = ctx_get_object_flags(ctx, sub_dict); if (flags & (OBJECT_KV_END_DEFINED | OBJECT_HEADER_END_DEFINED)) { free(key); return B_ERR_BAD_FORMAT; } #endif ctx_set_object_flags(ctx, sub_dict, OBJECT_KV_MID_DEFINED); advance_token(ctx); tok = peek_token(ctx); if (!IS_VALID_KEY_COMPONENT(tok)) { free(key); return B_ERR_BAD_FORMAT; } container = sub_dict; b_string_unref(key); key = b_string_duplicate(tok->tok_str); if (!key) { return B_ERR_NO_MEMORY; } advance_token(ctx); tok = peek_token(ctx); } if (b_dict_has_skey(container, key)) { return B_ERR_BAD_FORMAT; } if (!tok) { return B_ERR_BAD_FORMAT; } if (tok->tok_type != TOK_EQUAL) { return B_ERR_BAD_FORMAT; } ENABLE_EXTENDED_LEXING(ctx); advance_token(ctx); b_object *value = NULL; enum b_status status = parse_value(ctx, &value); DISABLE_EXTENDED_LEXING(ctx); if (!B_OK(status)) { return status; } status = advance_token(ctx); if (!B_OK(status) && status != B_ERR_NO_DATA) { return status; } b_dict_put_sk(container, key, B_RV(value)); if (b_object_is_type(value, B_TYPE_DICT) || b_object_is_type(value, B_TYPE_ARRAY)) { ctx_set_object_flags(ctx, value, OBJECT_KV_END_DEFINED); } return B_SUCCESS; } static enum b_status parse_table_header( struct ctx *ctx, b_dict *container, b_dict **new_container) { advance_token(ctx); struct token *tok = peek_token(ctx); if (!IS_VALID_KEY_COMPONENT(tok)) { return B_ERR_BAD_FORMAT; } b_string *key = b_string_duplicate(tok->tok_str); if (!key) { return B_ERR_NO_MEMORY; } advance_token(ctx); tok = peek_token(ctx); if (!tok) { return B_ERR_BAD_FORMAT; } while (tok && tok->tok_type == TOK_DOT) { b_object *sub_dict = b_dict_at_sk(container, key); enum object_flags flags = ctx_get_object_flags(ctx, sub_dict); if (!sub_dict) { sub_dict = (b_dict_create()); b_dict_put_sk(container, key, B_RV(sub_dict)); } else if (b_object_is_type(sub_dict, B_TYPE_ARRAY)) { sub_dict = b_array_at(sub_dict, b_array_size(sub_dict) - 1); } else if (!b_object_is_type(sub_dict, B_TYPE_DICT)) { return B_ERR_BAD_FORMAT; } if (flags & OBJECT_KV_END_DEFINED) { return B_ERR_BAD_FORMAT; } advance_token(ctx); tok = peek_token(ctx); if (!IS_VALID_KEY_COMPONENT(tok)) { return B_ERR_BAD_FORMAT; } ctx_set_object_flags(ctx, sub_dict, OBJECT_HEADER_MID_DEFINED); container = sub_dict; b_string_unref(key); key = b_string_duplicate(tok->tok_str); if (!key) { return B_ERR_NO_MEMORY; } advance_token(ctx); tok = peek_token(ctx); } if (!tok || tok->tok_type != TOK_RIGHT_BRACKET) { return B_ERR_BAD_FORMAT; } b_dict *new_table = b_dict_at_sk(container, key); if (!new_table) { new_table = b_dict_create(); if (!new_table) { free(key); return B_ERR_NO_MEMORY; } b_dict_put_sk(container, key, B_RV(new_table)); } if (!b_object_is_type((new_table), B_TYPE_DICT)) { return B_ERR_BAD_FORMAT; } enum object_flags flags = ctx_get_object_flags(ctx, (new_table)); if (flags & (OBJECT_HEADER_END_DEFINED | OBJECT_KV_MID_DEFINED | OBJECT_KV_END_DEFINED)) { return B_ERR_BAD_FORMAT; } ctx_set_object_flags(ctx, (new_table), OBJECT_HEADER_END_DEFINED); b_string_unref(key); advance_token(ctx); *new_container = new_table; return B_SUCCESS; } static enum b_status parse_array_header( struct ctx *ctx, b_dict *container, b_dict **new_container) { advance_token(ctx); struct token *tok = peek_token(ctx); if (!IS_VALID_KEY_COMPONENT(tok)) { return B_ERR_BAD_FORMAT; } b_string *key = b_string_duplicate(tok->tok_str); if (!key) { return B_ERR_NO_MEMORY; } advance_token(ctx); tok = peek_token(ctx); if (!tok) { return B_ERR_BAD_FORMAT; } while (tok && tok->tok_type == TOK_DOT) { b_object *sub_dict = b_dict_at_sk(container, key); if (!sub_dict) { sub_dict = (b_dict_create()); b_dict_put_sk(container, key, B_RV(sub_dict)); } else if (b_object_is_type(sub_dict, B_TYPE_ARRAY)) { sub_dict = b_array_at(sub_dict, b_array_size(sub_dict) - 1); } else if (!b_object_is_type(sub_dict, B_TYPE_DICT)) { return B_ERR_BAD_FORMAT; } advance_token(ctx); tok = peek_token(ctx); if (!IS_VALID_KEY_COMPONENT(tok)) { return B_ERR_BAD_FORMAT; } container = sub_dict; b_string_unref(key); key = b_string_duplicate(tok->tok_str); if (!key) { return B_ERR_NO_MEMORY; } advance_token(ctx); tok = peek_token(ctx); } if (!tok || tok->tok_type != TOK_DOUBLE_RIGHT_BRACKET) { return B_ERR_BAD_FORMAT; } b_array *array = b_dict_get_sk(container, key); if (!array) { array = b_array_create(); b_dict_put_sk(container, key, B_RV(array)); } else if (!b_object_is_type(array, B_TYPE_ARRAY)) { return B_ERR_BAD_FORMAT; } free(key); enum object_flags flags = ctx_get_object_flags(ctx, (array)); if (flags & OBJECT_KV_END_DEFINED) { return B_ERR_NO_MEMORY; } b_dict *new_table = b_dict_create(); if (!new_table) { return B_ERR_NO_MEMORY; } b_array_append(array, B_RV(new_table)); advance_token(ctx); *new_container = new_table; return B_SUCCESS; } static enum b_status parse_root(struct ctx *ctx, b_dict **result) { enum b_status status = B_SUCCESS; b_dict *root = b_dict_create(); b_dict *current = root; while (!(ctx->ctx_flags & CTX_EOF) && B_OK(status)) { struct token *tok = peek_token(ctx); if (!tok) { break; } switch (tok->tok_type) { case TOK_LEFT_BRACKET: status = parse_table_header(ctx, root, ¤t); if (!B_OK(status)) { break; } tok = peek_token(ctx); if (tok && tok->tok_type != TOK_NEWLINE) { status = B_ERR_BAD_FORMAT; } break; case TOK_DOUBLE_LEFT_BRACKET: status = parse_array_header(ctx, root, ¤t); if (!B_OK(status)) { break; } tok = peek_token(ctx); if (tok && tok->tok_type != TOK_NEWLINE) { status = B_ERR_BAD_FORMAT; } break; case TOK_WORD: case TOK_STRING: status = parse_key_value_pair(ctx, current); if (!B_OK(status)) { break; } tok = peek_token(ctx); if (tok && tok->tok_type != TOK_NEWLINE) { status = B_ERR_BAD_FORMAT; } advance_token(ctx); break; case TOK_NEWLINE: advance_token(ctx); break; default: status = B_ERR_BAD_FORMAT; break; } if (!B_OK(ctx->ctx_status) && ctx->ctx_status != B_ERR_NO_DATA) { status = ctx->ctx_status; } } if (!B_OK(status)) { b_dict_unref(root); root = NULL; } *result = root; return status; } static enum b_status toml_deserialise( b_serial_ctx *serial, b_stream *src, b_object **dest, enum b_serial_flags flags) { struct ctx ctx = {0}; enum b_status status = ctx_init(&ctx); if (!B_OK(status)) { return status; } ctx.ctx_src = src; ctx.ctx_flags = CTX_ENABLE_LONG_SYMBOLS; status = advance_token(&ctx); if (!B_OK(ctx.ctx_status) && ctx.ctx_status != B_ERR_NO_DATA) { return ctx.ctx_status; } if (ctx.ctx_flags & CTX_EOF) { *dest = (b_dict_create()); return B_SUCCESS; } b_dict *result = NULL; status = parse_root(&ctx, &result); if (!B_OK(status)) { return status; } *dest = (result); #if 0 ctx.ctx_flags = CTX_ENABLE_NUMBERS | CTX_ENABLE_TIMESTAMPS | CTX_ENABLE_BOOLS; while (!(ctx.ctx_flags & CTX_EOF) && B_OK(ctx.ctx_status)) { struct token *tok = peek_token(&ctx); print_token(tok); status = advance_token(&ctx); } #endif return B_SUCCESS; } /*** VIRTUAL FUNCTIONS ********************************************************/ static void toml_serial_ctx_init(b_object *obj, void *priv) { } static void toml_serial_ctx_fini(b_object *obj, void *priv) { } /*** CLASS DEFINITION *********************************************************/ B_TYPE_CLASS_DEFINITION_BEGIN(b_toml_serial_ctx) B_TYPE_CLASS_INTERFACE_BEGIN(b_object, B_TYPE_OBJECT) B_INTERFACE_ENTRY(to_string) = NULL; B_TYPE_CLASS_INTERFACE_END(b_object, B_TYPE_OBJECT) B_TYPE_CLASS_INTERFACE_BEGIN(b_serial_ctx, B_TYPE_SERIAL_CTX) B_INTERFACE_ENTRY(s_serialise) = toml_serialise; B_INTERFACE_ENTRY(s_deserialise) = toml_deserialise; B_TYPE_CLASS_INTERFACE_END(b_serial_ctx, B_TYPE_SERIAL_CTX) B_TYPE_CLASS_DEFINITION_END(b_toml_serial_ctx) B_TYPE_DEFINITION_BEGIN(b_toml_serial_ctx) B_TYPE_ID(0xaec8dca0, 0x131a, 0x4217, 0x916b, 0xaed15756601c); B_TYPE_CLASS(b_toml_serial_ctx_class); B_TYPE_EXTENDS(B_TYPE_SERIAL_CTX); B_TYPE_INSTANCE_INIT(toml_serial_ctx_init); B_TYPE_INSTANCE_FINI(toml_serial_ctx_fini); B_TYPE_DEFINITION_END(b_toml_serial_ctx)