diff --git a/mie/CMakeLists.txt b/mie/CMakeLists.txt
index 1aae8e6..b56a1bb 100644
--- a/mie/CMakeLists.txt
+++ b/mie/CMakeLists.txt
@@ -11,5 +11,5 @@ else ()
 endif ()
 
 target_include_directories(mie PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/)
-target_link_libraries(mie Bluelib::Core Bluelib::Ds)
+target_link_libraries(mie Bluelib::Core Bluelib::Ds Bluelib::Io)
 target_compile_definitions(mie PRIVATE MIE_EXPORT=1 MIE_STATIC=${MIE_STATIC})
diff --git a/mie/include/mie/parse/lex.h b/mie/include/mie/parse/lex.h
new file mode 100644
index 0000000..300e5ce
--- /dev/null
+++ b/mie/include/mie/parse/lex.h
@@ -0,0 +1,18 @@
+#ifndef MIE_PARSE_LEX_H_
+#define MIE_PARSE_LEX_H_
+
+#include
+#include
+#include
+
+struct mie_lex;
+struct mie_token;
+
+MIE_API struct mie_lex *mie_lex_create(b_stream *src);
+MIE_API void mie_lex_destroy(struct mie_lex *lex);
+
+MIE_API enum mie_status mie_lex_get_status(const struct mie_lex *lex);
+MIE_API struct mie_token *mie_lex_peek(struct mie_lex *lex);
+MIE_API void mie_lex_advance(struct mie_lex *lex);
+
+#endif
diff --git a/mie/include/mie/parse/parse.h b/mie/include/mie/parse/parse.h
new file mode 100644
index 0000000..48cc23a
--- /dev/null
+++ b/mie/include/mie/parse/parse.h
@@ -0,0 +1,62 @@
+#ifndef MIE_PARSE_PARSE_H_
+#define MIE_PARSE_PARSE_H_
+
+#include
+#include
+#include
+#include
+#include
+
+struct mie_parse_ctx;
+struct mie_lex;
+struct mie_ctx;
+
+/* these structs are temporary, and are just here for documentation purposes atm */
+
+struct mie_argument {
+};
+
+struct mie_unresolved_operand {
+};
+
+struct mie_region {
+};
+
+struct mie_type {
+};
+
+MIE_API struct mie_parse_ctx *mie_parse_ctx_create(
+	struct mie_ctx *ctx, struct mie_lex *lex);
+MIE_API void mie_parse_ctx_destroy(struct mie_parse_ctx *ctx);
+MIE_API enum mie_status mie_parse_ctx_get_status(const struct mie_parse_ctx *ctx);
+
+MIE_API enum mie_token_type mie_parse_ctx_peek(struct mie_parse_ctx *ctx);
+MIE_API bool mie_parse_ctx_advance(struct mie_parse_ctx *ctx);
+
+MIE_API bool mie_parse_ctx_parse_instname(struct mie_parse_ctx *ctx, b_string *out);
+MIE_API bool mie_parse_ctx_parse_graphname(struct mie_parse_ctx *ctx, b_string *out);
+MIE_API bool mie_parse_ctx_parse_vregname(struct mie_parse_ctx *ctx, b_string *out);
+MIE_API bool mie_parse_ctx_parse_mregname(struct mie_parse_ctx *ctx, b_string *out);
+MIE_API bool mie_parse_ctx_parse_blockname(struct mie_parse_ctx *ctx, b_string *out);
+MIE_API bool mie_parse_ctx_parse_typename(struct mie_parse_ctx *ctx, b_string *out);
+MIE_API bool mie_parse_ctx_parse_symname(struct mie_parse_ctx *ctx, b_string *out);
+MIE_API bool mie_parse_ctx_parse_string(struct mie_parse_ctx *ctx, b_string *out);
+MIE_API bool mie_parse_ctx_parse_keyword(struct mie_parse_ctx *ctx, const char *kw);
+MIE_API bool mie_parse_ctx_parse_symbol(
+	struct mie_parse_ctx *ctx, enum mie_token_symbol sym);
+MIE_API bool mie_parse_ctx_parse_operand(
+	struct mie_parse_ctx *ctx, struct mie_unresolved_operand *out);
+MIE_API bool mie_parse_ctx_parse_region(
+	struct mie_parse_ctx *ctx, struct mie_region *region);
+MIE_API bool mie_parse_ctx_parse_type(
+	struct mie_parse_ctx *ctx, struct mie_type **out);
+MIE_API bool mie_parse_ctx_parse_assignment_list(
+	struct mie_parse_ctx *ctx, struct mie_argument **out_lhs,
+	struct mie_unresolved_operand **out_rhs, size_t *out_count);
+
+MIE_API bool mie_parse_ctx_parse_unknown_keyword(
+	struct mie_parse_ctx *ctx, b_string *out);
+MIE_API bool mie_parse_ctx_parse_unknown_symbol(
+	struct mie_parse_ctx *ctx, enum mie_token_symbol sym);
+
+#endif
diff --git a/mie/include/mie/parse/token.h b/mie/include/mie/parse/token.h
new file mode 100644
index 0000000..07252b0
--- /dev/null
+++ b/mie/include/mie/parse/token.h
@@ -0,0 +1,82 @@
+#ifndef MIE_PARSE_TOKEN_H_
+#define MIE_PARSE_TOKEN_H_
+
+#include
+#include
+
+enum mie_token_type {
+	MIE_TOK_NONE = 0,
+	MIE_TOK_LINEFEED,
+	MIE_TOK_INT,
+	MIE_TOK_DOUBLE,
+	MIE_TOK_SYMBOL,
+	MIE_TOK_STRING,
+	MIE_TOK_WORD,      /* single words, not dot-delimited */
+	MIE_TOK_NAME,      /* set of words with at least one dot */
+	MIE_TOK_INSTNAME,  /* word or name, prefixed with an * asterisk */
+	MIE_TOK_SYMNAME,   /* word or name, prefixed with an @ at */
+	MIE_TOK_OPNAME,    /* word or name, prefixed with a ~ tilde */
+	MIE_TOK_GRAPHNAME, /* word or name, prefixed with a + plus */
+	MIE_TOK_VREGNAME,  /* word or name, prefixed with a % percent */
+	MIE_TOK_MREGNAME,  /* word or name, prefixed with a $ dollar */
+	MIE_TOK_BLOCKNAME, /* word or name, prefixed with a ^ caret */
+	MIE_TOK_TYPENAME,  /* word or name, prefixed with a # hash */
+};
+
+enum mie_token_value_type {
+	MIE_TOK_V_NONE = 0,
+	MIE_TOK_V_INT,
+	MIE_TOK_V_DOUBLE,
+	MIE_TOK_V_STRING,
+	MIE_TOK_V_SYMBOL,
+};
+
+enum mie_token_symbol {
+	MIE_SYM_NONE = 0,
+	MIE_SYM_COLON,
+	MIE_SYM_EQUAL,
+	MIE_SYM_COMMA,
+	MIE_SYM_HYPHEN,
+	MIE_SYM_ASTERISK,
+	MIE_SYM_PLUS,
+	MIE_SYM_PERCENT,
+	MIE_SYM_DOLLAR,
+	MIE_SYM_CARET,
+	MIE_SYM_HASH,
+	MIE_SYM_TILDE,
+	MIE_SYM_ATSIGN,
+	MIE_SYM_LEFT_BRACE,
+	MIE_SYM_RIGHT_BRACE,
+	MIE_SYM_LEFT_BRACKET,
+	MIE_SYM_RIGHT_BRACKET,
+	MIE_SYM_LEFT_PAREN,
+	MIE_SYM_RIGHT_PAREN,
+	MIE_SYM_LEFT_ANGLE,
+	MIE_SYM_RIGHT_ANGLE,
+	MIE_SYM_HYPHEN_RIGHT_ANGLE,
+	MIE_SYM_OTHER,
+};
+
+struct mie_token_location {
+	unsigned int c_row, c_col;
+};
+
+struct mie_token {
+	struct mie_token_location tok_start, tok_end;
+	enum mie_token_type tok_type;
+	enum mie_token_value_type tok_value_type;
+	b_queue_entry tok_entry;
+
+	union {
+		char *tok_str;
+		enum mie_token_symbol tok_sym;
+		long long tok_int;
+		double tok_double;
+	};
+};
+
+MIE_API void mie_token_destroy(struct mie_token *tok);
+MIE_API const char *mie_token_type_to_string(enum mie_token_type type);
+MIE_API const char *mie_token_symbol_to_string(enum mie_token_symbol sym);
+
+#endif
diff --git a/mie/parse/lex.c b/mie/parse/lex.c
new file mode 100644
index 0000000..8395e36
--- /dev/null
+++ b/mie/parse/lex.c
@@ -0,0 +1,997 @@
+#include "lex.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LINEBUF_DEFAULT_CAPACITY 1024
+
+#define LEX_TOKEN_DEF(i, n) {.id = (i), .name = (n)}
+
+#define IS_VALID_IDENT_CHAR(c) \
+	(b_wchar_is_alnum(c) || c == '.' || c == '-' || c == '_')
+#define IS_VALID_IDENT_START_CHAR(c) \
+	(b_wchar_is_alpha(c) || c == '.' || c == '_')
+#define IS_VALID_REG_START_CHAR(c) (b_wchar_is_alnum(c) || c == '.' || c == '_')
+/* the <ctype.h> classifiers are only defined for unsigned char values, so
+ * characters that may lie outside that range are range-checked instead. */
+#define IS_ASCII_DIGIT(c) ((c) >= '0' && (c) <= '9')
+
+static struct lex_token_def symbols[] = {
+	LEX_TOKEN_DEF(MIE_SYM_COLON, ":"),
+	LEX_TOKEN_DEF(MIE_SYM_EQUAL, "="),
+	LEX_TOKEN_DEF(MIE_SYM_COMMA, ","),
+	LEX_TOKEN_DEF(MIE_SYM_HYPHEN, "-"),
+	LEX_TOKEN_DEF(MIE_SYM_ASTERISK, "*"),
+	LEX_TOKEN_DEF(MIE_SYM_PLUS, "+"),
+	LEX_TOKEN_DEF(MIE_SYM_PERCENT, "%"),
+	LEX_TOKEN_DEF(MIE_SYM_DOLLAR, "$"),
+	LEX_TOKEN_DEF(MIE_SYM_CARET, "^"),
+	LEX_TOKEN_DEF(MIE_SYM_HASH, "#"),
+	LEX_TOKEN_DEF(MIE_SYM_ATSIGN, "@"),
+	LEX_TOKEN_DEF(MIE_SYM_TILDE, "~"),
+	LEX_TOKEN_DEF(MIE_SYM_LEFT_BRACE, "{"),
+	LEX_TOKEN_DEF(MIE_SYM_RIGHT_BRACE, "}"),
+	LEX_TOKEN_DEF(MIE_SYM_LEFT_BRACKET, "["),
+	LEX_TOKEN_DEF(MIE_SYM_RIGHT_BRACKET, "]"),
+	LEX_TOKEN_DEF(MIE_SYM_LEFT_PAREN, "("),
+	LEX_TOKEN_DEF(MIE_SYM_RIGHT_PAREN, ")"),
+	LEX_TOKEN_DEF(MIE_SYM_LEFT_ANGLE, "<"),
+	LEX_TOKEN_DEF(MIE_SYM_RIGHT_ANGLE, ">"),
+	LEX_TOKEN_DEF(MIE_SYM_HYPHEN_RIGHT_ANGLE, "->"),
+};
+static const size_t nr_symbols = sizeof symbols / sizeof symbols[0];
+
+/* find the child of `node` that is labelled with `c`, or NULL if none. */
+static struct mie_lex_symbol_node *get_symbol_node(
+	struct mie_lex_symbol_node *node, char c)
+{
+	b_queue_entry *entry = b_queue_first(&node->s_children);
+	while (entry) {
+		struct mie_lex_symbol_node *child
+			= b_unbox(struct mie_lex_symbol_node, entry, s_entry);
+		if (child->s_char == c) {
+			return child;
+		}
+
+		entry = b_queue_next(entry);
+	}
+
+	return NULL;
+}
+
+/* return the lexer's scratch string, emptied and ready for reuse. */
+static b_string *get_temp_string(struct mie_lex *lex)
+{
+	if (!lex->lex_temp) {
+		lex->lex_temp = b_string_create();
+	}
+
+	b_string_clear(lex->lex_temp);
+	return lex->lex_temp;
+}
+
+/* add the spelling of `sym` to the symbol tree rooted at `tree`. */
+static enum mie_status put_symbol(
+	struct mie_lex_symbol_node *tree, struct lex_token_def *sym)
+{
+	for (size_t i = 0; sym->name[i]; i++) {
+		char c = sym->name[i];
+		struct mie_lex_symbol_node *child = get_symbol_node(tree, c);
+		if (child) {
+			tree = child;
+			continue;
+		}
+
+		child = malloc(sizeof *child);
+		if (!child) {
+			return MIE_ERR_NO_MEMORY;
+		}
+
+		memset(child, 0x0, sizeof *child);
+
+		child->s_def = NULL;
+		child->s_char = c;
+
+		b_queue_push_back(&tree->s_children, &child->s_entry);
+		tree = child;
+	}
+
+	tree->s_def = sym;
+	return MIE_SUCCESS;
+}
+
+/* free `tree` and, recursively, every node below it. */
+static void destroy_symbol_tree(struct mie_lex_symbol_node *tree)
+{
+	b_queue_entry *entry = b_queue_first(&tree->s_children);
+	while (entry) {
+		struct mie_lex_symbol_node *node
+			= b_unbox(struct mie_lex_symbol_node, entry, s_entry);
+		b_queue_entry *next = b_queue_next(entry);
+		b_queue_delete(&tree->s_children, entry);
+
+		destroy_symbol_tree(node);
+
+		entry = next;
+	}
+
+	free(tree);
+}
+
+/* build the lookup tree for every entry of symbols[]; NULL on failure. */
+static struct mie_lex_symbol_node *build_symbol_tree(void)
+{
+	struct mie_lex_symbol_node *root = malloc(sizeof *root);
+	if (!root) {
+		return NULL;
+	}
+
+	memset(root, 0x0, sizeof *root);
+	root->s_def = NULL;
+
+	enum mie_status status = MIE_SUCCESS;
+	for (size_t i = 0; i < nr_symbols; i++) {
+		status = put_symbol(root, &symbols[i]);
+
+		if (status != MIE_SUCCESS) {
+			destroy_symbol_tree(root);
+			return NULL;
+		}
+	}
+
+	return root;
+}
+
+/* create a lexer that reads from `src`; NULL if allocation fails. */
+struct mie_lex *mie_lex_create(b_stream *src)
+{
+	struct mie_lex *lex = malloc(sizeof *lex);
+	if (!lex) {
+		return NULL;
+	}
+
+	memset(lex, 0x0, sizeof *lex);
+
+	lex->lex_cursor_row = lex->lex_cursor_col = 1;
+
+	lex->lex_status = MIE_SUCCESS;
+	lex->lex_source = src;
+	lex->lex_linebuf = b_string_create();
+
+	lex->lex_sym_tree = build_symbol_tree();
+	if (!lex->lex_sym_tree) {
+		mie_lex_destroy(lex);
+		return NULL;
+	}
+
+	return lex;
+}
+
+/* destroy a lexer, together with any tokens that are still queued. */
+void mie_lex_destroy(struct mie_lex *lex)
+{
+	b_queue_entry *entry = b_queue_first(&lex->lex_queue);
+
+	while (entry) {
+		struct mie_token *tok
+			= b_unbox(struct mie_token, entry, tok_entry);
+		b_queue_entry *next = b_queue_next(entry);
+		b_queue_delete(&lex->lex_queue, entry);
+
+		mie_token_destroy(tok);
+
+		entry = next;
+	}
+
+	if (lex->lex_linebuf_ptr) {
+		b_iterator_unref(lex->lex_linebuf_ptr);
+	}
+
+	if (lex->lex_linebuf) {
+		/* the line buffer is a b_string like lex_temp, so it is
+		 * dropped with b_string_unref() rather than free(). */
+		b_string_unref(lex->lex_linebuf);
+	}
+
+	if (lex->lex_sym_tree) {
+		destroy_symbol_tree(lex->lex_sym_tree);
+	}
+
+	if (lex->lex_temp) {
+		b_string_unref(lex->lex_temp);
+	}
+
+	free(lex);
+}
+
+/* return the last status recorded by the lexer. */
+enum mie_status mie_lex_get_status(const struct mie_lex *lex)
+{
+	return lex->lex_status;
+}
+
+/* load the next line of the source into the line buffer and rewind the read
+ * iterator. returns MIE_ERR_EOF once the source is exhausted. */
+static enum mie_status refill_linebuf(struct mie_lex *lex)
+{
+	if (!lex->lex_source) {
+		return MIE_ERR_EOF;
+	}
+
+	if (lex->lex_linebuf_ptr) {
+		b_iterator_unref(lex->lex_linebuf_ptr);
+		lex->lex_linebuf_ptr = NULL;
+	}
+
+	b_stringstream *s = b_stringstream_create();
+
+	b_status status = b_stream_read_line_s(lex->lex_source, s);
+
+	if (status == B_ERR_NO_DATA) {
+		b_stringstream_unref(s);
+		return MIE_ERR_EOF;
+	}
+
+	if (!B_OK(status)) {
+		b_stringstream_unref(s);
+		return MIE_ERR_INTERNAL_FAILURE;
+	}
+
+	b_string_replace_all_with_stringstream(lex->lex_linebuf, s);
+	b_stringstream_unref(s);
+
+	lex->lex_linebuf_ptr = b_iterator_begin(lex->lex_linebuf);
+
+	return MIE_SUCCESS;
+}
+
+/* return the next character without consuming it, or a negated status. */
+static int peek(struct mie_lex *lex)
+{
+	enum mie_status status = MIE_SUCCESS;
+
+	if (!lex->lex_linebuf_ptr || !b_iterator_is_valid(lex->lex_linebuf_ptr)) {
+		status = refill_linebuf(lex);
+	}
+
+	if (status != MIE_SUCCESS) {
+		return -status;
+	}
+
+	if (b_string_get_size(lex->lex_linebuf, B_STRLEN_NORMAL) == 0) {
+		return -MIE_ERR_EOF;
+	}
+
+	b_wchar c = b_iterator_get_value(lex->lex_linebuf_ptr).v_int;
+	return c;
+}
+
+/* consume and return the next character, keeping the cursor row/column up to
+ * date. returns a negated status when nothing could be read. */
+static int advance(struct mie_lex *lex)
+{
+	enum mie_status status = MIE_SUCCESS;
+
+	/* same guard as peek(): the iterator does not exist until the first
+	 * successful refill, nor after one that failed. */
+	if (!lex->lex_linebuf_ptr || !b_iterator_is_valid(lex->lex_linebuf_ptr)) {
+		status = refill_linebuf(lex);
+	}
+
+	if (status != MIE_SUCCESS) {
+		return -status;
+	}
+
+	if (b_string_get_size(lex->lex_linebuf, B_STRLEN_NORMAL) == 0) {
+		return -MIE_ERR_EOF;
+	}
+
+	b_wchar c = b_iterator_get_value(lex->lex_linebuf_ptr).v_int;
+	b_iterator_move_next(lex->lex_linebuf_ptr);
+
+	lex->lex_cursor_col++;
+	if (c == '\n') {
+		lex->lex_cursor_col = 1;
+		lex->lex_cursor_row++;
+	}
+	return c;
+}
+
+/* true while unread characters remain in the current line buffer. */
+static bool input_available(struct mie_lex *lex)
+{
+	return lex->lex_linebuf_ptr && b_iterator_is_valid(lex->lex_linebuf_ptr);
+}
+
+/* true if `c` is the first character of any entry in symbols[]. */
+static bool char_can_begin_symbol(char c)
+{
+	for (size_t i = 0; i < nr_symbols; i++) {
+		if (symbols[i].name[0] == c) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/* allocate a zeroed token of the given type; NULL if allocation fails. */
+static struct mie_token *create_token(enum mie_token_type type)
+{
+	struct mie_token *tok = malloc(sizeof *tok);
+	if (!tok) {
+		return NULL;
+	}
+
+	memset(tok, 0x0, sizeof *tok);
+
+	tok->tok_type = type;
+	return tok;
+}
+
+/* record the cursor position as the start of the token being read. */
+static void set_token_start(struct mie_lex *lex)
+{
+	lex->lex_token_start_row = lex->lex_cursor_row;
+	lex->lex_token_start_col = lex->lex_cursor_col;
+}
+
+/* record the cursor position as the end of the token being read. */
+static void set_token_end(struct mie_lex *lex)
+{
+	lex->lex_token_end_row = lex->lex_cursor_row;
+	lex->lex_token_end_col = lex->lex_cursor_col;
+}
+
+/* stamp `tok` with the recorded start/end position and queue it. */
+static enum mie_status push_token(struct mie_lex *lex, struct mie_token *tok)
+{
+	tok->tok_start.c_row = lex->lex_token_start_row;
+	tok->tok_start.c_col = lex->lex_token_start_col;
+	tok->tok_end.c_row = lex->lex_token_end_row;
+	tok->tok_end.c_col = lex->lex_token_end_col;
+
+	b_queue_push_back(&lex->lex_queue, &tok->tok_entry);
+	return MIE_SUCCESS;
+}
+
+/* queue a MIE_TOK_LINEFEED token. */
+static enum mie_status push_linefeed(struct mie_lex *lex)
+{
+	struct mie_token *tok = malloc(sizeof *tok);
+	if (!tok) {
+		return MIE_ERR_NO_MEMORY;
+	}
+
+	memset(tok, 0x0, sizeof *tok);
+
+	tok->tok_type = MIE_TOK_LINEFEED;
+	tok->tok_value_type = MIE_TOK_V_NONE;
+	return push_token(lex, tok);
+}
+
+/* queue a MIE_TOK_SYMBOL token for `sym`. */
+static enum mie_status push_symbol(struct mie_lex *lex, enum mie_token_symbol sym)
+{
+	struct mie_token *tok = malloc(sizeof *tok);
+	if (!tok) {
+		return MIE_ERR_NO_MEMORY;
+	}
+
+	memset(tok, 0x0, sizeof *tok);
+
+	tok->tok_type = MIE_TOK_SYMBOL;
+	tok->tok_value_type = MIE_TOK_V_SYMBOL;
+	tok->tok_sym = sym;
+	return push_token(lex, tok);
+}
+
+/* queue a token of type `type` whose text is `s`, taking ownership of `s`.
+ * text made up entirely of a decimal integer is stored as an integer value
+ * instead, except for string literals, which always keep their text. */
+static enum mie_status push_string_token(
+	struct mie_lex *lex, enum mie_token_type type, char *s)
+{
+	struct mie_token *tok = malloc(sizeof *tok);
+	if (!tok) {
+		free(s);
+		return MIE_ERR_NO_MEMORY;
+	}
+
+	char *ep = NULL;
+	long long v = strtoll(s, &ep, 10);
+
+	memset(tok, 0x0, sizeof *tok);
+
+	tok->tok_type = type;
+
+	/* `ep != s` rejects empty text, which strtoll() would otherwise
+	 * report as a fully-consumed zero. */
+	if (type != MIE_TOK_STRING && ep != s && *ep == '\0') {
+		tok->tok_int = v;
+		tok->tok_value_type = MIE_TOK_V_INT;
+		free(s);
+	} else {
+		tok->tok_str = s;
+		tok->tok_value_type = MIE_TOK_V_STRING;
+	}
+
+	return push_token(lex, tok);
+}
+
+/* queue a MIE_TOK_INT token holding `v`. */
+static enum mie_status push_int(struct mie_lex *lex, unsigned long long v)
+{
+	struct mie_token *tok = malloc(sizeof *tok);
+	if (!tok) {
+		return MIE_ERR_NO_MEMORY;
+	}
+
+	memset(tok, 0x0, sizeof *tok);
+
+	tok->tok_type = MIE_TOK_INT;
+	tok->tok_value_type = MIE_TOK_V_INT;
+	tok->tok_int = v;
+	return push_token(lex, tok);
+}
+
+/* queue a MIE_TOK_DOUBLE token holding `v`. */
+static enum mie_status push_double(struct mie_lex *lex, double v)
+{
+	struct mie_token *tok = malloc(sizeof *tok);
+	if (!tok) {
+		return MIE_ERR_NO_MEMORY;
+	}
+
+	memset(tok, 0x0, sizeof *tok);
+
+	tok->tok_type = MIE_TOK_DOUBLE;
+	tok->tok_value_type = MIE_TOK_V_DOUBLE;
+	tok->tok_double = v;
+	return push_token(lex, tok);
+}
+
+/* consume characters up to and including the next linefeed (or EOF). */
+static enum mie_status read_line_comment(struct mie_lex *lex)
+{
+	while (true) {
+		b_wchar c = advance(lex);
+
+		if (c == -MIE_ERR_EOF || c == '\n') {
+			break;
+		}
+
+		if (c < 0) {
+			return -c;
+		}
+	}
+
+	return MIE_SUCCESS;
+}
+
+/* read an integer or floating-point literal. `negate` is set when the caller
+ * has already consumed a leading hyphen (and recorded the token start).
+ * underscores are accepted as digit separators, and a `0x` or `0b` prefix
+ * selects hexadecimal or binary. */
+static enum mie_status read_number(struct mie_lex *lex, bool negate)
+{
+	int token_len = 0;
+	int base = 10;
+	int dots = 0;
+	bool leading_zero = false;
+	b_string *str = get_temp_string(lex);
+
+	if (!negate) {
+		set_token_start(lex);
+	}
+
+	while (true) {
+		b_wchar c = peek(lex);
+		if (c == -MIE_ERR_EOF) {
+			break;
+		}
+
+		if (c < 0) {
+			return -c;
+		}
+
+		if (c == '_') {
+			token_len++;
+			set_token_end(lex);
+			advance(lex);
+			continue;
+		}
+
+		if (c == '.') {
+			if (base != 10) {
+				return MIE_ERR_BAD_SYNTAX;
+			}
+
+			if (dots > 0) {
+				return MIE_ERR_BAD_SYNTAX;
+			}
+
+			token_len++;
+			dots++;
+			char s[] = {c, 0};
+			b_string_append_cstr(str, s);
+			set_token_end(lex);
+			advance(lex);
+			continue;
+		}
+
+		if (b_wchar_is_space(c) || b_wchar_is_punct(c)) {
+			break;
+		}
+
+		/* a radix prefix is only valid directly after a single `0`. */
+		if (c == 'x' && token_len == 1 && leading_zero) {
+			base = 16;
+			token_len++;
+			set_token_end(lex);
+			advance(lex);
+			continue;
+		}
+
+		if (c == 'b' && token_len == 1 && leading_zero) {
+			base = 2;
+			token_len++;
+			set_token_end(lex);
+			advance(lex);
+			continue;
+		}
+
+		if (base == 2 && c != '0' && c != '1') {
+			return MIE_ERR_BAD_SYNTAX;
+		}
+
+		if (base == 10 && !IS_ASCII_DIGIT(c)) {
+			return MIE_ERR_BAD_SYNTAX;
+		}
+
+		if (base == 16 && (c > 0x7F || !isxdigit(c))) {
+			return MIE_ERR_BAD_SYNTAX;
+		}
+
+		if (token_len == 0) {
+			leading_zero = (c == '0');
+		}
+
+		b_string_append_wc(str, c);
+		set_token_end(lex);
+		advance(lex);
+		token_len++;
+	}
+
+	const char *s = b_string_ptr(str);
+	char *ep = NULL;
+
+	/* negative numbers will be lexed as a hyphen followed by a positive
+	 * number. */
+
+	if (dots > 0) {
+		double v = strtod(s, &ep);
+
+		if (*ep != '\0') {
+			return MIE_ERR_BAD_SYNTAX;
+		}
+
+		if (negate) {
+			v *= -1;
+		}
+
+		return push_double(lex, v);
+	} else {
+		unsigned long long v = strtoull(s, &ep, base);
+
+		if (*ep != '\0') {
+			return MIE_ERR_BAD_SYNTAX;
+		}
+
+		if (negate) {
+			v *= -1;
+		}
+
+		return push_int(lex, v);
+	}
+}
+
+/* read a word or dotted name. `type` is MIE_TOK_NONE for a bare identifier,
+ * or the token type chosen by read_symbol() from the sigil it has already
+ * consumed. */
+static enum mie_status read_ident(struct mie_lex *lex, enum mie_token_type type)
+{
+	int dots = 0;
+	b_string *str = get_temp_string(lex);
+	b_wchar prev = 0;
+
+	if (type == MIE_TOK_NONE) {
+		set_token_start(lex);
+	}
+
+	while (1) {
+		b_wchar c = peek(lex);
+
+		if ((c == '.' || c == '-') && prev == c) {
+			return MIE_ERR_BAD_SYNTAX;
+		}
+
+		if (c == '.') {
+			dots++;
+		}
+
+		if (!IS_VALID_IDENT_CHAR(c)) {
+			break;
+		}
+
+		prev = c;
+		b_string_append_wc(str, c);
+		set_token_end(lex);
+		advance(lex);
+	}
+
+	if (type == MIE_TOK_NONE) {
+		type = dots > 0 ? MIE_TOK_NAME : MIE_TOK_WORD;
+	}
+
+	char *s = b_string_steal(str);
+
+	switch (type) {
+	case MIE_TOK_INSTNAME:
+		if (dots > 0) {
+			return push_string_token(lex, type, s);
+		} else {
+			/* an undotted `*word` is queued as an asterisk symbol
+			 * followed by a plain word. */
+			enum mie_status status
+				= push_symbol(lex, MIE_SYM_ASTERISK);
+			if (status != MIE_SUCCESS) {
+				free(s);
+				return status;
+			}
+
+			return push_string_token(lex, MIE_TOK_WORD, s);
+		}
+		break;
+	default:
+		return push_string_token(lex, type, s);
+	}
+}
+
+/* read a double-quoted string literal. only an escaped backslash and an
+ * escaped double quote are recognised as escape sequences. */
+static enum mie_status read_string(struct mie_lex *lex)
+{
+	b_string *str = get_temp_string(lex);
+
+	b_wchar c = peek(lex);
+	bool esc = false;
+
+	if (c != '"') {
+		return MIE_ERR_BAD_SYNTAX;
+	}
+
+	set_token_start(lex);
+	advance(lex);
+
+	while (1) {
+		b_wchar c = peek(lex);
+
+		if (c == -MIE_ERR_EOF) {
+			/* the input ended before the closing quote. */
+			return MIE_ERR_BAD_SYNTAX;
+		}
+
+		if (c < 0) {
+			return -c;
+		}
+
+		if (esc) {
+			switch (c) {
+			case '\\':
+			case '"':
+				b_string_append_wc(str, c);
+				break;
+			default:
+				return MIE_ERR_BAD_SYNTAX;
+			}
+
+			esc = false;
+			advance(lex);
+			continue;
+		}
+
+		if (c == '\\') {
+			esc = true;
+			advance(lex);
+			continue;
+		}
+
+		if (c == '"') {
+			set_token_end(lex);
+			advance(lex);
+			break;
+		}
+
+		b_string_append_wc(str, c);
+		advance(lex);
+	}
+
+	char *s = b_string_steal(str);
+	return push_string_token(lex, MIE_TOK_STRING, s);
+}
+
+/* read the longest symbol that matches the input. a hyphen directly followed
+ * by a digit starts a negative number, and a sigil directly followed by an
+ * identifier character starts the corresponding kind of name. */
+static enum mie_status read_symbol(struct mie_lex *lex)
+{
+	struct mie_lex_symbol_node *node = lex->lex_sym_tree;
+	set_token_start(lex);
+	b_wchar prev = 0;
+
+	while (true) {
+		b_wchar c = peek(lex);
+		if (c < 0) {
+			break;
+		}
+
+		/* every symbol is ASCII; wider characters must not be narrowed
+		 * to `char`, where they could alias a symbol character. */
+		struct mie_lex_symbol_node *next
+			= c > 0x7F ? NULL : get_symbol_node(node, c);
+		if (!next) {
+			prev = c;
+			break;
+		}
+
+		node = next;
+		set_token_end(lex);
+		advance(lex);
+		prev = c;
+	}
+
+	if (!node || node->s_def == NULL) {
+		return MIE_ERR_BAD_SYNTAX;
+	}
+
+	if (node->s_def->id == MIE_SYM_HYPHEN && IS_ASCII_DIGIT(prev)) {
+		return read_number(lex, true);
+	}
+
+	if (IS_VALID_IDENT_START_CHAR(prev)) {
+		switch (node->s_def->id) {
+		case MIE_SYM_ASTERISK:
+			return read_ident(lex, MIE_TOK_INSTNAME);
+		case MIE_SYM_PLUS:
+			return read_ident(lex, MIE_TOK_GRAPHNAME);
+		case MIE_SYM_PERCENT:
+			return read_ident(lex, MIE_TOK_VREGNAME);
+		case MIE_SYM_DOLLAR:
+			return read_ident(lex, MIE_TOK_MREGNAME);
+		case MIE_SYM_CARET:
+			return read_ident(lex, MIE_TOK_BLOCKNAME);
+		case MIE_SYM_TILDE:
+			return read_ident(lex, MIE_TOK_OPNAME);
+		case MIE_SYM_HASH:
+			return read_ident(lex, MIE_TOK_TYPENAME);
+		case MIE_SYM_ATSIGN:
+			return read_ident(lex, MIE_TOK_SYMNAME);
+		default:
+			break;
+		}
+	}
+
+	if (IS_VALID_REG_START_CHAR(prev)) {
+		switch (node->s_def->id) {
+		case MIE_SYM_PERCENT:
+			return read_ident(lex, MIE_TOK_VREGNAME);
+		case MIE_SYM_DOLLAR:
+			return read_ident(lex, MIE_TOK_MREGNAME);
+		case MIE_SYM_ATSIGN:
+			return read_ident(lex, MIE_TOK_SYMNAME);
+		default:
+			break;
+		}
+	}
+
+	return push_symbol(lex, node->s_def->id);
+}
+
+/* consume a run of whitespace characters. */
+static void skip_whitespace(struct mie_lex *lex)
+{
+	b_wchar c = peek(lex);
+
+	while (b_wchar_is_space(c)) {
+		advance(lex);
+		c = peek(lex);
+	}
+}
+
+/* true if `c` is whitespace that may be skipped; linefeeds only count when
+ * `skip_linefeeds` is set. */
+static bool should_skip(b_wchar c, bool skip_linefeeds)
+{
+	bool skip = b_wchar_is_space(c);
+
+	if (!skip_linefeeds) {
+		skip = (skip && c != '\n');
+	}
+
+	return skip;
+}
+
+/* skip whitespace and `;` line comments. linefeeds are treated as whitespace
+ * only when `include_linefeeds` is set, but the linefeed that ends a comment
+ * is always consumed. */
+static void skip_ignored_chars(struct mie_lex *lex, bool include_linefeeds)
+{
+	b_wchar c = peek(lex);
+
+	while (1) {
+		while (should_skip(c, include_linefeeds)) {
+			advance(lex);
+			c = peek(lex);
+		}
+
+		if (c != ';') {
+			break;
+		}
+
+		advance(lex);
+		c = peek(lex);
+
+		/* stop on EOF or a read error as well, otherwise a comment on
+		 * the last line of the input would never terminate. */
+		while (c >= 0 && c != '\n') {
+			advance(lex);
+			c = peek(lex);
+		}
+
+		advance(lex);
+		c = peek(lex);
+	}
+}
+
+/* lex the next token(s) from the input and append them to the token queue. */
+static enum mie_status pump_tokens(struct mie_lex *lex)
+{
+	b_wchar c = peek(lex);
+
+	if (c < 0) {
+		return -c;
+	}
+
+	while (1) {
+		if (c == ';' || (b_wchar_is_space(c) && c != '\n')) {
+			skip_ignored_chars(lex, false);
+		} else {
+			break;
+		}
+
+		c = peek(lex);
+	}
+
+	if (c == '\\') {
+		advance(lex);
+		skip_ignored_chars(lex, true);
+		c = peek(lex);
+	}
+
+	if (c < 0) {
+		/* only ignorable characters were left in the input. */
+		return -c;
+	}
+
+	if (c == '\n') {
+		set_token_start(lex);
+		set_token_end(lex);
+
+		while (c == '\n') {
+			advance(lex);
+
+			if (!input_available(lex)) {
+				break;
+			}
+
+			c = peek(lex);
+		}
+
+		if (c < 0) {
+			return -c;
+		}
+
+		return push_linefeed(lex);
+	}
+
+	while (b_wchar_is_space(c) && c != '\n') {
+		advance(lex);
+		c = peek(lex);
+	}
+
+	if (IS_VALID_IDENT_START_CHAR(c)) {
+		return read_ident(lex, MIE_TOK_NONE);
+	}
+
+	/* symbols are ASCII-only, so wider characters never start one. */
+	if (c <= 0x7F && char_can_begin_symbol(c)) {
+		return read_symbol(lex);
+	}
+
+	if (c == '"') {
+		return read_string(lex);
+	}
+
+	if (IS_ASCII_DIGIT(c)) {
+		return read_number(lex, false);
+	}
+
+	return MIE_ERR_BAD_SYNTAX;
+}
+
+/* return the next token without consuming it. returns NULL, with the reason
+ * available from mie_lex_get_status(), once no further token can be read. */
+struct mie_token *mie_lex_peek(struct mie_lex *lex)
+{
+	enum mie_status status = MIE_SUCCESS;
+
+	while (b_queue_empty(&lex->lex_queue)) {
+		status = pump_tokens(lex);
+
+		if (status != MIE_SUCCESS) {
+			lex->lex_status = status;
+			return NULL;
+		}
+	}
+
+	lex->lex_status = status;
+
+	b_queue_entry *entry = b_queue_first(&lex->lex_queue);
+	struct mie_token *tok = b_unbox(struct mie_token, entry, tok_entry);
+	return tok;
+}
+
+/* discard the token at the front of the queue, lexing one first if the queue
+ * is empty. */
+void mie_lex_advance(struct mie_lex *lex)
+{
+	enum mie_status status = MIE_SUCCESS;
+
+	while (b_queue_empty(&lex->lex_queue)) {
+		status = pump_tokens(lex);
+
+		if (status != MIE_SUCCESS) {
+			lex->lex_status = status;
+			return;
+		}
+	}
+
+	b_queue_entry *entry = b_queue_pop_front(&lex->lex_queue);
+	struct mie_token *tok = b_unbox(struct mie_token, entry, tok_entry);
+	mie_token_destroy(tok);
+}
+
+/* true if a token is queued or unread input remains in the line buffer. */
+bool mie_lex_tokens_available(struct mie_lex *lex)
+{
+	if (!b_queue_empty(&lex->lex_queue)) {
+		return true;
+	}
+
+	if (input_available(lex)) {
+		return true;
+	}
+
+	return false;
+}
diff --git a/mie/parse/lex.h b/mie/parse/lex.h
new file mode 100644
index 0000000..30a2f3f
--- /dev/null
+++ b/mie/parse/lex.h
@@ -0,0 +1,45 @@
+#ifndef _PARSE_LEX_H_
+#define _PARSE_LEX_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct mie_lex {
+	struct mie_lex_symbol_node *lex_sym_tree;
+	b_stream *lex_source;
+	enum mie_status lex_status;
+
+	b_queue lex_queue;
+
+	b_string *lex_temp;
+	b_queue lex_state;
+	unsigned int lex_brace_depth;
+
+	unsigned long lex_token_start_row, lex_token_start_col;
+	unsigned long lex_token_end_row, lex_token_end_col;
+	unsigned long lex_cursor_row, lex_cursor_col;
+
+	b_string *lex_linebuf;
+	b_iterator *lex_linebuf_ptr;
+};
+
+struct mie_lex_symbol_node {
+	char s_char;
+	struct lex_token_def *s_def;
+
+	b_queue_entry s_entry;
+	b_queue s_children;
+};
+
+struct lex_token_def {
+	int id;
+	const char *name;
+	uint64_t name_hash;
+};
+
+#endif
diff --git a/mie/parse/parse.c b/mie/parse/parse.c
new file mode 100644
index 0000000..e69de29
diff --git a/mie/parse/token.c b/mie/parse/token.c
new file mode 100644
index 0000000..9103b2e
--- /dev/null
+++ b/mie/parse/token.c
@@ -0,0 +1,73 @@
+#include
+
+void mie_token_destroy(struct mie_token *tok)
+{
+	switch (tok->tok_value_type) {
+	case MIE_TOK_V_STRING:
+		free(tok->tok_str);
+		break;
+	default:
+		break;
+	}
+
+	free(tok);
+}
+
+#define ENUM_STR(x) \
+	case x: \
+		return #x
+
+const char *mie_token_type_to_string(enum mie_token_type type)
+{
+	switch (type) {
+		ENUM_STR(MIE_TOK_NONE);
+		ENUM_STR(MIE_TOK_LINEFEED);
+		ENUM_STR(MIE_TOK_INT);
+		ENUM_STR(MIE_TOK_DOUBLE);
+		ENUM_STR(MIE_TOK_SYMBOL);
+		ENUM_STR(MIE_TOK_WORD);
+		ENUM_STR(MIE_TOK_NAME);
+		ENUM_STR(MIE_TOK_OPNAME);
+		ENUM_STR(MIE_TOK_INSTNAME);
+		ENUM_STR(MIE_TOK_GRAPHNAME);
+		ENUM_STR(MIE_TOK_VREGNAME);
+		ENUM_STR(MIE_TOK_MREGNAME);
+		ENUM_STR(MIE_TOK_BLOCKNAME);
+		ENUM_STR(MIE_TOK_TYPENAME);
+		ENUM_STR(MIE_TOK_SYMNAME);
+		ENUM_STR(MIE_TOK_STRING);
+	default:
+		return "";
+	}
+}
+
+const char *mie_token_symbol_to_string(enum mie_token_symbol sym)
+{
+	switch (sym) {
+		ENUM_STR(MIE_SYM_NONE);
+		ENUM_STR(MIE_SYM_COLON);
+		ENUM_STR(MIE_SYM_EQUAL);
+		ENUM_STR(MIE_SYM_COMMA);
+		ENUM_STR(MIE_SYM_HYPHEN);
+		ENUM_STR(MIE_SYM_ASTERISK);
+		ENUM_STR(MIE_SYM_PLUS);
+		ENUM_STR(MIE_SYM_PERCENT);
+		ENUM_STR(MIE_SYM_DOLLAR);
+		ENUM_STR(MIE_SYM_CARET);
+		ENUM_STR(MIE_SYM_HASH);
+		ENUM_STR(MIE_SYM_TILDE);
+		ENUM_STR(MIE_SYM_ATSIGN);
+		ENUM_STR(MIE_SYM_LEFT_BRACE);
+		ENUM_STR(MIE_SYM_RIGHT_BRACE);
+		ENUM_STR(MIE_SYM_LEFT_BRACKET);
+		ENUM_STR(MIE_SYM_RIGHT_BRACKET);
+		ENUM_STR(MIE_SYM_LEFT_PAREN);
+		ENUM_STR(MIE_SYM_RIGHT_PAREN);
+		ENUM_STR(MIE_SYM_LEFT_ANGLE);
+		ENUM_STR(MIE_SYM_RIGHT_ANGLE);
+		ENUM_STR(MIE_SYM_HYPHEN_RIGHT_ANGLE);
+		ENUM_STR(MIE_SYM_OTHER);
+	default:
+		return "";
+	}
+}