diff --git a/lang/include/ivy/lang/lex.h b/lang/include/ivy/lang/lex.h index fa2e558..f086dc3 100644 --- a/lang/include/ivy/lang/lex.h +++ b/lang/include/ivy/lang/lex.h @@ -56,17 +56,20 @@ enum ivy_symbol { IVY_SYM_COLON, IVY_SYM_DOUBLE_COLON, IVY_SYM_PLUS, - IVY_SYM_MINUS, + IVY_SYM_HYPHEN, + IVY_SYM_DOUBLE_HYPHEN, IVY_SYM_FORWARD_SLASH, IVY_SYM_ASTERISK, IVY_SYM_PERCENT, IVY_SYM_AMPERSAND, IVY_SYM_EQUAL, IVY_SYM_DOUBLE_EQUAL, + IVY_SYM_DOUBLE_LEFT_ANGLE, + IVY_SYM_DOUBLE_RIGHT_ANGLE, IVY_SYM_DOUBLE_LEFT_ANGLE_EQUAL, IVY_SYM_DOUBLE_RIGHT_ANGLE_EQUAL, IVY_SYM_PLUS_EQUAL, - IVY_SYM_MINUS_EQUAL, + IVY_SYM_HYPHEN_EQUAL, IVY_SYM_FORWARD_SLASH_EQUAL, IVY_SYM_ASTERISK_EQUAL, IVY_SYM_AMPERSAND_EQUAL, @@ -97,10 +100,14 @@ struct ivy_token { }; }; +struct ivy_lexer_symbol_node; + struct ivy_lexer { struct ivy_line_source *lex_source; enum ivy_status lex_status; struct ivy_token *lex_queue; + struct ivy_lexer_symbol_node *lex_sym_tree; + enum ivy_token_type lex_prev_token; char *lex_linebuf; size_t lex_linebuf_len; @@ -116,4 +123,8 @@ extern struct ivy_token *ivy_lexer_read(struct ivy_lexer *lex); extern void ivy_token_destroy(struct ivy_token *tok); +extern const char *ivy_lex_token_type_to_string(enum ivy_token_type type); +extern const char *ivy_keyword_to_string(enum ivy_keyword keyword); +extern const char *ivy_symbol_to_string(enum ivy_symbol sym); + #endif diff --git a/lang/lex.c b/lang/lex.c index 1e067a9..ba16102 100644 --- a/lang/lex.c +++ b/lang/lex.c @@ -1,7 +1,10 @@ +#include +#include #include #include #include #include +#include #include #include @@ -12,9 +15,18 @@ .id = (i), .name = (n) \ } +struct ivy_lexer_symbol_node { + char s_char; + enum ivy_symbol s_id; + + b_queue_entry s_entry; + b_queue s_children; +}; + struct lex_token_def { int id; const char *name; + uint64_t name_hash; }; static struct lex_token_def keywords[] = { @@ -54,17 +66,22 @@ static struct lex_token_def symbols[] = { LEX_TOKEN_DEF(IVY_SYM_COLON, ":"), LEX_TOKEN_DEF(IVY_SYM_DOUBLE_COLON, "::"), LEX_TOKEN_DEF(IVY_SYM_PLUS, "+"), - LEX_TOKEN_DEF(IVY_SYM_MINUS, "-"), + LEX_TOKEN_DEF(IVY_SYM_HYPHEN, "-"), + LEX_TOKEN_DEF(IVY_SYM_DOUBLE_HYPHEN, "--"), LEX_TOKEN_DEF(IVY_SYM_FORWARD_SLASH, "/"), LEX_TOKEN_DEF(IVY_SYM_ASTERISK, "*"), + LEX_TOKEN_DEF(IVY_SYM_FORWARD_SLASH_ASTERISK, "/*"), + LEX_TOKEN_DEF(IVY_SYM_ASTERISK_FORWARD_SLASH, "*/"), LEX_TOKEN_DEF(IVY_SYM_PERCENT, "%"), LEX_TOKEN_DEF(IVY_SYM_AMPERSAND, "&"), LEX_TOKEN_DEF(IVY_SYM_EQUAL, "="), LEX_TOKEN_DEF(IVY_SYM_DOUBLE_EQUAL, "=="), + LEX_TOKEN_DEF(IVY_SYM_DOUBLE_LEFT_ANGLE, "<<"), + LEX_TOKEN_DEF(IVY_SYM_DOUBLE_RIGHT_ANGLE, ">>"), LEX_TOKEN_DEF(IVY_SYM_DOUBLE_LEFT_ANGLE_EQUAL, "<<="), LEX_TOKEN_DEF(IVY_SYM_DOUBLE_RIGHT_ANGLE_EQUAL, ">>="), LEX_TOKEN_DEF(IVY_SYM_PLUS_EQUAL, "+="), - LEX_TOKEN_DEF(IVY_SYM_MINUS_EQUAL, "-="), + LEX_TOKEN_DEF(IVY_SYM_HYPHEN_EQUAL, "-="), LEX_TOKEN_DEF(IVY_SYM_FORWARD_SLASH_EQUAL, "/="), LEX_TOKEN_DEF(IVY_SYM_ASTERISK_EQUAL, "*="), LEX_TOKEN_DEF(IVY_SYM_AMPERSAND_EQUAL, "&="), @@ -82,15 +99,153 @@ static struct lex_token_def symbols[] = { }; static const size_t nr_symbols = sizeof symbols / sizeof symbols[0]; +static struct ivy_lexer_symbol_node *get_symbol_node( + struct ivy_lexer_symbol_node *node, char c) +{ + b_queue_iterator it; + b_queue_foreach (&it, &node->s_children) { + struct ivy_lexer_symbol_node *child = b_unbox( + struct ivy_lexer_symbol_node, it.entry, s_entry); + if (child->s_char == c) { + return child; + } + } + + return NULL; +} + +static enum ivy_status put_symbol( + struct ivy_lexer_symbol_node *tree, struct lex_token_def *sym) +{ + for (size_t i = 0; sym->name[i]; i++) { + char c = sym->name[i]; + struct ivy_lexer_symbol_node *child = get_symbol_node(tree, c); + if (child) { + tree = child; + continue; + } + + child = malloc(sizeof *child); + if (!child) { + return IVY_ERR_NO_MEMORY; + } + + child->s_id = IVY_SYM_NONE; + child->s_char = c; + + b_queue_push_back(&tree->s_children, &child->s_entry); + tree = child; + } + + tree->s_id = sym->id; + return IVY_OK; +} + +static void destroy_symbol_tree(struct ivy_lexer_symbol_node *tree) +{ + b_queue_iterator it; + b_queue_iterator_begin(&tree->s_children, &it); + while (b_queue_iterator_is_valid(&it)) { + struct ivy_lexer_symbol_node *node = b_unbox( + struct ivy_lexer_symbol_node, it.entry, s_entry); + b_queue_iterator_erase(&it); + + destroy_symbol_tree(node); + } + + free(tree); +} + +static struct ivy_lexer_symbol_node *build_symbol_tree(void) +{ + struct ivy_lexer_symbol_node *root = malloc(sizeof *root); + if (!root) { + return NULL; + } + + memset(root, 0x0, sizeof *root); + root->s_id = IVY_SYM_NONE; + + enum ivy_status status = IVY_OK; + for (size_t i = 0; i < nr_symbols; i++) { + status = put_symbol(root, &symbols[i]); + + if (status != IVY_OK) { + destroy_symbol_tree(root); + return NULL; + } + } + + return root; +} + +static void print_symbol_node(struct ivy_lexer_symbol_node *node, int depth) +{ + for (int i = 0; i < depth; i++) { + fputs(" ", stdout); + } + + printf("%c", node->s_char); + + if (node->s_id != IVY_SYM_NONE) { + printf(" (%s)", ivy_symbol_to_string(node->s_id)); + } + + printf("\n"); + + b_queue_iterator it; + b_queue_foreach (&it, &node->s_children) { + struct ivy_lexer_symbol_node *child = b_unbox( + struct ivy_lexer_symbol_node, it.entry, s_entry); + + print_symbol_node(child, depth + 1); + } +} + +static void init_keywords(void) +{ + for (size_t i = 0; i < nr_keywords; i++) { + keywords[i].name_hash = b_hash_string(keywords[i].name); + } +} + +static enum ivy_keyword find_keyword_by_name(const char *s) +{ + uint64_t s_hash = b_hash_string(s); + + for (size_t i = 0; i < nr_keywords; i++) { + struct lex_token_def *def = &keywords[i]; + + if (s_hash != def->name_hash) { + continue; + } + + if (strcmp(s, def->name) != 0) { + continue; + } + + return def->id; + } + + return IVY_KW_NONE; +} + enum ivy_status ivy_lexer_init(struct ivy_lexer *lex) { memset(lex, 0x0, sizeof *lex); lex->lex_status = IVY_OK; + lex->lex_prev_token = IVY_TOK_NONE; lex->lex_linebuf = malloc(LINEBUF_DEFAULT_CAPACITY); lex->lex_linebuf_cap = LINEBUF_DEFAULT_CAPACITY; + lex->lex_sym_tree = build_symbol_tree(); + print_symbol_node(lex->lex_sym_tree, 0); + + /* TODO only do keyword initialisation once */ + init_keywords(); + return IVY_OK; } @@ -106,6 +261,10 @@ void ivy_lexer_finish(struct ivy_lexer *lex) free(lex->lex_linebuf); } + if (lex->lex_sym_tree) { + destroy_symbol_tree(lex->lex_sym_tree); + } + memset(lex, 0x0, sizeof *lex); } @@ -115,9 +274,15 @@ static enum ivy_status refill_linebuf(struct ivy_lexer *lex) return IVY_ERR_EOF; } - return ivy_line_source_readline( + enum ivy_status status = ivy_line_source_readline( lex->lex_source, lex->lex_linebuf, lex->lex_linebuf_cap, &lex->lex_linebuf_len, NULL); + + if (status == IVY_OK) { + lex->lex_linebuf_ptr = 0; + } + + return status; } static int peek(struct ivy_lexer *lex) @@ -140,6 +305,30 @@ static int peek(struct ivy_lexer *lex) return c; } +static int peek_next(struct ivy_lexer *lex) +{ + enum ivy_status status = IVY_OK; + + if (lex->lex_linebuf_ptr >= lex->lex_linebuf_len) { + status = refill_linebuf(lex); + } + + if (status != IVY_OK) { + return status; + } + + if (lex->lex_linebuf_len == 0) { + return IVY_ERR_EOF; + } + + if (lex->lex_linebuf_ptr + 1 >= lex->lex_linebuf_len) { + return IVY_ERR_EOF; + } + + int c = lex->lex_linebuf[lex->lex_linebuf_ptr + 1]; + return c; +} + static int advance(struct ivy_lexer *lex) { enum ivy_status status = IVY_OK; @@ -193,11 +382,16 @@ static enum ivy_status push_token(struct ivy_lexer *lex, struct ivy_token *tok) } *slot = tok; + lex->lex_prev_token = tok->t_type; return IVY_OK; } static enum ivy_status push_linefeed(struct ivy_lexer *lex) { + if (lex->lex_prev_token == IVY_TOK_LINEFEED) { + return IVY_OK; + } + struct ivy_token *tok = malloc(sizeof *tok); if (!tok) { return IVY_ERR_NO_MEMORY; @@ -223,22 +417,126 @@ static enum ivy_status push_symbol(struct ivy_lexer *lex, enum ivy_symbol sym) return push_token(lex, tok); } +static enum ivy_status push_keyword(struct ivy_lexer *lex, enum ivy_keyword keyword) +{ + struct ivy_token *tok = malloc(sizeof *tok); + if (!tok) { + return IVY_ERR_NO_MEMORY; + } + + memset(tok, 0x0, sizeof *tok); + + tok->t_type = IVY_TOK_KEYWORD; + tok->t_keyword = keyword; + return push_token(lex, tok); +} + +static enum ivy_status read_line_comment(struct ivy_lexer *lex) +{ + while (true) { + int c = advance(lex); + + if (c == IVY_ERR_EOF || c == '\n') { + break; + } + + if (c < 0) { + return c; + } + } + + return IVY_OK; +} + +static enum ivy_status read_block_comment(struct ivy_lexer *lex) +{ + int depth = 1; + char buf[2] = {0}; + + while (depth > 0) { + int c = peek(lex); + if (c < 0) { + return c; + } + + if (!buf[0]) { + buf[0] = c; + } else if (!buf[1]) { + buf[1] = c; + } else { + buf[0] = buf[1]; + buf[1] = c; + } + + if (buf[0] == '/' && buf[1] == '*') { + depth++; + } else if (buf[0] == '*' && buf[1] == '/') { + depth--; + } + + advance(lex); + } + + return IVY_OK; +} + +static enum ivy_status read_symbol(struct ivy_lexer *lex) +{ + char sym_buf[32]; + unsigned int sym_len = 0; + struct ivy_lexer_symbol_node *node = lex->lex_sym_tree; + + while (true) { + int c = peek(lex); + + struct ivy_lexer_symbol_node *next = get_symbol_node(node, c); + if (!next) { + break; + } + + node = next; + advance(lex); + } + + if (!node || node->s_id == IVY_SYM_NONE) { + return IVY_ERR_BAD_SYNTAX; + } + + if (node->s_id == IVY_SYM_FORWARD_SLASH_ASTERISK) { + return read_block_comment(lex); + } else if (node->s_id == IVY_SYM_DOUBLE_HYPHEN) { + return read_line_comment(lex); + } + + push_symbol(lex, node->s_id); + return IVY_OK; +} + static enum ivy_status read_ident(struct ivy_lexer *lex) { b_string *str = b_string_create(); - int c = peek(lex); + bool label = false; while (true) { + int c = peek(lex); + if (c < 0) { break; } + if (c == ':' && peek_next(lex) != ':') { + advance(lex); + label = true; + break; + } + if (!isalnum(c) && c != '_') { break; } char s[2] = {c, 0}; b_string_append_cstr(str, s); + advance(lex); } const char *s = b_string_ptr(str); @@ -247,12 +545,18 @@ static enum ivy_status read_ident(struct ivy_lexer *lex) push_symbol(lex, IVY_SYM_UNDERSCORE); } - struct ivy_token *tok = create_token(IVY_TOK_IDENT); + enum ivy_keyword keyword = IVY_KW_NONE; + if (!label && (keyword = find_keyword_by_name(s)) != IVY_KW_NONE) { + b_string_release(str); + return push_keyword(lex, keyword); + } + + struct ivy_token *tok + = create_token(label ? IVY_TOK_LABEL : IVY_TOK_IDENT); tok->t_str = b_string_steal(str); b_string_release(str); - push_token(lex, tok); - return IVY_OK; + return push_token(lex, tok); } static enum ivy_status pump_tokens(struct ivy_lexer *lex) @@ -277,6 +581,15 @@ static enum ivy_status pump_tokens(struct ivy_lexer *lex) return push_linefeed(lex); } + while (isspace(c)) { + advance(lex); + c = peek(lex); + } + + if (char_can_begin_symbol(c)) { + return read_symbol(lex); + } + if (isalpha(c) || c == '_') { return read_ident(lex); } @@ -288,15 +601,16 @@ struct ivy_token *ivy_lexer_peek(struct ivy_lexer *lex) { enum ivy_status status = IVY_OK; - if (!lex->lex_queue) { + while (!lex->lex_queue) { status = pump_tokens(lex); + + if (status != IVY_OK) { + lex->lex_status = status; + return NULL; + } } - if (status != IVY_OK) { - lex->lex_status = status; - return NULL; - } - + lex->lex_status = status; struct ivy_token *tok = lex->lex_queue; return tok; } @@ -305,13 +619,13 @@ struct ivy_token *ivy_lexer_read(struct ivy_lexer *lex) { enum ivy_status status = IVY_OK; - if (!lex->lex_queue) { + while (!lex->lex_queue) { status = pump_tokens(lex); - } - if (status != IVY_OK) { - lex->lex_status = status; - return NULL; + if (status != IVY_OK) { + lex->lex_status = status; + return NULL; + } } struct ivy_token *tok = lex->lex_queue; @@ -333,3 +647,106 @@ void ivy_token_destroy(struct ivy_token *tok) free(tok); } + +#define ENUM_STR(x) \ + case x: \ + return #x + +const char *ivy_lex_token_type_to_string(enum ivy_token_type type) +{ + switch (type) { + ENUM_STR(IVY_TOK_NONE); + ENUM_STR(IVY_TOK_KEYWORD); + ENUM_STR(IVY_TOK_SYMBOL); + ENUM_STR(IVY_TOK_ATOM); + ENUM_STR(IVY_TOK_NUMBER); + ENUM_STR(IVY_TOK_LABEL); + ENUM_STR(IVY_TOK_IDENT); + ENUM_STR(IVY_TOK_STRING); + ENUM_STR(IVY_TOK_STR_START); + ENUM_STR(IVY_TOK_STR_END); + ENUM_STR(IVY_TOK_LINEFEED); + default: + return ""; + } +} + +const char *ivy_keyword_to_string(enum ivy_keyword keyword) +{ + switch (keyword) { + ENUM_STR(IVY_KW_NONE); + ENUM_STR(IVY_KW_PACKAGE); + ENUM_STR(IVY_KW_USE); + ENUM_STR(IVY_KW_CLASS); + ENUM_STR(IVY_KW_PROTOCOL); + ENUM_STR(IVY_KW_TRY); + ENUM_STR(IVY_KW_THROW); + ENUM_STR(IVY_KW_CATCH); + ENUM_STR(IVY_KW_IF); + ENUM_STR(IVY_KW_AND); + ENUM_STR(IVY_KW_OR); + ENUM_STR(IVY_KW_IS); + ENUM_STR(IVY_KW_NOT); + ENUM_STR(IVY_KW_ELSE); + ENUM_STR(IVY_KW_WHILE); + ENUM_STR(IVY_KW_FOR); + ENUM_STR(IVY_KW_MATCH); + ENUM_STR(IVY_KW_UNLESS); + ENUM_STR(IVY_KW_IN); + ENUM_STR(IVY_KW_DO); + ENUM_STR(IVY_KW_END); + default: + return ""; + } +} + +const char *ivy_symbol_to_string(enum ivy_symbol sym) +{ + switch (sym) { + ENUM_STR(IVY_SYM_NONE); + ENUM_STR(IVY_SYM_DOT); + ENUM_STR(IVY_SYM_LEFT_BRACE); + ENUM_STR(IVY_SYM_RIGHT_BRACE); + ENUM_STR(IVY_SYM_LEFT_BRACKET); + ENUM_STR(IVY_SYM_RIGHT_BRACKET); + ENUM_STR(IVY_SYM_LEFT_PAREN); + ENUM_STR(IVY_SYM_RIGHT_PAREN); + ENUM_STR(IVY_SYM_LEFT_ANGLE); + ENUM_STR(IVY_SYM_RIGHT_ANGLE); + ENUM_STR(IVY_SYM_COLON); + ENUM_STR(IVY_SYM_DOUBLE_COLON); + ENUM_STR(IVY_SYM_PLUS); + ENUM_STR(IVY_SYM_HYPHEN); + ENUM_STR(IVY_SYM_DOUBLE_HYPHEN); + ENUM_STR(IVY_SYM_FORWARD_SLASH); + ENUM_STR(IVY_SYM_ASTERISK); + ENUM_STR(IVY_SYM_PERCENT); + ENUM_STR(IVY_SYM_AMPERSAND); + ENUM_STR(IVY_SYM_EQUAL); + ENUM_STR(IVY_SYM_DOUBLE_EQUAL); + ENUM_STR(IVY_SYM_DOUBLE_LEFT_ANGLE); + ENUM_STR(IVY_SYM_DOUBLE_RIGHT_ANGLE); + ENUM_STR(IVY_SYM_DOUBLE_LEFT_ANGLE_EQUAL); + ENUM_STR(IVY_SYM_DOUBLE_RIGHT_ANGLE_EQUAL); + ENUM_STR(IVY_SYM_PLUS_EQUAL); + ENUM_STR(IVY_SYM_HYPHEN_EQUAL); + ENUM_STR(IVY_SYM_FORWARD_SLASH_EQUAL); + ENUM_STR(IVY_SYM_ASTERISK_EQUAL); + ENUM_STR(IVY_SYM_AMPERSAND_EQUAL); + ENUM_STR(IVY_SYM_PIPE_EQUAL); + ENUM_STR(IVY_SYM_PERCENT_EQUAL); + ENUM_STR(IVY_SYM_CARET_EQUAL); + ENUM_STR(IVY_SYM_BANG); + ENUM_STR(IVY_SYM_PIPE); + ENUM_STR(IVY_SYM_CARET); + ENUM_STR(IVY_SYM_UNDERSCORE); + ENUM_STR(IVY_SYM_COMMA); + ENUM_STR(IVY_SYM_DOLLAR); + ENUM_STR(IVY_SYM_RIGHT_ARROW); + ENUM_STR(IVY_SYM_BIG_RIGHT_ARROW); + ENUM_STR(IVY_SYM_FORWARD_SLASH_ASTERISK); + ENUM_STR(IVY_SYM_ASTERISK_FORWARD_SLASH); + default: + return ""; + } +}