diff --git a/lang/CMakeLists.txt b/lang/CMakeLists.txt
index 47cec5c..10885e2 100644
--- a/lang/CMakeLists.txt
+++ b/lang/CMakeLists.txt
@@ -2,3 +2,4 @@ file(GLOB_RECURSE lang_sources *.c *.h include/ivy/lang/*.h)
 
 add_library(ivy-lang SHARED ${lang_sources})
 target_include_directories(ivy-lang PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/)
+target_link_libraries(ivy-lang ivy-common Bluelib::Core Bluelib::Object)
diff --git a/lang/include/ivy/lang/lex.h b/lang/include/ivy/lang/lex.h
index e69de29..fa2e558 100644
--- a/lang/include/ivy/lang/lex.h
+++ b/lang/include/ivy/lang/lex.h
@@ -0,0 +1,147 @@
+#ifndef IVY_LANG_LEX_H_
+#define IVY_LANG_LEX_H_
+
+/*
+ * NOTE(review): the two project header names below were missing from the
+ * reviewed text. They are reconstructed from the names this file needs
+ * (struct ivy_line_source, enum ivy_status); confirm the paths.
+ */
+#include <ivy/line-source.h>
+#include <ivy/status.h>
+#include <stddef.h>
+
+/* Kind of a token; tells which member of the ivy_token union is in use. */
+enum ivy_token_type {
+	IVY_TOK_NONE = 0,
+	IVY_TOK_KEYWORD,
+	IVY_TOK_SYMBOL,
+	IVY_TOK_ATOM,
+	IVY_TOK_NUMBER,
+	IVY_TOK_LABEL,
+	IVY_TOK_IDENT,
+	IVY_TOK_STRING,
+	IVY_TOK_STR_START,
+	IVY_TOK_STR_END,
+	IVY_TOK_LINEFEED,
+};
+
+/* Reserved words, stored in ivy_token.t_keyword. */
+enum ivy_keyword {
+	IVY_KW_NONE = 0,
+	IVY_KW_PACKAGE,
+	IVY_KW_USE,
+	IVY_KW_CLASS,
+	IVY_KW_PROTOCOL,
+	IVY_KW_TRY,
+	IVY_KW_THROW,
+	IVY_KW_CATCH,
+	IVY_KW_IF,
+	IVY_KW_AND,
+	IVY_KW_OR,
+	IVY_KW_IS,
+	IVY_KW_NOT,
+	IVY_KW_ELSE,
+	IVY_KW_WHILE,
+	IVY_KW_FOR,
+	IVY_KW_MATCH,
+	IVY_KW_UNLESS,
+	IVY_KW_IN,
+	IVY_KW_DO,
+	IVY_KW_END,
+};
+
+/* Operators and punctuation, stored in ivy_token.t_symbol. */
+enum ivy_symbol {
+	IVY_SYM_NONE = 0,
+	IVY_SYM_DOT,
+	IVY_SYM_LEFT_BRACE,
+	IVY_SYM_RIGHT_BRACE,
+	IVY_SYM_LEFT_BRACKET,
+	IVY_SYM_RIGHT_BRACKET,
+	IVY_SYM_LEFT_PAREN,
+	IVY_SYM_RIGHT_PAREN,
+	IVY_SYM_LEFT_ANGLE,
+	IVY_SYM_RIGHT_ANGLE,
+	IVY_SYM_COLON,
+	IVY_SYM_DOUBLE_COLON,
+	IVY_SYM_PLUS,
+	IVY_SYM_MINUS,
+	IVY_SYM_FORWARD_SLASH,
+	IVY_SYM_ASTERISK,
+	IVY_SYM_PERCENT,
+	IVY_SYM_AMPERSAND,
+	IVY_SYM_EQUAL,
+	IVY_SYM_DOUBLE_EQUAL,
+	IVY_SYM_DOUBLE_LEFT_ANGLE_EQUAL,
+	IVY_SYM_DOUBLE_RIGHT_ANGLE_EQUAL,
+	IVY_SYM_PLUS_EQUAL,
+	IVY_SYM_MINUS_EQUAL,
+	IVY_SYM_FORWARD_SLASH_EQUAL,
+	IVY_SYM_ASTERISK_EQUAL,
+	IVY_SYM_AMPERSAND_EQUAL,
+	IVY_SYM_PIPE_EQUAL,
+	IVY_SYM_PERCENT_EQUAL,
+	IVY_SYM_CARET_EQUAL,
+	IVY_SYM_BANG,
+	IVY_SYM_PIPE,
+	IVY_SYM_CARET,
+	IVY_SYM_UNDERSCORE,
+	IVY_SYM_COMMA,
+	IVY_SYM_DOLLAR,
+	IVY_SYM_RIGHT_ARROW,
+	IVY_SYM_BIG_RIGHT_ARROW,
+	IVY_SYM_FORWARD_SLASH_ASTERISK,
+	IVY_SYM_ASTERISK_FORWARD_SLASH,
+};
+
+/*
+ * A single lexed token. Tokens are heap-allocated, chained through t_next
+ * while queued in a lexer, and released with ivy_token_destroy().
+ */
+struct ivy_token {
+	enum ivy_token_type t_type;
+	struct ivy_token *t_next;
+
+	union {
+		enum ivy_keyword t_keyword;
+		enum ivy_symbol t_symbol;
+		signed long long t_number;
+		/* heap string owned by the token (ATOM, STRING, IDENT) */
+		char *t_str;
+	};
+};
+
+/*
+ * Lexer state. lex_source supplies lines of text, lex_linebuf holds the
+ * current line, and lex_queue is a singly linked list of tokens that have
+ * been produced but not yet read.
+ */
+struct ivy_lexer {
+	struct ivy_line_source *lex_source;
+	enum ivy_status lex_status;
+	struct ivy_token *lex_queue;
+
+	char *lex_linebuf;
+	size_t lex_linebuf_len;
+	size_t lex_linebuf_cap;
+	size_t lex_linebuf_ptr;
+};
+
+/* Zero *lex and allocate its line buffer; IVY_ERR_NO_MEMORY on failure. */
+extern enum ivy_status ivy_lexer_init(struct ivy_lexer *lex);
+/* Destroy every queued token, free the line buffer and zero *lex. */
+extern void ivy_lexer_finish(struct ivy_lexer *lex);
+
+/*
+ * Return the next token, producing it from the line source when the queue
+ * is empty. peek leaves the token queued (the lexer keeps ownership); read
+ * removes it and the caller must release it with ivy_token_destroy(). Both
+ * return NULL and store the failure in lex->lex_status on error.
+ */
+extern struct ivy_token *ivy_lexer_peek(struct ivy_lexer *lex);
+extern struct ivy_token *ivy_lexer_read(struct ivy_lexer *lex);
+
+/* Free a token and any string it owns. NULL is ignored. */
+extern void ivy_token_destroy(struct ivy_token *tok);
+
+#endif
diff --git a/lang/lex.c b/lang/lex.c
new file mode 100644
index 0000000..1e067a9
--- /dev/null
+++ b/lang/lex.c
@@ -0,0 +1,386 @@
+/*
+ * NOTE(review): the header names were missing from the reviewed text. The
+ * standard headers follow from the library calls made in this file; the two
+ * project paths (Bluelib string header, this module's own header) are
+ * assumed -- confirm them.
+ */
+#include <blue/object/string.h>
+#include <ctype.h>
+#include <ivy/lang/lex.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define LINEBUF_DEFAULT_CAPACITY 1024
+
+#define LEX_TOKEN_DEF(i, n) \
+	{ \
+		.id = (i), .name = (n) \
+	}
+
+/* Pairs a keyword or symbol id with its spelling in source text. */
+struct lex_token_def {
+	int id;
+	const char *name;
+};
+
+static const struct lex_token_def keywords[] = {
+	LEX_TOKEN_DEF(IVY_KW_PACKAGE, "package"),
+	LEX_TOKEN_DEF(IVY_KW_USE, "use"),
+	LEX_TOKEN_DEF(IVY_KW_CLASS, "class"),
+	LEX_TOKEN_DEF(IVY_KW_PROTOCOL, "protocol"),
+	LEX_TOKEN_DEF(IVY_KW_TRY, "try"),
+	LEX_TOKEN_DEF(IVY_KW_THROW, "throw"),
+	LEX_TOKEN_DEF(IVY_KW_CATCH, "catch"),
+	LEX_TOKEN_DEF(IVY_KW_IF, "if"),
+	LEX_TOKEN_DEF(IVY_KW_AND, "and"),
+	LEX_TOKEN_DEF(IVY_KW_OR, "or"),
+	LEX_TOKEN_DEF(IVY_KW_IS, "is"),
+	LEX_TOKEN_DEF(IVY_KW_NOT, "not"),
+	LEX_TOKEN_DEF(IVY_KW_ELSE, "else"),
+	LEX_TOKEN_DEF(IVY_KW_WHILE, "while"),
+	LEX_TOKEN_DEF(IVY_KW_FOR, "for"),
+	LEX_TOKEN_DEF(IVY_KW_MATCH, "match"),
+	LEX_TOKEN_DEF(IVY_KW_UNLESS, "unless"),
+	LEX_TOKEN_DEF(IVY_KW_IN, "in"),
+	LEX_TOKEN_DEF(IVY_KW_DO, "do"),
+	LEX_TOKEN_DEF(IVY_KW_END, "end"),
+};
+static const size_t nr_keywords = sizeof keywords / sizeof keywords[0];
+
+static const struct lex_token_def symbols[] = {
+	LEX_TOKEN_DEF(IVY_SYM_DOT, "."),
+	LEX_TOKEN_DEF(IVY_SYM_LEFT_BRACE, "{"),
+	LEX_TOKEN_DEF(IVY_SYM_RIGHT_BRACE, "}"),
+	LEX_TOKEN_DEF(IVY_SYM_LEFT_BRACKET, "["),
+	LEX_TOKEN_DEF(IVY_SYM_RIGHT_BRACKET, "]"),
+	LEX_TOKEN_DEF(IVY_SYM_LEFT_PAREN, "("),
+	LEX_TOKEN_DEF(IVY_SYM_RIGHT_PAREN, ")"),
+	LEX_TOKEN_DEF(IVY_SYM_LEFT_ANGLE, "<"),
+	LEX_TOKEN_DEF(IVY_SYM_RIGHT_ANGLE, ">"),
+	LEX_TOKEN_DEF(IVY_SYM_COLON, ":"),
+	LEX_TOKEN_DEF(IVY_SYM_DOUBLE_COLON, "::"),
+	LEX_TOKEN_DEF(IVY_SYM_PLUS, "+"),
+	LEX_TOKEN_DEF(IVY_SYM_MINUS, "-"),
+	LEX_TOKEN_DEF(IVY_SYM_FORWARD_SLASH, "/"),
+	LEX_TOKEN_DEF(IVY_SYM_ASTERISK, "*"),
+	LEX_TOKEN_DEF(IVY_SYM_PERCENT, "%"),
+	LEX_TOKEN_DEF(IVY_SYM_AMPERSAND, "&"),
+	LEX_TOKEN_DEF(IVY_SYM_EQUAL, "="),
+	LEX_TOKEN_DEF(IVY_SYM_DOUBLE_EQUAL, "=="),
+	LEX_TOKEN_DEF(IVY_SYM_DOUBLE_LEFT_ANGLE_EQUAL, "<<="),
+	LEX_TOKEN_DEF(IVY_SYM_DOUBLE_RIGHT_ANGLE_EQUAL, ">>="),
+	LEX_TOKEN_DEF(IVY_SYM_PLUS_EQUAL, "+="),
+	LEX_TOKEN_DEF(IVY_SYM_MINUS_EQUAL, "-="),
+	LEX_TOKEN_DEF(IVY_SYM_FORWARD_SLASH_EQUAL, "/="),
+	LEX_TOKEN_DEF(IVY_SYM_ASTERISK_EQUAL, "*="),
+	LEX_TOKEN_DEF(IVY_SYM_AMPERSAND_EQUAL, "&="),
+	LEX_TOKEN_DEF(IVY_SYM_PIPE_EQUAL, "|="),
+	LEX_TOKEN_DEF(IVY_SYM_PERCENT_EQUAL, "%="),
+	LEX_TOKEN_DEF(IVY_SYM_CARET_EQUAL, "^="),
+	LEX_TOKEN_DEF(IVY_SYM_BANG, "!"),
+	LEX_TOKEN_DEF(IVY_SYM_PIPE, "|"),
+	LEX_TOKEN_DEF(IVY_SYM_CARET, "^"),
+	LEX_TOKEN_DEF(IVY_SYM_UNDERSCORE, "_"),
+	LEX_TOKEN_DEF(IVY_SYM_COMMA, ","),
+	LEX_TOKEN_DEF(IVY_SYM_DOLLAR, "$"),
+	LEX_TOKEN_DEF(IVY_SYM_RIGHT_ARROW, "->"),
+	LEX_TOKEN_DEF(IVY_SYM_BIG_RIGHT_ARROW, "=>"),
+};
+static const size_t nr_symbols = sizeof symbols / sizeof symbols[0];
+
+/*
+ * Prepare a lexer for use: clear every field and allocate the line buffer.
+ * Returns IVY_ERR_NO_MEMORY if the buffer cannot be allocated, IVY_OK
+ * otherwise.
+ */
+enum ivy_status ivy_lexer_init(struct ivy_lexer *lex)
+{
+	memset(lex, 0x0, sizeof *lex);
+	lex->lex_status = IVY_OK;
+
+	lex->lex_linebuf = malloc(LINEBUF_DEFAULT_CAPACITY);
+	if (!lex->lex_linebuf) {
+		return IVY_ERR_NO_MEMORY;
+	}
+
+	lex->lex_linebuf_cap = LINEBUF_DEFAULT_CAPACITY;
+	return IVY_OK;
+}
+
+/* Destroy all queued tokens, free the line buffer and zero the lexer. */
+void ivy_lexer_finish(struct ivy_lexer *lex)
+{
+	while (lex->lex_queue) {
+		struct ivy_token *next = lex->lex_queue->t_next;
+		ivy_token_destroy(lex->lex_queue);
+		lex->lex_queue = next;
+	}
+
+	free(lex->lex_linebuf);
+	memset(lex, 0x0, sizeof *lex);
+}
+
+/*
+ * Read the next line from the source into the line buffer and restart
+ * scanning at its first byte. Returns IVY_ERR_EOF when no source is attached.
+ */
+static enum ivy_status refill_linebuf(struct ivy_lexer *lex)
+{
+	if (!lex->lex_source) {
+		return IVY_ERR_EOF;
+	}
+
+	enum ivy_status status = ivy_line_source_readline(
+		lex->lex_source, lex->lex_linebuf, lex->lex_linebuf_cap,
+		&lex->lex_linebuf_len, NULL);
+	if (status != IVY_OK) {
+		return status;
+	}
+
+	/*
+	 * Rewind to the start of the new line; keeping the old offset would
+	 * skip or misread input after the first line.
+	 * NOTE(review): assumes readline stores the line at the start of the
+	 * buffer and its length in lex_linebuf_len -- confirm.
+	 */
+	lex->lex_linebuf_ptr = 0;
+	return IVY_OK;
+}
+
+/*
+ * Make sure an unread byte is available at lex_linebuf_ptr, refilling the
+ * line buffer when it has been used up. An empty buffer is reported as
+ * IVY_ERR_EOF.
+ */
+static enum ivy_status ensure_input(struct ivy_lexer *lex)
+{
+	if (lex->lex_linebuf_ptr >= lex->lex_linebuf_len) {
+		enum ivy_status status = refill_linebuf(lex);
+		if (status != IVY_OK) {
+			return status;
+		}
+	}
+
+	if (lex->lex_linebuf_len == 0) {
+		return IVY_ERR_EOF;
+	}
+
+	return IVY_OK;
+}
+
+/*
+ * Return the next input byte without consuming it, or an ivy_status code on
+ * failure. Bytes are returned as unsigned char values so that bytes >= 0x80
+ * are never negative.
+ * NOTE(review): callers detect failure with `< 0`, which requires every
+ * ivy_status error code to be negative -- the enum is not visible here.
+ */
+static int peek(struct ivy_lexer *lex)
+{
+	enum ivy_status status = ensure_input(lex);
+	if (status != IVY_OK) {
+		return status;
+	}
+
+	return (unsigned char)lex->lex_linebuf[lex->lex_linebuf_ptr];
+}
+
+/* Same as peek(), but a byte that is returned is also consumed. */
+static int advance(struct ivy_lexer *lex)
+{
+	enum ivy_status status = ensure_input(lex);
+	if (status != IVY_OK) {
+		return status;
+	}
+
+	return (unsigned char)lex->lex_linebuf[lex->lex_linebuf_ptr++];
+}
+
+/* Report whether c is the first character of any entry in `symbols`. */
+static bool char_can_begin_symbol(char c)
+{
+	for (size_t i = 0; i < nr_symbols; i++) {
+		if (symbols[i].name[0] == c) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/* Allocate a zero-filled token of the given type; NULL when out of memory. */
+static struct ivy_token *create_token(enum ivy_token_type type)
+{
+	struct ivy_token *tok = calloc(1, sizeof *tok);
+	if (!tok) {
+		return NULL;
+	}
+
+	tok->t_type = type;
+	return tok;
+}
+
+/* Append tok to the end of the lexer's token queue. */
+static enum ivy_status push_token(struct ivy_lexer *lex, struct ivy_token *tok)
+{
+	struct ivy_token **slot = &lex->lex_queue;
+
+	while (*slot) {
+		slot = &(*slot)->t_next;
+	}
+
+	*slot = tok;
+	return IVY_OK;
+}
+
+/* Queue a LINEFEED token. */
+static enum ivy_status push_linefeed(struct ivy_lexer *lex)
+{
+	struct ivy_token *tok = create_token(IVY_TOK_LINEFEED);
+	if (!tok) {
+		return IVY_ERR_NO_MEMORY;
+	}
+
+	return push_token(lex, tok);
+}
+
+/* Queue a SYMBOL token carrying sym. */
+static enum ivy_status push_symbol(struct ivy_lexer *lex, enum ivy_symbol sym)
+{
+	struct ivy_token *tok = create_token(IVY_TOK_SYMBOL);
+	if (!tok) {
+		return IVY_ERR_NO_MEMORY;
+	}
+
+	tok->t_symbol = sym;
+	return push_token(lex, tok);
+}
+
+/*
+ * Consume a run of letters, digits and underscores and queue it as an IDENT
+ * token whose t_str holds the text. A lone "_" is queued as the
+ * IVY_SYM_UNDERSCORE symbol instead.
+ */
+static enum ivy_status read_ident(struct ivy_lexer *lex)
+{
+	b_string *str = b_string_create();
+	if (!str) {
+		return IVY_ERR_NO_MEMORY;
+	}
+
+	/* Consume each accepted byte and look again, or the scan never ends. */
+	int c = peek(lex);
+	while (c >= 0 && (isalnum(c) || c == '_')) {
+		char s[2] = {(char)c, 0};
+		b_string_append_cstr(str, s);
+
+		advance(lex);
+		c = peek(lex);
+	}
+
+	if (!strcmp(b_string_ptr(str), "_")) {
+		/* str is released here, so it must not be used past this point. */
+		b_string_release(str);
+		return push_symbol(lex, IVY_SYM_UNDERSCORE);
+	}
+
+	struct ivy_token *tok = create_token(IVY_TOK_IDENT);
+	if (!tok) {
+		b_string_release(str);
+		return IVY_ERR_NO_MEMORY;
+	}
+
+	tok->t_str = b_string_steal(str);
+	b_string_release(str);
+
+	return push_token(lex, tok);
+}
+
+/*
+ * Lex one token from the input and queue it. A run of newlines becomes a
+ * single LINEFEED token (none is queued if reading fails right after the
+ * run); a letter or underscore starts an identifier. Any other byte is
+ * reported as IVY_ERR_BAD_SYNTAX.
+ */
+static enum ivy_status pump_tokens(struct ivy_lexer *lex)
+{
+	int c = peek(lex);
+	if (c < 0) {
+		return c;
+	}
+
+	if (c == '\n') {
+		while (c == '\n') {
+			advance(lex);
+			c = peek(lex);
+		}
+
+		if (c < 0) {
+			return c;
+		}
+
+		return push_linefeed(lex);
+	}
+
+	if (isalpha(c) || c == '_') {
+		return read_ident(lex);
+	}
+
+	return IVY_ERR_BAD_SYNTAX;
+}
+
+/*
+ * Return the token at the head of the queue without removing it, lexing a
+ * new one first if the queue is empty. On failure the status is stored in
+ * lex->lex_status and NULL is returned.
+ */
+struct ivy_token *ivy_lexer_peek(struct ivy_lexer *lex)
+{
+	if (!lex->lex_queue) {
+		enum ivy_status status = pump_tokens(lex);
+		if (status != IVY_OK) {
+			lex->lex_status = status;
+			return NULL;
+		}
+	}
+
+	return lex->lex_queue;
+}
+
+/*
+ * Remove and return the token at the head of the queue; the caller releases
+ * it with ivy_token_destroy(). Fails the same way as ivy_lexer_peek().
+ */
+struct ivy_token *ivy_lexer_read(struct ivy_lexer *lex)
+{
+	struct ivy_token *tok = ivy_lexer_peek(lex);
+	if (!tok) {
+		return NULL;
+	}
+
+	/* Unlink the token so the caller never sees the rest of the queue. */
+	lex->lex_queue = tok->t_next;
+	tok->t_next = NULL;
+	return tok;
+}
+
+/* Free tok and the string it owns, if any. A NULL tok is ignored. */
+void ivy_token_destroy(struct ivy_token *tok)
+{
+	if (!tok) {
+		return;
+	}
+
+	switch (tok->t_type) {
+	case IVY_TOK_ATOM:
+	case IVY_TOK_STRING:
+	case IVY_TOK_IDENT:
+		free(tok->t_str);
+		break;
+	default:
+		break;
+	}
+
+	free(tok);
+}