From b0cbe42fc4d216c596d67458f546a8ada0e33b35 Mon Sep 17 00:00:00 2001 From: Max Wash Date: Fri, 7 Nov 2025 09:49:29 +0000 Subject: [PATCH] lang: lex: add support for kebab-case identifiers and negative numbers identifiers can now contain hyphens, with the following restrictions: * an identifier cannot start or end with a hyphen. * an identifier cannot contain more than one hyphen in a row. kebab-case identifiers can be used for type and variable names, as well as message identifiers and labels. a hyphen immediately followed by a digit is now lexed as the sign of a negative number literal (e.g. -5). to avoid ambiguity, the lexer now enforces whitespace around most binary operators (with a few exceptions, such as semicolons). trying to compile a "compact" arithmetic expression, such as y=1+2, will now result in a "missing whitespace" error. --- lang/include/ivy/lang/lex.h | 2 +- lang/internal.c | 6 +- lang/lex.c | 286 ++++++++++++++++++++++-------------- lang/lex.h | 8 +- 4 files changed, 185 insertions(+), 117 deletions(-) diff --git a/lang/include/ivy/lang/lex.h b/lang/include/ivy/lang/lex.h index 070bb41..fd80160 100644 --- a/lang/include/ivy/lang/lex.h +++ b/lang/include/ivy/lang/lex.h @@ -128,7 +128,7 @@ struct ivy_token { union { enum ivy_keyword t_keyword; enum ivy_symbol t_symbol; - unsigned long long t_int; + long long t_int; double t_double; char *t_str; }; diff --git a/lang/internal.c b/lang/internal.c index c78cf78..82bcac0 100644 --- a/lang/internal.c +++ b/lang/internal.c @@ -14,8 +14,10 @@ static void print_symbol_node(struct ivy_lexer_symbol_node *node, int depth) b_printf("[cyan]%c[reset]", node->s_char); - if (node->s_id != IVY_SYM_NONE) { - b_printf(" ([magenta]%s[reset])", ivy_symbol_to_string(node->s_id)); + if (node->s_def != NULL) { + b_printf( + " ([magenta]%s[reset])", + ivy_symbol_to_string(node->s_def->id)); } b_printf("\n"); diff --git a/lang/lex.c b/lang/lex.c index 01cee07..522b177 100644 --- a/lang/lex.c +++ b/lang/lex.c @@ -17,7 +17,9 @@ #define LINEBUF_DEFAULT_CAPACITY 1024 -#define LEX_TOKEN_DEF(i, n) {.id = (i), .name = (n)} +#define
LEX_TOKEN_DEF2(i, n, f) {.id = (i), .name = (n), .flags = (f)} +#define LEX_TOKEN_DEF(i, n) LEX_TOKEN_DEF2(i, n, 0) +#define LEX_TOKEN_DEF_W(i, n) LEX_TOKEN_DEF2(i, n, LEX_TOK_REQUIRES_WHITESPACE) static struct lex_token_def keywords[] = { LEX_TOKEN_DEF(IVY_KW_PACKAGE, "package"), @@ -65,36 +67,36 @@ static struct lex_token_def symbols[] = { LEX_TOKEN_DEF(IVY_SYM_RIGHT_BRACKET, "]"), LEX_TOKEN_DEF(IVY_SYM_LEFT_PAREN, "("), LEX_TOKEN_DEF(IVY_SYM_RIGHT_PAREN, ")"), - LEX_TOKEN_DEF(IVY_SYM_LEFT_ANGLE, "<"), - LEX_TOKEN_DEF(IVY_SYM_RIGHT_ANGLE, ">"), + LEX_TOKEN_DEF_W(IVY_SYM_LEFT_ANGLE, "<"), + LEX_TOKEN_DEF_W(IVY_SYM_RIGHT_ANGLE, ">"), LEX_TOKEN_DEF(IVY_SYM_COLON, ":"), LEX_TOKEN_DEF(IVY_SYM_DOUBLE_COLON, "::"), - LEX_TOKEN_DEF(IVY_SYM_PLUS, "+"), - LEX_TOKEN_DEF(IVY_SYM_HYPHEN, "-"), + LEX_TOKEN_DEF_W(IVY_SYM_PLUS, "+"), + LEX_TOKEN_DEF_W(IVY_SYM_HYPHEN, "-"), LEX_TOKEN_DEF(IVY_SYM_DOUBLE_HYPHEN, "--"), - LEX_TOKEN_DEF(IVY_SYM_FORWARD_SLASH, "/"), - LEX_TOKEN_DEF(IVY_SYM_ASTERISK, "*"), + LEX_TOKEN_DEF_W(IVY_SYM_FORWARD_SLASH, "/"), + LEX_TOKEN_DEF_W(IVY_SYM_ASTERISK, "*"), LEX_TOKEN_DEF(IVY_SYM_FORWARD_SLASH_ASTERISK, "/*"), LEX_TOKEN_DEF(IVY_SYM_ASTERISK_FORWARD_SLASH, "*/"), - LEX_TOKEN_DEF(IVY_SYM_PERCENT, "%"), - LEX_TOKEN_DEF(IVY_SYM_AMPERSAND, "&"), - LEX_TOKEN_DEF(IVY_SYM_EQUAL, "="), - LEX_TOKEN_DEF(IVY_SYM_DOUBLE_EQUAL, "=="), - LEX_TOKEN_DEF(IVY_SYM_DOUBLE_LEFT_ANGLE, "<<"), - LEX_TOKEN_DEF(IVY_SYM_DOUBLE_RIGHT_ANGLE, ">>"), - LEX_TOKEN_DEF(IVY_SYM_LEFT_ANGLE_EQUAL, "<="), - LEX_TOKEN_DEF(IVY_SYM_RIGHT_ANGLE_EQUAL, ">="), - LEX_TOKEN_DEF(IVY_SYM_DOUBLE_LEFT_ANGLE_EQUAL, "<<="), - LEX_TOKEN_DEF(IVY_SYM_DOUBLE_RIGHT_ANGLE_EQUAL, ">>="), - LEX_TOKEN_DEF(IVY_SYM_PLUS_EQUAL, "+="), - LEX_TOKEN_DEF(IVY_SYM_HYPHEN_EQUAL, "-="), - LEX_TOKEN_DEF(IVY_SYM_FORWARD_SLASH_EQUAL, "/="), - LEX_TOKEN_DEF(IVY_SYM_ASTERISK_EQUAL, "*="), - LEX_TOKEN_DEF(IVY_SYM_AMPERSAND_EQUAL, "&="), - LEX_TOKEN_DEF(IVY_SYM_PIPE_EQUAL, "|="), - LEX_TOKEN_DEF(IVY_SYM_PERCENT_EQUAL, "%="), - 
LEX_TOKEN_DEF(IVY_SYM_CARET_EQUAL, "^="), - LEX_TOKEN_DEF(IVY_SYM_BANG_EQUAL, "!="), + LEX_TOKEN_DEF_W(IVY_SYM_PERCENT, "%"), + LEX_TOKEN_DEF_W(IVY_SYM_AMPERSAND, "&"), + LEX_TOKEN_DEF_W(IVY_SYM_EQUAL, "="), + LEX_TOKEN_DEF_W(IVY_SYM_DOUBLE_EQUAL, "=="), + LEX_TOKEN_DEF_W(IVY_SYM_DOUBLE_LEFT_ANGLE, "<<"), + LEX_TOKEN_DEF_W(IVY_SYM_DOUBLE_RIGHT_ANGLE, ">>"), + LEX_TOKEN_DEF_W(IVY_SYM_LEFT_ANGLE_EQUAL, "<="), + LEX_TOKEN_DEF_W(IVY_SYM_RIGHT_ANGLE_EQUAL, ">="), + LEX_TOKEN_DEF_W(IVY_SYM_DOUBLE_LEFT_ANGLE_EQUAL, "<<="), + LEX_TOKEN_DEF_W(IVY_SYM_DOUBLE_RIGHT_ANGLE_EQUAL, ">>="), + LEX_TOKEN_DEF_W(IVY_SYM_PLUS_EQUAL, "+="), + LEX_TOKEN_DEF_W(IVY_SYM_HYPHEN_EQUAL, "-="), + LEX_TOKEN_DEF_W(IVY_SYM_FORWARD_SLASH_EQUAL, "/="), + LEX_TOKEN_DEF_W(IVY_SYM_ASTERISK_EQUAL, "*="), + LEX_TOKEN_DEF_W(IVY_SYM_AMPERSAND_EQUAL, "&="), + LEX_TOKEN_DEF_W(IVY_SYM_PIPE_EQUAL, "|="), + LEX_TOKEN_DEF_W(IVY_SYM_PERCENT_EQUAL, "%="), + LEX_TOKEN_DEF_W(IVY_SYM_CARET_EQUAL, "^="), + LEX_TOKEN_DEF_W(IVY_SYM_BANG_EQUAL, "!="), LEX_TOKEN_DEF(IVY_SYM_HASH, "#"), LEX_TOKEN_DEF(IVY_SYM_BANG, "!"), LEX_TOKEN_DEF(IVY_SYM_PIPE, "|"), @@ -111,6 +113,44 @@ static struct lex_token_def symbols[] = { }; static const size_t nr_symbols = sizeof symbols / sizeof symbols[0]; +static void report_unrecognised_char(struct ivy_lexer *lex, int c) +{ + struct ivy_diag *diag = ivy_diag_ctx_create_diag( + lex->lex_diag_ctx, IVY_LANG_E_UNRECOGNISED_SYMBOL); + + ivy_diag_set_location(diag, lex->lex_cursor_row, lex->lex_cursor_col); + ivy_diag_push_msg(diag, IVY_LANG_MSG_UNKNOWN_SYMBOL_ENCOUNTERED); + + const struct ivy_diag_highlight hl[] = { + IVY_DIAG_HL( + ERROR, lex->lex_cursor_row, lex->lex_cursor_col, + lex->lex_cursor_row, lex->lex_cursor_col), + }; + const size_t nr_hl = sizeof hl / sizeof hl[0]; + + ivy_diag_push_snippet( + diag, lex->lex_cursor_row, lex->lex_cursor_row, NULL, 0, hl, nr_hl); +} + +static void report_missing_whitespace(struct ivy_lexer *lex, int msg) +{ + struct ivy_diag *diag = 
ivy_diag_ctx_create_diag( + lex->lex_diag_ctx, IVY_LANG_E_MISSING_WHITESPACE); + + ivy_diag_set_location(diag, lex->lex_cursor_row, lex->lex_cursor_col); + ivy_diag_push_msg(diag, msg); + + const struct ivy_diag_highlight hl[] = { + IVY_DIAG_HL( + ERROR, lex->lex_token_start_row, lex->lex_token_start_col, + lex->lex_token_end_row, lex->lex_token_end_col), + }; + const size_t nr_hl = sizeof hl / sizeof hl[0]; + + ivy_diag_push_snippet( + diag, lex->lex_cursor_row, lex->lex_cursor_row, NULL, 0, hl, nr_hl); +} + static struct lexer_state *push_lexer_state( struct ivy_lexer *lex, enum lexer_state_type state_type) { @@ -209,14 +249,14 @@ static enum ivy_status put_symbol( memset(child, 0x0, sizeof *child); - child->s_id = IVY_SYM_NONE; + child->s_def = NULL; child->s_char = c; b_queue_push_back(&tree->s_children, &child->s_entry); tree = child; } - tree->s_id = sym->id; + tree->s_def = sym; return IVY_OK; } @@ -245,7 +285,7 @@ static struct ivy_lexer_symbol_node *build_symbol_tree(void) } memset(root, 0x0, sizeof *root); - root->s_id = IVY_SYM_NONE; + root->s_def = NULL; enum ivy_status status = IVY_OK; for (size_t i = 0; i < nr_symbols; i++) { @@ -380,6 +420,11 @@ static enum ivy_status refill_linebuf(struct ivy_lexer *lex) return status; } +static int peek_prev(struct ivy_lexer *lex) +{ + return lex->lex_prev_char; +} + static int peek(struct ivy_lexer *lex) { enum ivy_status status = IVY_OK; @@ -441,9 +486,11 @@ static int advance(struct ivy_lexer *lex) } int c = lex->lex_linebuf[lex->lex_linebuf_ptr++]; + lex->lex_prev_char = c; + lex->lex_cur_char = lex->lex_linebuf[lex->lex_linebuf_ptr]; lex->lex_cursor_col++; - if (c == '\n') { + if (lex->lex_cur_char == '\n') { lex->lex_cursor_col = 1; lex->lex_cursor_row++; } @@ -802,73 +849,16 @@ static enum ivy_status read_string_content(struct ivy_lexer *lex) return status; } -static enum ivy_status read_symbol(struct ivy_lexer *lex) -{ - struct ivy_lexer_symbol_node *node = lex->lex_sym_tree; - struct lexer_state *state = 
get_lexer_state(lex); - set_token_start(lex); - - while (true) { - int c = peek(lex); - if (c < 0) { - break; - } - - struct ivy_lexer_symbol_node *next = get_symbol_node(node, c); - if (!next) { - break; - } - - node = next; - set_token_end(lex); - advance(lex); - } - - if (!node || node->s_id == IVY_SYM_NONE) { - return IVY_ERR_BAD_SYNTAX; - } - - switch (node->s_id) { - case IVY_SYM_SQUOTE: - return read_squote_marker(lex); - case IVY_SYM_DQUOTE: - return read_dquote_marker(lex); - case IVY_SYM_FORWARD_SLASH_ASTERISK: - return read_block_comment(lex); - case IVY_SYM_DOUBLE_HYPHEN: - return read_line_comment(lex); - case IVY_SYM_DOLLAR: - return read_atom(lex); - case IVY_SYM_LEFT_BRACE: - push_symbol(lex, node->s_id); - lex->lex_brace_depth++; - - if (state->s_type == STATE_FSTRING) { - push_lexer_state(lex, STATE_INTERPOLATION); - } - return IVY_OK; - case IVY_SYM_RIGHT_BRACE: - push_symbol(lex, node->s_id); - lex->lex_brace_depth--; - - if (state->s_type == STATE_INTERPOLATION - && lex->lex_brace_depth < state->s_brace_depth) { - pop_lexer_state(lex); - } - return IVY_OK; - default: - push_symbol(lex, node->s_id); - return IVY_OK; - } -} - -static enum ivy_status read_number(struct ivy_lexer *lex) +static enum ivy_status read_number(struct ivy_lexer *lex, bool negate) { int token_len = 0; int base = 10; int dots = 0; b_string *str = get_temp_string(lex); - set_token_start(lex); + + if (!negate) { + set_token_start(lex); + } while (true) { int c = peek(lex); @@ -969,6 +959,10 @@ static enum ivy_status read_number(struct ivy_lexer *lex) return IVY_ERR_BAD_SYNTAX; } + if (negate) { + v *= -1; + } + return push_double(lex, v); } else { @@ -978,15 +972,95 @@ static enum ivy_status read_number(struct ivy_lexer *lex) return IVY_ERR_BAD_SYNTAX; } + if (negate) { + v *= -1; + } + return push_int(lex, v); } } +static enum ivy_status read_symbol(struct ivy_lexer *lex) +{ + struct ivy_lexer_symbol_node *node = lex->lex_sym_tree; + struct lexer_state *state = 
get_lexer_state(lex); + set_token_start(lex); + char prefix = peek_prev(lex); + char prev = 0; + + while (true) { + int c = peek(lex); + if (c < 0) { + break; + } + + struct ivy_lexer_symbol_node *next = get_symbol_node(node, c); + if (!next) { + prev = c; + break; + } + + node = next; + set_token_end(lex); + advance(lex); + prev = c; + } + + if (!node || node->s_def == NULL) { + return IVY_ERR_BAD_SYNTAX; + } + + if (node->s_def->id == IVY_SYM_HYPHEN && isdigit(prev)) { + return read_number(lex, true); + } + + if ((node->s_def->flags & LEX_TOK_REQUIRES_WHITESPACE) + && (!isspace(prev) || !isspace(prefix))) { + report_missing_whitespace( + lex, IVY_LANG_MSG_WHITESPACE_REQUIRED_AROUND_BINARY_OP); + return IVY_ERR_BAD_SYNTAX; + } + + switch (node->s_def->id) { + case IVY_SYM_SQUOTE: + return read_squote_marker(lex); + case IVY_SYM_DQUOTE: + return read_dquote_marker(lex); + case IVY_SYM_FORWARD_SLASH_ASTERISK: + return read_block_comment(lex); + case IVY_SYM_DOUBLE_HYPHEN: + return read_line_comment(lex); + case IVY_SYM_DOLLAR: + return read_atom(lex); + case IVY_SYM_LEFT_BRACE: + push_symbol(lex, node->s_def->id); + lex->lex_brace_depth++; + + if (state->s_type == STATE_FSTRING) { + push_lexer_state(lex, STATE_INTERPOLATION); + } + return IVY_OK; + case IVY_SYM_RIGHT_BRACE: + push_symbol(lex, node->s_def->id); + lex->lex_brace_depth--; + + if (state->s_type == STATE_INTERPOLATION + && lex->lex_brace_depth < state->s_brace_depth) { + pop_lexer_state(lex); + } + return IVY_OK; + default: + push_symbol(lex, node->s_def->id); + return IVY_OK; + } +} + static enum ivy_status read_ident(struct ivy_lexer *lex) { b_string *str = get_temp_string(lex); bool label = false; set_token_start(lex); + char prev = 0; while (true) { int c = peek(lex); @@ -1002,14 +1076,19 @@ static enum ivy_status read_ident(struct ivy_lexer *lex) break; } - if (!isalnum(c) && c != '_') { + if (!isalnum(c) && c != '_' && c != '-') { break; } + if (c == '-' && prev == '-') { + return 
IVY_ERR_BAD_SYNTAX; + } + char s[2] = {c, 0}; b_string_append_cstr(str, s); set_token_end(lex); advance(lex); + prev = c; } const char *s = b_string_ptr(str); @@ -1029,25 +1108,6 @@ static enum ivy_status read_ident(struct ivy_lexer *lex) return push_token(lex, tok); } -static void report_unrecognised_char(struct ivy_lexer *lex, int c) -{ - struct ivy_diag *diag = ivy_diag_ctx_create_diag( - lex->lex_diag_ctx, IVY_LANG_E_UNRECOGNISED_SYMBOL); - - ivy_diag_set_location(diag, lex->lex_cursor_row, lex->lex_cursor_col); - ivy_diag_push_msg(diag, IVY_LANG_MSG_UNKNOWN_SYMBOL_ENCOUNTERED); - - const struct ivy_diag_highlight hl[] = { - IVY_DIAG_HL( - ERROR, lex->lex_cursor_row, lex->lex_cursor_col, - lex->lex_cursor_row, lex->lex_cursor_col), - }; - const size_t nr_hl = sizeof hl / sizeof hl[0]; - - ivy_diag_push_snippet( - diag, lex->lex_cursor_row, lex->lex_cursor_row, NULL, 0, hl, nr_hl); -} - static enum ivy_status pump_tokens(struct ivy_lexer *lex) { struct lexer_state *state = get_lexer_state(lex); @@ -1106,7 +1166,7 @@ static enum ivy_status pump_tokens(struct ivy_lexer *lex) } if (isdigit(c)) { - return read_number(lex); + return read_number(lex, false); } report_unrecognised_char(lex, c); diff --git a/lang/lex.h b/lang/lex.h index 6a841e1..d3bca2a 100644 --- a/lang/lex.h +++ b/lang/lex.h @@ -8,12 +8,17 @@ #include #include +enum lex_token_flags { + LEX_TOK_REQUIRES_WHITESPACE = 0x01u, +}; + struct ivy_lexer { struct ivy_lexer_symbol_node *lex_sym_tree; struct ivy_diag_ctx *lex_diag_ctx; struct ivy_line_source *lex_source; b_dict *lex_keywords; enum ivy_status lex_status; + int lex_prev_char, lex_cur_char; b_queue lex_queue; enum ivy_token_type lex_prev_token; @@ -47,7 +52,7 @@ struct lexer_state { struct ivy_lexer_symbol_node { char s_char; - enum ivy_symbol s_id; + struct lex_token_def *s_def; b_queue_entry s_entry; b_queue s_children; @@ -55,6 +60,7 @@ struct ivy_lexer_symbol_node { struct lex_token_def { int id; + enum lex_token_flags flags; const char *name; 
uint64_t name_hash; };