From f14c387a6ebfe58ae385d6652099e6d082d7cdf9 Mon Sep 17 00:00:00 2001 From: Max Wash Date: Sat, 16 Nov 2024 23:03:43 +0000 Subject: [PATCH] lang: lex: implement state stack; make lexer structure opaque the state stack is used to track whether a string is currently being read, what kind of string it is, and whether or not we are in an interpolation within that string. --- lang/include/ivy/lang/lex.h | 23 ++--- lang/lex.c | 169 ++++++++++++++++++++++++++++++++---- 2 files changed, 157 insertions(+), 35 deletions(-) diff --git a/lang/include/ivy/lang/lex.h b/lang/include/ivy/lang/lex.h index bd50059..774541a 100644 --- a/lang/include/ivy/lang/lex.h +++ b/lang/include/ivy/lang/lex.h @@ -104,24 +104,15 @@ struct ivy_token { }; struct ivy_lexer_symbol_node; +struct ivy_lexer_state; /* NOTE(review): no definition or use of struct ivy_lexer_state is visible in this patch (lex.c defines struct lexer_state) - confirm this forward declaration is needed */ +struct ivy_lexer; /* opaque to callers: the definition is in lang/lex.c */ +IVY_API enum ivy_status ivy_lexer_create(struct ivy_lexer **lex); /* allocates a lexer and stores it in *lex on success; pair with ivy_lexer_destroy() */ +IVY_API void ivy_lexer_destroy(struct ivy_lexer *lex); /* releases what the lexer holds and frees lex itself */ -struct ivy_lexer { - struct ivy_line_source *lex_source; - enum ivy_status lex_status; - struct ivy_token *lex_queue; - struct ivy_lexer_symbol_node *lex_sym_tree; - enum ivy_token_type lex_prev_token; - unsigned int lex_state; - - char *lex_linebuf; - size_t lex_linebuf_len; - size_t lex_linebuf_cap; - size_t lex_linebuf_ptr; -}; - -IVY_API enum ivy_status ivy_lexer_init(struct ivy_lexer *lex); -IVY_API void ivy_lexer_finish(struct ivy_lexer *lex); +IVY_API void ivy_lexer_set_source( + struct ivy_lexer *lex, struct ivy_line_source *src); /* stores src as the line source of lex */ +IVY_API enum ivy_status ivy_lexer_get_status(struct ivy_lexer *lex); /* returns the lex_status field of lex */ IVY_API struct ivy_token *ivy_lexer_peek(struct ivy_lexer *lex); IVY_API struct ivy_token *ivy_lexer_read(struct ivy_lexer *lex); diff --git a/lang/lex.c b/lang/lex.c index 3f108f4..1909e27 100644 --- a/lang/lex.c +++ b/lang/lex.c @@ -15,6 +15,24 @@ .id = (i), .name = (n) \ } +struct ivy_lexer { + struct ivy_lexer_symbol_node *lex_sym_tree; + struct ivy_line_source *lex_source; + + enum ivy_status lex_status; + + struct ivy_token *lex_queue; + enum 
ivy_token_type lex_prev_token; + + b_queue lex_state; /* stack of struct lexer_state linked through s_entry; the back of the queue is the current state */ + unsigned int lex_brace_depth; /* incremented by read_symbol() on IVY_SYM_LEFT_BRACE and decremented on IVY_SYM_RIGHT_BRACE */ + + char *lex_linebuf; + size_t lex_linebuf_len; + size_t lex_linebuf_cap; + size_t lex_linebuf_ptr; +}; + enum lexer_state_type { STATE_NORMAL, STATE_STRING, @@ -24,6 +42,8 @@ enum lexer_state_type { struct lexer_state { enum lexer_state_type s_type; + unsigned int s_brace_depth; /* value of lex_brace_depth at the moment this state was pushed */ + b_queue_entry s_entry; }; struct ivy_lexer_symbol_node { @@ -112,6 +132,57 @@ static struct lex_token_def symbols[] = { }; static const size_t nr_symbols = sizeof symbols / sizeof symbols[0]; +static struct lexer_state *push_lexer_state( + struct ivy_lexer *lex, enum lexer_state_type state_type) +{ /* Allocates a zeroed lexer_state of the given type, records the current lex_brace_depth in it and pushes it onto the back of lex->lex_state. Returns the new state, or NULL if malloc fails. */ + struct lexer_state *state = malloc(sizeof *state); + if (!state) { + return NULL; + } + + memset(state, 0x0, sizeof *state); + + state->s_type = state_type; + state->s_brace_depth = lex->lex_brace_depth; + b_queue_push_back(&lex->lex_state, &state->s_entry); + + return state; +} + +static void pop_lexer_state(struct ivy_lexer *lex) +{ /* Pops the entry at the back of lex->lex_state and frees its lexer_state; does nothing when b_queue_pop_back() yields no entry. */ + b_queue_entry *entry = b_queue_pop_back(&lex->lex_state); + if (!entry) { + return; + } + + struct lexer_state *state = b_unbox(struct lexer_state, entry, s_entry); + free(state); +} + +static struct lexer_state *get_lexer_state(struct ivy_lexer *lex) +{ /* Returns the lexer_state that b_queue_last() reports for lex->lex_state without removing it, or NULL when there is no entry. */ + b_queue_entry *entry = b_queue_last(&lex->lex_state); + if (!entry) { + return NULL; + } + + return b_unbox(struct lexer_state, entry, s_entry); +} + +static void destroy_state_stack(b_queue *state) +{ /* Erases every entry from the queue and frees the lexer_state that contains it. NOTE(review): the loop assumes b_queue_iterator_erase() leaves the iterator on the following entry - confirm against the b_queue API. */ + b_queue_iterator it; + b_queue_iterator_begin(state, &it); + while (b_queue_iterator_is_valid(&it)) { + struct lexer_state *node + = b_unbox(struct lexer_state, it.entry, s_entry); + b_queue_iterator_erase(&it); + + free(node); + } +} + static struct ivy_lexer_symbol_node *get_symbol_node( struct ivy_lexer_symbol_node *node, char c) { @@ -245,8 +316,13 @@ static enum ivy_keyword find_keyword_by_name(const char *s) return IVY_KW_NONE; } -enum ivy_status ivy_lexer_init(struct ivy_lexer *lex) +enum ivy_status 
ivy_lexer_create(struct ivy_lexer **lexp) /* Allocates and initialises a lexer and stores it in *lexp on success. Returns IVY_ERR_NO_MEMORY when an allocation fails; a partially built lexer is destroyed first. NOTE(review): lexp is written without a NULL check. */ { + struct ivy_lexer *lex = malloc(sizeof *lex); + if (!lex) { + return IVY_ERR_NO_MEMORY; + } + memset(lex, 0x0, sizeof *lex); lex->lex_status = IVY_OK; @@ -256,15 +332,26 @@ enum ivy_status ivy_lexer_init(struct ivy_lexer *lex) lex->lex_linebuf_cap = LINEBUF_DEFAULT_CAPACITY; lex->lex_sym_tree = build_symbol_tree(); + if (!lex->lex_sym_tree) { + ivy_lexer_destroy(lex); + return IVY_ERR_NO_MEMORY; + } + + if (!push_lexer_state(lex, STATE_NORMAL)) { /* base state: the read_* functions dereference get_lexer_state() unchecked, so the stack must not start empty */ + ivy_lexer_destroy(lex); + return IVY_ERR_NO_MEMORY; + } + print_symbol_node(lex->lex_sym_tree, 0); /* TODO only do keyword initialisation once */ init_keywords(); + *lexp = lex; /* only written on the success path */ return IVY_OK; } -void ivy_lexer_finish(struct ivy_lexer *lex) +void ivy_lexer_destroy(struct ivy_lexer *lex) /* Releases what the lexer holds, then frees lex itself. NOTE(review): lex is dereferenced without a NULL check. */ { while (lex->lex_queue) { struct ivy_token *next = lex->lex_queue->t_next; @@ -280,7 +367,19 @@ void ivy_lexer_finish(struct ivy_lexer *lex) destroy_symbol_tree(lex->lex_sym_tree); } - memset(lex, 0x0, sizeof *lex); + destroy_state_stack(&lex->lex_state); + + free(lex); /* lex was heap-allocated by ivy_lexer_create() */ +} + +void ivy_lexer_set_source(struct ivy_lexer *lex, struct ivy_line_source *src) +{ /* Stores src in lex->lex_source; nothing else is reset here. */ + lex->lex_source = src; +} + +enum ivy_status ivy_lexer_get_status(struct ivy_lexer *lex) +{ /* Accessor for lex->lex_status, needed now that struct ivy_lexer is opaque to callers. */ + return lex->lex_status; } static enum ivy_status refill_linebuf(struct ivy_lexer *lex) @@ -538,27 +637,38 @@ static enum ivy_status read_block_comment(struct ivy_lexer *lex) static enum ivy_status read_squote_marker(struct ivy_lexer *lex) { enum ivy_status status = IVY_OK; + struct lexer_state *state = get_lexer_state(lex); /* NOTE(review): get_lexer_state() can return NULL and state is dereferenced unchecked below; presumably the STATE_NORMAL entry pushed by ivy_lexer_create() is never popped - confirm */ - if (lex->lex_state & STATE_FSTRING) { + if (state->s_type == STATE_FSTRING) { /* already within an fstring */ - lex->lex_state &= ~STATE_FSTRING; - return push_string_end(lex); - } else { - /* start of a new fstring */ - status = push_string_start(lex); - lex->lex_state |= STATE_FSTRING; + pop_lexer_state(lex); /* leave the fstring state before calling push_string_end() */ + return push_string_end(lex); } + + /* start of a new fstring */ + status = push_string_start(lex); + + if (status 
!= IVY_OK) { + return status; + } + + if (!push_lexer_state(lex, STATE_FSTRING)) { /* enter the fstring state only after push_string_start() succeeded */ + return IVY_ERR_NO_MEMORY; + } + + return IVY_OK; } static enum ivy_status read_dquote_marker(struct ivy_lexer *lex) { - + return IVY_OK; /* currently a no-op: no token is pushed and no state transition is made here */ } static enum ivy_status read_string_content(struct ivy_lexer *lex) { int c; b_string *str = b_string_create(); + struct lexer_state *state = get_lexer_state(lex); /* NOTE(review): dereferenced unchecked in the loop below - confirm the state stack cannot be empty here */ if (!str) { return IVY_ERR_NO_MEMORY; @@ -567,20 +677,17 @@ static enum ivy_status read_string_content(struct ivy_lexer *lex) while (true) { c = peek(lex); - if (c == '{') { + if (state->s_type == STATE_FSTRING && (c == '\'' || c == '{')) { /* inside an fstring, content stops at the closing quote or at a left brace. NOTE(review): this loop has no check for a negative peek() result, which pump_tokens() treats as a status code - confirm it cannot occur here */ break; } - if ((lex->lex_state & STATE_FSTRING) && c == '\'') { - break; - } - - if ((lex->lex_state & STATE_STRING) && c == '"') { + if (state->s_type == STATE_STRING && c == '"') { break; } char s[2] = {c, 0}; b_string_append_cstr(str, s); + advance(lex); /* consume the character that was just appended */ } if (b_string_get_size(str, B_STRLEN_NORMAL) == 0) { @@ -602,6 +709,7 @@ static enum ivy_status read_string_content(struct ivy_lexer *lex) static enum ivy_status read_symbol(struct ivy_lexer *lex) { struct ivy_lexer_symbol_node *node = lex->lex_sym_tree; + struct lexer_state *state = get_lexer_state(lex); /* state on entry, fetched before any push or pop below; NOTE(review): dereferenced unchecked in the brace cases */ while (true) { int c = peek(lex); @@ -628,6 +736,23 @@ static enum ivy_status read_symbol(struct ivy_lexer *lex) return read_block_comment(lex); case IVY_SYM_DOUBLE_HYPHEN: return read_line_comment(lex); + case IVY_SYM_LEFT_BRACE: + push_symbol(lex, node->s_id); + lex->lex_brace_depth++; /* incremented before the push below, so the new state records the depth that includes this brace */ + + if (state->s_type == STATE_FSTRING) { + push_lexer_state(lex, STATE_INTERPOLATION); /* NOTE(review): return value ignored, so an allocation failure is not reported to the caller */ + } + return IVY_OK; + case IVY_SYM_RIGHT_BRACE: + push_symbol(lex, node->s_id); + lex->lex_brace_depth--; /* NOTE(review): unsigned and decremented unconditionally; wraps if this symbol is read while the depth is 0 */ + + if (state->s_type == STATE_INTERPOLATION + && lex->lex_brace_depth < state->s_brace_depth) { /* depth fell below the value recorded when the interpolation state was pushed, so leave that state */ + pop_lexer_state(lex); + } + return IVY_OK; default: push_symbol(lex, node->s_id); return IVY_OK; @@ -684,20 +809,26 @@ static enum ivy_status read_ident(struct ivy_lexer *lex) static enum ivy_status pump_tokens(struct 
ivy_lexer *lex) { enum ivy_status status; + struct lexer_state *state = get_lexer_state(lex); + int c = peek(lex); if (c < 0) { return c; } - if (lex->lex_state & STATE_STRING && c != '"') { + if (state->s_type == STATE_STRING && c != '"') { return read_string_content(lex); } - if ((lex->lex_state & STATE_FSTRING) && !(lex->lex_state & STATE_INTERPOLATION)) { + if (state->s_type == STATE_FSTRING && c != '\'' && c != '{') { return read_string_content(lex); } + /* `state` is invalid past this point, as the read_* functions may + * perform state transitions. */ + state = NULL; + if (c == '\n') { while (c == '\n') { advance(lex);