lang: lex: implement state stack; make lexer structure opaque

The state stack tracks whether a string is currently being read, what kind
of string it is, and whether the lexer is inside an interpolation within
that string.
This commit is contained in:
2024-11-16 23:03:43 +00:00
parent 412e9abf05
commit f14c387a6e
2 changed files with 157 additions and 35 deletions

View File

@@ -104,24 +104,15 @@ struct ivy_token {
}; };
struct ivy_lexer_symbol_node; struct ivy_lexer_symbol_node;
struct ivy_lexer_state;
struct ivy_lexer;
IVY_API enum ivy_status ivy_lexer_create(struct ivy_lexer **lex);
IVY_API void ivy_lexer_destroy(struct ivy_lexer *lex);
struct ivy_lexer { IVY_API void ivy_lexer_set_source(
struct ivy_line_source *lex_source; struct ivy_lexer *lex, struct ivy_line_source *src);
enum ivy_status lex_status; IVY_API enum ivy_status ivy_lexer_get_status(struct ivy_lexer *lex);
struct ivy_token *lex_queue;
struct ivy_lexer_symbol_node *lex_sym_tree;
enum ivy_token_type lex_prev_token;
unsigned int lex_state;
char *lex_linebuf;
size_t lex_linebuf_len;
size_t lex_linebuf_cap;
size_t lex_linebuf_ptr;
};
IVY_API enum ivy_status ivy_lexer_init(struct ivy_lexer *lex);
IVY_API void ivy_lexer_finish(struct ivy_lexer *lex);
IVY_API struct ivy_token *ivy_lexer_peek(struct ivy_lexer *lex); IVY_API struct ivy_token *ivy_lexer_peek(struct ivy_lexer *lex);
IVY_API struct ivy_token *ivy_lexer_read(struct ivy_lexer *lex); IVY_API struct ivy_token *ivy_lexer_read(struct ivy_lexer *lex);

View File

@@ -15,6 +15,24 @@
.id = (i), .name = (n) \ .id = (i), .name = (n) \
} }
/*
 * Private lexer state. The structure is defined here in the implementation
 * file; callers obtain an instance from ivy_lexer_create() and release it
 * with ivy_lexer_destroy().
 */
struct ivy_lexer {
/* Root of the symbol tree produced by build_symbol_tree(); read_symbol()
 * starts its walk from this node. */
struct ivy_lexer_symbol_node *lex_sym_tree;
/* Line source assigned through ivy_lexer_set_source(). */
struct ivy_line_source *lex_source;
/* Status code reported by ivy_lexer_get_status(); set to IVY_OK on
 * creation. */
enum ivy_status lex_status;
/* Head of the pending-token list; tokens are chained through t_next. */
struct ivy_token *lex_queue;
/* NOTE(review): presumably the type of the most recently produced token --
 * confirm against the code that writes it. */
enum ivy_token_type lex_prev_token;
/* Stack of struct lexer_state nodes linked through their s_entry member;
 * the last entry in the queue is the current state. */
b_queue lex_state;
/* Brace nesting counter: incremented when a left brace symbol is read and
 * decremented when a right brace symbol is read. Its value is copied into
 * each lexer_state at push time. */
unsigned int lex_brace_depth;
/* Line buffer bookkeeping. lex_linebuf_cap starts at
 * LINEBUF_DEFAULT_CAPACITY. NOTE(review): len/ptr presumably hold the
 * number of valid bytes and the read cursor -- confirm in refill_linebuf(). */
char *lex_linebuf;
size_t lex_linebuf_len;
size_t lex_linebuf_cap;
size_t lex_linebuf_ptr;
};
enum lexer_state_type { enum lexer_state_type {
STATE_NORMAL, STATE_NORMAL,
STATE_STRING, STATE_STRING,
@@ -24,6 +42,8 @@ enum lexer_state_type {
struct lexer_state { struct lexer_state {
enum lexer_state_type s_type; enum lexer_state_type s_type;
unsigned int s_brace_depth;
b_queue_entry s_entry;
}; };
struct ivy_lexer_symbol_node { struct ivy_lexer_symbol_node {
@@ -112,6 +132,57 @@ static struct lex_token_def symbols[] = {
}; };
static const size_t nr_symbols = sizeof symbols / sizeof symbols[0]; static const size_t nr_symbols = sizeof symbols / sizeof symbols[0];
/*
 * Allocate a new lexer state of the given type and make it the current
 * (top-of-stack) state.
 *
 * The lexer's brace depth at the time of the call is recorded in the new
 * state.
 *
 * Returns the new state, or NULL if the allocation failed, in which case
 * the state stack is left untouched.
 */
static struct lexer_state *push_lexer_state(
	struct ivy_lexer *lex, enum lexer_state_type state_type)
{
	/* calloc() returns zero-filled storage, so every field starts at 0. */
	struct lexer_state *fresh = calloc(1, sizeof *fresh);
	if (fresh == NULL) {
		return NULL;
	}

	fresh->s_brace_depth = lex->lex_brace_depth;
	fresh->s_type = state_type;

	b_queue_push_back(&lex->lex_state, &fresh->s_entry);
	return fresh;
}
/*
 * Remove the current (top-of-stack) lexer state and release its memory.
 * Does nothing when the state stack is empty.
 */
static void pop_lexer_state(struct ivy_lexer *lex)
{
	b_queue_entry *top = b_queue_pop_back(&lex->lex_state);

	if (top != NULL) {
		/* Recover the enclosing lexer_state from its queue link. */
		free(b_unbox(struct lexer_state, top, s_entry));
	}
}
/*
 * Return the current (top-of-stack) lexer state without removing it, or
 * NULL when the state stack is empty.
 */
static struct lexer_state *get_lexer_state(struct ivy_lexer *lex)
{
	b_queue_entry *top = b_queue_last(&lex->lex_state);

	return top ? b_unbox(struct lexer_state, top, s_entry) : NULL;
}
/*
 * Free every lexer_state held in the given state stack, unlinking each one
 * from the queue before its memory is released.
 */
static void destroy_state_stack(b_queue *state)
{
	b_queue_iterator cursor;

	/* No increment clause: the loop advances only through
	 * b_queue_iterator_erase(), exactly as the while-form did. */
	for (b_queue_iterator_begin(state, &cursor);
	     b_queue_iterator_is_valid(&cursor);) {
		struct lexer_state *doomed
			= b_unbox(struct lexer_state, cursor.entry, s_entry);

		/* Unlink first so the node is never freed while still queued. */
		b_queue_iterator_erase(&cursor);
		free(doomed);
	}
}
static struct ivy_lexer_symbol_node *get_symbol_node( static struct ivy_lexer_symbol_node *get_symbol_node(
struct ivy_lexer_symbol_node *node, char c) struct ivy_lexer_symbol_node *node, char c)
{ {
@@ -245,8 +316,13 @@ static enum ivy_keyword find_keyword_by_name(const char *s)
return IVY_KW_NONE; return IVY_KW_NONE;
} }
enum ivy_status ivy_lexer_init(struct ivy_lexer *lex) enum ivy_status ivy_lexer_create(struct ivy_lexer **lexp)
{ {
struct ivy_lexer *lex = malloc(sizeof *lex);
if (!lex) {
return IVY_ERR_NO_MEMORY;
}
memset(lex, 0x0, sizeof *lex); memset(lex, 0x0, sizeof *lex);
lex->lex_status = IVY_OK; lex->lex_status = IVY_OK;
@@ -256,15 +332,26 @@ enum ivy_status ivy_lexer_init(struct ivy_lexer *lex)
lex->lex_linebuf_cap = LINEBUF_DEFAULT_CAPACITY; lex->lex_linebuf_cap = LINEBUF_DEFAULT_CAPACITY;
lex->lex_sym_tree = build_symbol_tree(); lex->lex_sym_tree = build_symbol_tree();
if (!lex->lex_sym_tree) {
ivy_lexer_destroy(lex);
return IVY_ERR_NO_MEMORY;
}
if (!push_lexer_state(lex, STATE_NORMAL)) {
ivy_lexer_destroy(lex);
return IVY_ERR_NO_MEMORY;
}
print_symbol_node(lex->lex_sym_tree, 0); print_symbol_node(lex->lex_sym_tree, 0);
/* TODO only do keyword initialisation once */ /* TODO only do keyword initialisation once */
init_keywords(); init_keywords();
*lexp = lex;
return IVY_OK; return IVY_OK;
} }
void ivy_lexer_finish(struct ivy_lexer *lex) void ivy_lexer_destroy(struct ivy_lexer *lex)
{ {
while (lex->lex_queue) { while (lex->lex_queue) {
struct ivy_token *next = lex->lex_queue->t_next; struct ivy_token *next = lex->lex_queue->t_next;
@@ -280,7 +367,19 @@ void ivy_lexer_finish(struct ivy_lexer *lex)
destroy_symbol_tree(lex->lex_sym_tree); destroy_symbol_tree(lex->lex_sym_tree);
} }
memset(lex, 0x0, sizeof *lex); destroy_state_stack(&lex->lex_state);
free(lex);
}
/*
 * Set the line source used by the lexer. Only the pointer is stored; this
 * function does not copy or otherwise touch `src`.
 */
void ivy_lexer_set_source(struct ivy_lexer *lex, struct ivy_line_source *src)
{
lex->lex_source = src;
}
/* Return the lexer's current status code (its lex_status field). */
enum ivy_status ivy_lexer_get_status(struct ivy_lexer *lex)
{
return lex->lex_status;
} }
static enum ivy_status refill_linebuf(struct ivy_lexer *lex) static enum ivy_status refill_linebuf(struct ivy_lexer *lex)
@@ -538,27 +637,38 @@ static enum ivy_status read_block_comment(struct ivy_lexer *lex)
static enum ivy_status read_squote_marker(struct ivy_lexer *lex) static enum ivy_status read_squote_marker(struct ivy_lexer *lex)
{ {
enum ivy_status status = IVY_OK; enum ivy_status status = IVY_OK;
struct lexer_state *state = get_lexer_state(lex);
if (lex->lex_state & STATE_FSTRING) { if (state->s_type == STATE_FSTRING) {
/* already within an fstring */ /* already within an fstring */
lex->lex_state &= ~STATE_FSTRING; pop_lexer_state(lex);
return push_string_end(lex); return push_string_end(lex);
} else { }
/* start of a new fstring */ /* start of a new fstring */
status = push_string_start(lex); status = push_string_start(lex);
lex->lex_state |= STATE_FSTRING;
if (status != IVY_OK) {
return status;
} }
if (!push_lexer_state(lex, STATE_FSTRING)) {
return IVY_ERR_NO_MEMORY;
}
return IVY_OK;
} }
static enum ivy_status read_dquote_marker(struct ivy_lexer *lex) static enum ivy_status read_dquote_marker(struct ivy_lexer *lex)
{ {
return IVY_OK;
} }
static enum ivy_status read_string_content(struct ivy_lexer *lex) static enum ivy_status read_string_content(struct ivy_lexer *lex)
{ {
int c; int c;
b_string *str = b_string_create(); b_string *str = b_string_create();
struct lexer_state *state = get_lexer_state(lex);
if (!str) { if (!str) {
return IVY_ERR_NO_MEMORY; return IVY_ERR_NO_MEMORY;
@@ -567,20 +677,17 @@ static enum ivy_status read_string_content(struct ivy_lexer *lex)
while (true) { while (true) {
c = peek(lex); c = peek(lex);
if (c == '{') { if (state->s_type == STATE_FSTRING && (c == '\'' || c == '{')) {
break; break;
} }
if ((lex->lex_state & STATE_FSTRING) && c == '\'') { if (state->s_type == STATE_STRING && c == '"') {
break;
}
if ((lex->lex_state & STATE_STRING) && c == '"') {
break; break;
} }
char s[2] = {c, 0}; char s[2] = {c, 0};
b_string_append_cstr(str, s); b_string_append_cstr(str, s);
advance(lex);
} }
if (b_string_get_size(str, B_STRLEN_NORMAL) == 0) { if (b_string_get_size(str, B_STRLEN_NORMAL) == 0) {
@@ -602,6 +709,7 @@ static enum ivy_status read_string_content(struct ivy_lexer *lex)
static enum ivy_status read_symbol(struct ivy_lexer *lex) static enum ivy_status read_symbol(struct ivy_lexer *lex)
{ {
struct ivy_lexer_symbol_node *node = lex->lex_sym_tree; struct ivy_lexer_symbol_node *node = lex->lex_sym_tree;
struct lexer_state *state = get_lexer_state(lex);
while (true) { while (true) {
int c = peek(lex); int c = peek(lex);
@@ -628,6 +736,23 @@ static enum ivy_status read_symbol(struct ivy_lexer *lex)
return read_block_comment(lex); return read_block_comment(lex);
case IVY_SYM_DOUBLE_HYPHEN: case IVY_SYM_DOUBLE_HYPHEN:
return read_line_comment(lex); return read_line_comment(lex);
case IVY_SYM_LEFT_BRACE:
push_symbol(lex, node->s_id);
lex->lex_brace_depth++;
if (state->s_type == STATE_FSTRING) {
push_lexer_state(lex, STATE_INTERPOLATION);
}
return IVY_OK;
case IVY_SYM_RIGHT_BRACE:
push_symbol(lex, node->s_id);
lex->lex_brace_depth--;
if (state->s_type == STATE_INTERPOLATION
&& lex->lex_brace_depth < state->s_brace_depth) {
pop_lexer_state(lex);
}
return IVY_OK;
default: default:
push_symbol(lex, node->s_id); push_symbol(lex, node->s_id);
return IVY_OK; return IVY_OK;
@@ -684,20 +809,26 @@ static enum ivy_status read_ident(struct ivy_lexer *lex)
static enum ivy_status pump_tokens(struct ivy_lexer *lex) static enum ivy_status pump_tokens(struct ivy_lexer *lex)
{ {
enum ivy_status status; enum ivy_status status;
struct lexer_state *state = get_lexer_state(lex);
int c = peek(lex); int c = peek(lex);
if (c < 0) { if (c < 0) {
return c; return c;
} }
if (lex->lex_state & STATE_STRING && c != '"') { if (state->s_type == STATE_STRING && c != '"') {
return read_string_content(lex); return read_string_content(lex);
} }
if ((lex->lex_state & STATE_FSTRING) && !(lex->lex_state & STATE_INTERPOLATION)) { if (state->s_type == STATE_FSTRING && c != '\'' && c != '{') {
return read_string_content(lex); return read_string_content(lex);
} }
/* `state` is invalid past this point, as the read_* functions may
* perform state transitions. */
state = NULL;
if (c == '\n') { if (c == '\n') {
while (c == '\n') { while (c == '\n') {
advance(lex); advance(lex);