lang: lex: implement state stack; make lexer structure opaque

the state stack is used to track whether a string is currently being read,
what kind of string it is, and whether or not we are in an interpolation
within that string.
This commit is contained in:
2024-11-16 23:03:43 +00:00
parent 412e9abf05
commit f14c387a6e
2 changed files with 157 additions and 35 deletions

View File

@@ -104,24 +104,15 @@ struct ivy_token {
};
struct ivy_lexer_symbol_node;
struct ivy_lexer_state;
struct ivy_lexer;
IVY_API enum ivy_status ivy_lexer_create(struct ivy_lexer **lex);
IVY_API void ivy_lexer_destroy(struct ivy_lexer *lex);
/* Lexer context as laid out in the public header.
 * NOTE(review): the same diff forward-declares `struct ivy_lexer` and adds
 * ivy_lexer_create()/ivy_lexer_destroy(), so this appears to be the definition
 * that is being removed to make the type opaque -- confirm against the tree. */
struct ivy_lexer {
/* input line source (see ivy_lexer_set_source()) */
struct ivy_line_source *lex_source;
/* status value reported by ivy_lexer_get_status() */
enum ivy_status lex_status;
/* head of the list of pending tokens, linked through t_next */
struct ivy_token *lex_queue;
/* root of the symbol tree walked when reading symbols */
struct ivy_lexer_symbol_node *lex_sym_tree;
/* presumably the type of the most recently produced token -- TODO confirm */
enum ivy_token_type lex_prev_token;
/* bit flags (STATE_STRING, STATE_FSTRING, STATE_INTERPOLATION) that are
 * tested, set and cleared with bitwise operators */
unsigned int lex_state;
/* line buffer storage and its bookkeeping; lex_linebuf_ptr is presumably the
 * current read offset -- TODO confirm */
char *lex_linebuf;
size_t lex_linebuf_len;
size_t lex_linebuf_cap;
size_t lex_linebuf_ptr;
};
IVY_API enum ivy_status ivy_lexer_init(struct ivy_lexer *lex);
IVY_API void ivy_lexer_finish(struct ivy_lexer *lex);
IVY_API void ivy_lexer_set_source(
struct ivy_lexer *lex, struct ivy_line_source *src);
IVY_API enum ivy_status ivy_lexer_get_status(struct ivy_lexer *lex);
IVY_API struct ivy_token *ivy_lexer_peek(struct ivy_lexer *lex);
IVY_API struct ivy_token *ivy_lexer_read(struct ivy_lexer *lex);

View File

@@ -15,6 +15,24 @@
.id = (i), .name = (n) \
}
/* Private definition of the lexer context. Instances are heap-allocated and
 * zero-filled by ivy_lexer_create() and released by ivy_lexer_destroy(). */
struct ivy_lexer {
/* root of the symbol tree built by build_symbol_tree(); read_symbol() starts
 * its walk here */
struct ivy_lexer_symbol_node *lex_sym_tree;
/* input line source, assigned by ivy_lexer_set_source() */
struct ivy_line_source *lex_source;
/* status value returned by ivy_lexer_get_status() */
enum ivy_status lex_status;
/* head of the list of pending tokens, linked through t_next */
struct ivy_token *lex_queue;
/* presumably the type of the most recently produced token -- TODO confirm */
enum ivy_token_type lex_prev_token;
/* stack of `struct lexer_state` records (linked via s_entry); the back of the
 * queue is the current state */
b_queue lex_state;
/* running brace nesting level: incremented for each left-brace symbol and
 * decremented for each right-brace symbol handled by read_symbol() */
unsigned int lex_brace_depth;
/* line buffer storage and its bookkeeping; lex_linebuf_ptr is presumably the
 * current read offset -- TODO confirm */
char *lex_linebuf;
size_t lex_linebuf_len;
size_t lex_linebuf_cap;
size_t lex_linebuf_ptr;
};
enum lexer_state_type {
STATE_NORMAL,
STATE_STRING,
@@ -24,6 +42,8 @@ enum lexer_state_type {
/* One record on the lexer's state stack (struct ivy_lexer::lex_state). */
struct lexer_state {
/* which kind of state this record represents */
enum lexer_state_type s_type;
/* value of the lexer's lex_brace_depth at the moment this record was pushed;
 * read_symbol() compares against it to decide when an interpolation ends */
unsigned int s_brace_depth;
/* link used to place this record in the lex_state queue */
b_queue_entry s_entry;
};
struct ivy_lexer_symbol_node {
@@ -112,6 +132,57 @@ static struct lex_token_def symbols[] = {
};
static const size_t nr_symbols = sizeof symbols / sizeof symbols[0];
/* Allocate a zero-initialised state record of kind `state_type`, stamp it
 * with the lexer's current brace depth and append it to the back of the
 * state stack, making it the current state.
 *
 * Returns the new record, or NULL if the allocation failed (the stack is not
 * modified in that case). Records are released by pop_lexer_state() or
 * destroy_state_stack(). */
static struct lexer_state *push_lexer_state(
	struct ivy_lexer *lex, enum lexer_state_type state_type)
{
	struct lexer_state *top = calloc(1, sizeof *top);
	if (top == NULL) {
		return NULL;
	}

	top->s_brace_depth = lex->lex_brace_depth;
	top->s_type = state_type;

	b_queue_push_back(&lex->lex_state, &top->s_entry);
	return top;
}
/* Remove the record at the back of the state stack and free it.
 * Does nothing when the stack is empty. */
static void pop_lexer_state(struct ivy_lexer *lex)
{
	b_queue_entry *top = b_queue_pop_back(&lex->lex_state);

	if (top != NULL) {
		free(b_unbox(struct lexer_state, top, s_entry));
	}
}
/* Return the record at the back of the state stack (the current state)
 * without removing it, or NULL when the stack is empty. */
static struct lexer_state *get_lexer_state(struct ivy_lexer *lex)
{
	b_queue_entry *top = b_queue_last(&lex->lex_state);

	return top ? b_unbox(struct lexer_state, top, s_entry) : NULL;
}
/* Unlink and free every state record held in `state`. */
static void destroy_state_stack(b_queue *state)
{
	b_queue_iterator cursor;

	/* No explicit advance step: the loop relies on
	 * b_queue_iterator_erase() to move the cursor on (presumably --
	 * confirm against the b_queue API). */
	for (b_queue_iterator_begin(state, &cursor);
	     b_queue_iterator_is_valid(&cursor);) {
		struct lexer_state *doomed
			= b_unbox(struct lexer_state, cursor.entry, s_entry);

		/* unlink before freeing so the queue never refers to
		 * released memory */
		b_queue_iterator_erase(&cursor);
		free(doomed);
	}
}
static struct ivy_lexer_symbol_node *get_symbol_node(
struct ivy_lexer_symbol_node *node, char c)
{
@@ -245,8 +316,13 @@ static enum ivy_keyword find_keyword_by_name(const char *s)
return IVY_KW_NONE;
}
enum ivy_status ivy_lexer_init(struct ivy_lexer *lex)
enum ivy_status ivy_lexer_create(struct ivy_lexer **lexp)
{
struct ivy_lexer *lex = malloc(sizeof *lex);
if (!lex) {
return IVY_ERR_NO_MEMORY;
}
memset(lex, 0x0, sizeof *lex);
lex->lex_status = IVY_OK;
@@ -256,15 +332,26 @@ enum ivy_status ivy_lexer_init(struct ivy_lexer *lex)
lex->lex_linebuf_cap = LINEBUF_DEFAULT_CAPACITY;
lex->lex_sym_tree = build_symbol_tree();
if (!lex->lex_sym_tree) {
ivy_lexer_destroy(lex);
return IVY_ERR_NO_MEMORY;
}
if (!push_lexer_state(lex, STATE_NORMAL)) {
ivy_lexer_destroy(lex);
return IVY_ERR_NO_MEMORY;
}
print_symbol_node(lex->lex_sym_tree, 0);
/* TODO only do keyword initialisation once */
init_keywords();
*lexp = lex;
return IVY_OK;
}
void ivy_lexer_finish(struct ivy_lexer *lex)
void ivy_lexer_destroy(struct ivy_lexer *lex)
{
while (lex->lex_queue) {
struct ivy_token *next = lex->lex_queue->t_next;
@@ -280,7 +367,19 @@ void ivy_lexer_finish(struct ivy_lexer *lex)
destroy_symbol_tree(lex->lex_sym_tree);
}
memset(lex, 0x0, sizeof *lex);
destroy_state_stack(&lex->lex_state);
free(lex);
}
/* Set the line source the lexer uses for input.
 *
 * Only the pointer is stored; nothing is read from `src` here and any
 * previously set source is simply replaced. `lex` must not be NULL. */
void ivy_lexer_set_source(struct ivy_lexer *lex, struct ivy_line_source *src)
{
lex->lex_source = src;
}
/* Return the lexer's stored status value (lex_status), which
 * ivy_lexer_create() initialises to IVY_OK. `lex` must not be NULL. */
enum ivy_status ivy_lexer_get_status(struct ivy_lexer *lex)
{
return lex->lex_status;
}
static enum ivy_status refill_linebuf(struct ivy_lexer *lex)
@@ -538,27 +637,38 @@ static enum ivy_status read_block_comment(struct ivy_lexer *lex)
/* Handle a single-quote marker, which delimits an fstring.
 *
 * If the current lexer state is STATE_FSTRING the marker closes that
 * fstring: the state is popped and push_string_end() is called. Otherwise
 * the marker opens a new fstring: push_string_start() is called and, if it
 * succeeds, a STATE_FSTRING record is pushed onto the state stack.
 *
 * Returns the result of push_string_end() or push_string_start(), or
 * IVY_ERR_NO_MEMORY if the new state record could not be allocated. */
static enum ivy_status read_squote_marker(struct ivy_lexer *lex)
{
	struct lexer_state *state = get_lexer_state(lex);

	/* get_lexer_state() returns NULL for an empty stack; treat that the
	 * same as "not inside an fstring" rather than dereferencing it */
	if (state && state->s_type == STATE_FSTRING) {
		/* already within an fstring */
		pop_lexer_state(lex);
		return push_string_end(lex);
	}

	/* start of a new fstring */
	enum ivy_status status = push_string_start(lex);
	if (status != IVY_OK) {
		return status;
	}

	if (!push_lexer_state(lex, STATE_FSTRING)) {
		return IVY_ERR_NO_MEMORY;
	}

	return IVY_OK;
}
/* Handler for a double-quote marker.
 *
 * Currently a stub: it does not touch `lex` and always returns IVY_OK.
 * NOTE(review): presumably meant to mirror read_squote_marker() for
 * STATE_STRING once implemented -- confirm intent. */
static enum ivy_status read_dquote_marker(struct ivy_lexer *lex)
{
return IVY_OK;
}
static enum ivy_status read_string_content(struct ivy_lexer *lex)
{
int c;
b_string *str = b_string_create();
struct lexer_state *state = get_lexer_state(lex);
if (!str) {
return IVY_ERR_NO_MEMORY;
@@ -567,20 +677,17 @@ static enum ivy_status read_string_content(struct ivy_lexer *lex)
while (true) {
c = peek(lex);
if (c == '{') {
if (state->s_type == STATE_FSTRING && (c == '\'' || c == '{')) {
break;
}
if ((lex->lex_state & STATE_FSTRING) && c == '\'') {
break;
}
if ((lex->lex_state & STATE_STRING) && c == '"') {
if (state->s_type == STATE_STRING && c == '"') {
break;
}
char s[2] = {c, 0};
b_string_append_cstr(str, s);
advance(lex);
}
if (b_string_get_size(str, B_STRLEN_NORMAL) == 0) {
@@ -602,6 +709,7 @@ static enum ivy_status read_string_content(struct ivy_lexer *lex)
static enum ivy_status read_symbol(struct ivy_lexer *lex)
{
struct ivy_lexer_symbol_node *node = lex->lex_sym_tree;
struct lexer_state *state = get_lexer_state(lex);
while (true) {
int c = peek(lex);
@@ -628,6 +736,23 @@ static enum ivy_status read_symbol(struct ivy_lexer *lex)
return read_block_comment(lex);
case IVY_SYM_DOUBLE_HYPHEN:
return read_line_comment(lex);
case IVY_SYM_LEFT_BRACE:
push_symbol(lex, node->s_id);
lex->lex_brace_depth++;
if (state->s_type == STATE_FSTRING) {
push_lexer_state(lex, STATE_INTERPOLATION);
}
return IVY_OK;
case IVY_SYM_RIGHT_BRACE:
push_symbol(lex, node->s_id);
lex->lex_brace_depth--;
if (state->s_type == STATE_INTERPOLATION
&& lex->lex_brace_depth < state->s_brace_depth) {
pop_lexer_state(lex);
}
return IVY_OK;
default:
push_symbol(lex, node->s_id);
return IVY_OK;
@@ -684,20 +809,26 @@ static enum ivy_status read_ident(struct ivy_lexer *lex)
static enum ivy_status pump_tokens(struct ivy_lexer *lex)
{
enum ivy_status status;
struct lexer_state *state = get_lexer_state(lex);
int c = peek(lex);
if (c < 0) {
return c;
}
if (lex->lex_state & STATE_STRING && c != '"') {
if (state->s_type == STATE_STRING && c != '"') {
return read_string_content(lex);
}
if ((lex->lex_state & STATE_FSTRING) && !(lex->lex_state & STATE_INTERPOLATION)) {
if (state->s_type == STATE_FSTRING && c != '\'' && c != '{') {
return read_string_content(lex);
}
/* `state` is invalid past this point, as the read_* functions may
* perform state transitions. */
state = NULL;
if (c == '\n') {
while (c == '\n') {
advance(lex);