lang: lex: implement lexing of numbers and atoms

This commit is contained in:
2024-11-17 09:31:58 +00:00
parent f55b8f4851
commit 104e549b32
2 changed files with 192 additions and 16 deletions

View File

@@ -40,6 +40,8 @@ enum ivy_keyword {
IVY_KW_UNLESS,
IVY_KW_IN,
IVY_KW_DO,
IVY_KW_GET,
IVY_KW_SET,
IVY_KW_END,
};
@@ -79,14 +81,16 @@ enum ivy_symbol {
IVY_SYM_PIPE_EQUAL,
IVY_SYM_PERCENT_EQUAL,
IVY_SYM_CARET_EQUAL,
IVY_SYM_HASH,
IVY_SYM_BANG,
IVY_SYM_PIPE,
IVY_SYM_CARET,
IVY_SYM_UNDERSCORE,
IVY_SYM_COMMA,
IVY_SYM_SEMICOLON,
IVY_SYM_DOLLAR,
IVY_SYM_RIGHT_ARROW,
IVY_SYM_BIG_RIGHT_ARROW,
IVY_SYM_HYPHEN_RIGHT_ANGLE,
IVY_SYM_EQUAL_RIGHT_ANGLE,
IVY_SYM_FORWARD_SLASH_ASTERISK,
IVY_SYM_ASTERISK_FORWARD_SLASH,
};
@@ -98,7 +102,7 @@ struct ivy_token {
union {
enum ivy_keyword t_keyword;
enum ivy_symbol t_symbol;
signed long long t_number;
unsigned long long t_number;
char *t_str;
};
};

View File

@@ -24,6 +24,7 @@ struct ivy_lexer {
struct ivy_token *lex_queue;
enum ivy_token_type lex_prev_token;
b_string *lex_temp;
b_queue lex_state;
unsigned int lex_brace_depth;
@@ -80,6 +81,8 @@ static struct lex_token_def keywords[] = {
LEX_TOKEN_DEF(IVY_KW_UNLESS, "unless"),
LEX_TOKEN_DEF(IVY_KW_IN, "in"),
LEX_TOKEN_DEF(IVY_KW_DO, "do"),
LEX_TOKEN_DEF(IVY_KW_GET, "get"),
LEX_TOKEN_DEF(IVY_KW_SET, "set"),
LEX_TOKEN_DEF(IVY_KW_END, "end"),
};
static const size_t nr_keywords = sizeof keywords / sizeof keywords[0];
@@ -121,14 +124,16 @@ static struct lex_token_def symbols[] = {
LEX_TOKEN_DEF(IVY_SYM_PIPE_EQUAL, "|="),
LEX_TOKEN_DEF(IVY_SYM_PERCENT_EQUAL, "%="),
LEX_TOKEN_DEF(IVY_SYM_CARET_EQUAL, "^="),
LEX_TOKEN_DEF(IVY_SYM_HASH, "#"),
LEX_TOKEN_DEF(IVY_SYM_BANG, "!"),
LEX_TOKEN_DEF(IVY_SYM_PIPE, "|"),
LEX_TOKEN_DEF(IVY_SYM_CARET, "^"),
LEX_TOKEN_DEF(IVY_SYM_UNDERSCORE, "_"),
LEX_TOKEN_DEF(IVY_SYM_COMMA, ","),
LEX_TOKEN_DEF(IVY_SYM_SEMICOLON, ";"),
LEX_TOKEN_DEF(IVY_SYM_DOLLAR, "$"),
LEX_TOKEN_DEF(IVY_SYM_RIGHT_ARROW, "->"),
LEX_TOKEN_DEF(IVY_SYM_BIG_RIGHT_ARROW, "=>"),
LEX_TOKEN_DEF(IVY_SYM_HYPHEN_RIGHT_ANGLE, "->"),
LEX_TOKEN_DEF(IVY_SYM_EQUAL_RIGHT_ANGLE, "=>"),
};
static const size_t nr_symbols = sizeof symbols / sizeof symbols[0];
@@ -198,6 +203,16 @@ static struct ivy_lexer_symbol_node *get_symbol_node(
return NULL;
}
/*
 * Return the lexer's reusable scratch string, emptied and ready for use.
 *
 * The string is created on first use and cached in lex->lex_temp; every call
 * clears its previous contents, so only one user may hold it at a time.
 *
 * Returns NULL if the scratch string could not be created.
 */
static b_string *get_temp_string(struct ivy_lexer *lex)
{
	if (!lex->lex_temp) {
		lex->lex_temp = b_string_create();
		if (!lex->lex_temp) {
			/* do not hand a NULL string to b_string_clear() */
			return NULL;
		}
	}

	b_string_clear(lex->lex_temp);
	return lex->lex_temp;
}
static enum ivy_status put_symbol(
struct ivy_lexer_symbol_node *tree, struct lex_token_def *sym)
{
@@ -367,6 +382,10 @@ void ivy_lexer_destroy(struct ivy_lexer *lex)
destroy_symbol_tree(lex->lex_sym_tree);
}
if (lex->lex_temp) {
b_string_release(lex->lex_temp);
}
destroy_state_stack(&lex->lex_state);
free(lex);
@@ -571,6 +590,34 @@ static enum ivy_status push_symbol(struct ivy_lexer *lex, enum ivy_symbol sym)
return push_token(lex, tok);
}
/*
 * Queue an IVY_TOK_ATOM token whose text is `s`.
 *
 * The pointer `s` is stored in the token as-is (it is not copied).
 *
 * Returns IVY_ERR_NO_MEMORY if the token cannot be allocated, otherwise the
 * status reported by push_token().
 */
static enum ivy_status push_atom(struct ivy_lexer *lex, char *s)
{
	/* calloc hands back a fully zeroed token */
	struct ivy_token *atom = calloc(1, sizeof *atom);
	if (atom == NULL) {
		return IVY_ERR_NO_MEMORY;
	}

	atom->t_str = s;
	atom->t_type = IVY_TOK_ATOM;

	return push_token(lex, atom);
}
/*
 * Queue an IVY_TOK_NUMBER token carrying the value `v`.
 *
 * Returns IVY_ERR_NO_MEMORY if the token cannot be allocated, otherwise the
 * status reported by push_token().
 */
static enum ivy_status push_number(struct ivy_lexer *lex, unsigned long long v)
{
	/* calloc hands back a fully zeroed token */
	struct ivy_token *num = calloc(1, sizeof *num);
	if (num == NULL) {
		return IVY_ERR_NO_MEMORY;
	}

	num->t_number = v;
	num->t_type = IVY_TOK_NUMBER;

	return push_token(lex, num);
}
static enum ivy_status push_keyword(struct ivy_lexer *lex, enum ivy_keyword keyword)
{
struct ivy_token *tok = malloc(sizeof *tok);
@@ -661,13 +708,54 @@ static enum ivy_status read_squote_marker(struct ivy_lexer *lex)
/*
 * Handle a double-quote character.
 *
 * A quote seen while the current lexer state is STATE_STRING closes that
 * string by popping the state; otherwise it opens a new string by pushing
 * STATE_STRING.
 *
 * Returns IVY_OK, or IVY_ERR_NO_MEMORY if the new state cannot be pushed.
 */
static enum ivy_status read_dquote_marker(struct ivy_lexer *lex)
{
	struct lexer_state *current = get_lexer_state(lex);

	if (current->s_type != STATE_STRING) {
		/* opening quote: enter string state */
		return push_lexer_state(lex, STATE_STRING)
			? IVY_OK
			: IVY_ERR_NO_MEMORY;
	}

	/* closing quote: leave string state */
	pop_lexer_state(lex);
	return IVY_OK;
}
static enum ivy_status read_atom(struct ivy_lexer *lex)
{
b_string *str = get_temp_string(lex);
while (true) {
int c = peek(lex);
if (c == IVY_ERR_EOF) {
break;
}
if (c < 0) {
return c;
}
if (!isalnum(c) && c != ':' && c != '_') {
break;
}
char s[] = {c, 0};
b_string_append_cstr(str, s);
advance(lex);
}
char *s = b_string_steal(str);
return push_atom(lex, s);
}
static enum ivy_status read_string_content(struct ivy_lexer *lex)
{
int c;
b_string *str = b_string_create();
b_string *str = get_temp_string(lex);
struct lexer_state *state = get_lexer_state(lex);
if (!str) {
@@ -691,12 +779,10 @@ static enum ivy_status read_string_content(struct ivy_lexer *lex)
}
if (b_string_get_size(str, B_STRLEN_NORMAL) == 0) {
b_string_release(str);
return IVY_OK;
}
char *s = b_string_steal(str);
b_string_release(str);
enum ivy_status status = push_string_content(lex, s);
if (status != IVY_OK) {
@@ -736,6 +822,8 @@ static enum ivy_status read_symbol(struct ivy_lexer *lex)
return read_block_comment(lex);
case IVY_SYM_DOUBLE_HYPHEN:
return read_line_comment(lex);
case IVY_SYM_HASH:
return read_atom(lex);
case IVY_SYM_LEFT_BRACE:
push_symbol(lex, node->s_id);
lex->lex_brace_depth++;
@@ -759,9 +847,91 @@ static enum ivy_status read_symbol(struct ivy_lexer *lex)
}
}
static enum ivy_status read_number(struct ivy_lexer *lex)
{
int token_len = 0;
int base = 10;
b_string *str = get_temp_string(lex);
while (true) {
int c = peek(lex);
if (c == IVY_ERR_EOF) {
break;
}
if (c < 0) {
return c;
}
if (c == '_') {
token_len++;
advance(lex);
continue;
}
if (isspace(c) || ispunct(c)) {
break;
}
if (c == '0' && token_len == 0) {
base = 7;
token_len++;
advance(lex);
continue;
}
if (c == 'x' && token_len == 1) {
base = 16;
token_len++;
advance(lex);
continue;
}
if (c == 'b' && token_len == 1) {
base = 2;
token_len++;
advance(lex);
continue;
}
if (base == 2 && c != '0' && c != '1') {
return IVY_ERR_BAD_SYNTAX;
}
if (base == 10 && !isdigit(c)) {
return IVY_ERR_BAD_SYNTAX;
}
if (base == 16 && !isxdigit(c)) {
return IVY_ERR_BAD_SYNTAX;
}
char s[] = {c, 0};
b_string_append_cstr(str, s);
advance(lex);
}
if (token_len == 1 && base == 7) {
return push_number(lex, 0);
}
const char *s = b_string_ptr(str);
char *ep;
/* negative numbers will be lexed as a hyphen followed by a positive
* number. */
unsigned long long v = strtoull(s, &ep, base);
if (*ep != '\0') {
return IVY_ERR_BAD_SYNTAX;
}
return push_number(lex, v);
}
static enum ivy_status read_ident(struct ivy_lexer *lex)
{
b_string *str = b_string_create();
b_string *str = get_temp_string(lex);
bool label = false;
while (true) {
@@ -788,20 +958,17 @@ static enum ivy_status read_ident(struct ivy_lexer *lex)
const char *s = b_string_ptr(str);
if (!strcmp(s, "_")) {
b_string_release(str);
push_symbol(lex, IVY_SYM_UNDERSCORE);
}
enum ivy_keyword keyword = IVY_KW_NONE;
if (!label && (keyword = find_keyword_by_name(s)) != IVY_KW_NONE) {
b_string_release(str);
return push_keyword(lex, keyword);
}
struct ivy_token *tok
= create_token(label ? IVY_TOK_LABEL : IVY_TOK_IDENT);
tok->t_str = b_string_steal(str);
b_string_release(str);
return push_token(lex, tok);
}
@@ -825,8 +992,8 @@ static enum ivy_status pump_tokens(struct ivy_lexer *lex)
return read_string_content(lex);
}
/* `state` is invalid past this point, as the read_* functions may
* perform state transitions. */
/* `state` is invalid past this point, as the read_* functions
* may perform state transitions. */
state = NULL;
if (c == '\n') {
@@ -855,6 +1022,10 @@ static enum ivy_status pump_tokens(struct ivy_lexer *lex)
return read_ident(lex);
}
if (isnumber(c)) {
return read_number(lex);
}
return IVY_ERR_BAD_SYNTAX;
}
@@ -1002,9 +1173,10 @@ const char *ivy_symbol_to_string(enum ivy_symbol sym)
ENUM_STR(IVY_SYM_CARET);
ENUM_STR(IVY_SYM_UNDERSCORE);
ENUM_STR(IVY_SYM_COMMA);
ENUM_STR(IVY_SYM_SEMICOLON);
ENUM_STR(IVY_SYM_DOLLAR);
ENUM_STR(IVY_SYM_RIGHT_ARROW);
ENUM_STR(IVY_SYM_BIG_RIGHT_ARROW);
ENUM_STR(IVY_SYM_HYPHEN_RIGHT_ANGLE);
ENUM_STR(IVY_SYM_EQUAL_RIGHT_ANGLE);
ENUM_STR(IVY_SYM_FORWARD_SLASH_ASTERISK);
ENUM_STR(IVY_SYM_ASTERISK_FORWARD_SLASH);
default: