lang: lex: implement lexing of numbers and atoms
This commit is contained in:
@@ -40,6 +40,8 @@ enum ivy_keyword {
|
||||
IVY_KW_UNLESS,
|
||||
IVY_KW_IN,
|
||||
IVY_KW_DO,
|
||||
IVY_KW_GET,
|
||||
IVY_KW_SET,
|
||||
IVY_KW_END,
|
||||
};
|
||||
|
||||
@@ -79,14 +81,16 @@ enum ivy_symbol {
|
||||
IVY_SYM_PIPE_EQUAL,
|
||||
IVY_SYM_PERCENT_EQUAL,
|
||||
IVY_SYM_CARET_EQUAL,
|
||||
IVY_SYM_HASH,
|
||||
IVY_SYM_BANG,
|
||||
IVY_SYM_PIPE,
|
||||
IVY_SYM_CARET,
|
||||
IVY_SYM_UNDERSCORE,
|
||||
IVY_SYM_COMMA,
|
||||
IVY_SYM_SEMICOLON,
|
||||
IVY_SYM_DOLLAR,
|
||||
IVY_SYM_RIGHT_ARROW,
|
||||
IVY_SYM_BIG_RIGHT_ARROW,
|
||||
IVY_SYM_HYPHEN_RIGHT_ANGLE,
|
||||
IVY_SYM_EQUAL_RIGHT_ANGLE,
|
||||
IVY_SYM_FORWARD_SLASH_ASTERISK,
|
||||
IVY_SYM_ASTERISK_FORWARD_SLASH,
|
||||
};
|
||||
@@ -98,7 +102,7 @@ struct ivy_token {
|
||||
union {
|
||||
enum ivy_keyword t_keyword;
|
||||
enum ivy_symbol t_symbol;
|
||||
signed long long t_number;
|
||||
unsigned long long t_number;
|
||||
char *t_str;
|
||||
};
|
||||
};
|
||||
|
||||
198
lang/lex.c
198
lang/lex.c
@@ -24,6 +24,7 @@ struct ivy_lexer {
|
||||
struct ivy_token *lex_queue;
|
||||
enum ivy_token_type lex_prev_token;
|
||||
|
||||
b_string *lex_temp;
|
||||
b_queue lex_state;
|
||||
unsigned int lex_brace_depth;
|
||||
|
||||
@@ -80,6 +81,8 @@ static struct lex_token_def keywords[] = {
|
||||
LEX_TOKEN_DEF(IVY_KW_UNLESS, "unless"),
|
||||
LEX_TOKEN_DEF(IVY_KW_IN, "in"),
|
||||
LEX_TOKEN_DEF(IVY_KW_DO, "do"),
|
||||
LEX_TOKEN_DEF(IVY_KW_GET, "get"),
|
||||
LEX_TOKEN_DEF(IVY_KW_SET, "set"),
|
||||
LEX_TOKEN_DEF(IVY_KW_END, "end"),
|
||||
};
|
||||
static const size_t nr_keywords = sizeof keywords / sizeof keywords[0];
|
||||
@@ -121,14 +124,16 @@ static struct lex_token_def symbols[] = {
|
||||
LEX_TOKEN_DEF(IVY_SYM_PIPE_EQUAL, "|="),
|
||||
LEX_TOKEN_DEF(IVY_SYM_PERCENT_EQUAL, "%="),
|
||||
LEX_TOKEN_DEF(IVY_SYM_CARET_EQUAL, "^="),
|
||||
LEX_TOKEN_DEF(IVY_SYM_HASH, "#"),
|
||||
LEX_TOKEN_DEF(IVY_SYM_BANG, "!"),
|
||||
LEX_TOKEN_DEF(IVY_SYM_PIPE, "|"),
|
||||
LEX_TOKEN_DEF(IVY_SYM_CARET, "^"),
|
||||
LEX_TOKEN_DEF(IVY_SYM_UNDERSCORE, "_"),
|
||||
LEX_TOKEN_DEF(IVY_SYM_COMMA, ","),
|
||||
LEX_TOKEN_DEF(IVY_SYM_SEMICOLON, ";"),
|
||||
LEX_TOKEN_DEF(IVY_SYM_DOLLAR, "$"),
|
||||
LEX_TOKEN_DEF(IVY_SYM_RIGHT_ARROW, "->"),
|
||||
LEX_TOKEN_DEF(IVY_SYM_BIG_RIGHT_ARROW, "=>"),
|
||||
LEX_TOKEN_DEF(IVY_SYM_HYPHEN_RIGHT_ANGLE, "->"),
|
||||
LEX_TOKEN_DEF(IVY_SYM_EQUAL_RIGHT_ANGLE, "=>"),
|
||||
};
|
||||
static const size_t nr_symbols = sizeof symbols / sizeof symbols[0];
|
||||
|
||||
@@ -198,6 +203,16 @@ static struct ivy_lexer_symbol_node *get_symbol_node(
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static b_string *get_temp_string(struct ivy_lexer *lex)
|
||||
{
|
||||
if (!lex->lex_temp) {
|
||||
lex->lex_temp = b_string_create();
|
||||
}
|
||||
|
||||
b_string_clear(lex->lex_temp);
|
||||
return lex->lex_temp;
|
||||
}
|
||||
|
||||
static enum ivy_status put_symbol(
|
||||
struct ivy_lexer_symbol_node *tree, struct lex_token_def *sym)
|
||||
{
|
||||
@@ -367,6 +382,10 @@ void ivy_lexer_destroy(struct ivy_lexer *lex)
|
||||
destroy_symbol_tree(lex->lex_sym_tree);
|
||||
}
|
||||
|
||||
if (lex->lex_temp) {
|
||||
b_string_release(lex->lex_temp);
|
||||
}
|
||||
|
||||
destroy_state_stack(&lex->lex_state);
|
||||
|
||||
free(lex);
|
||||
@@ -571,6 +590,34 @@ static enum ivy_status push_symbol(struct ivy_lexer *lex, enum ivy_symbol sym)
|
||||
return push_token(lex, tok);
|
||||
}
|
||||
|
||||
static enum ivy_status push_atom(struct ivy_lexer *lex, char *s)
|
||||
{
|
||||
struct ivy_token *tok = malloc(sizeof *tok);
|
||||
if (!tok) {
|
||||
return IVY_ERR_NO_MEMORY;
|
||||
}
|
||||
|
||||
memset(tok, 0x0, sizeof *tok);
|
||||
|
||||
tok->t_type = IVY_TOK_ATOM;
|
||||
tok->t_str = s;
|
||||
return push_token(lex, tok);
|
||||
}
|
||||
|
||||
static enum ivy_status push_number(struct ivy_lexer *lex, unsigned long long v)
|
||||
{
|
||||
struct ivy_token *tok = malloc(sizeof *tok);
|
||||
if (!tok) {
|
||||
return IVY_ERR_NO_MEMORY;
|
||||
}
|
||||
|
||||
memset(tok, 0x0, sizeof *tok);
|
||||
|
||||
tok->t_type = IVY_TOK_NUMBER;
|
||||
tok->t_number = v;
|
||||
return push_token(lex, tok);
|
||||
}
|
||||
|
||||
static enum ivy_status push_keyword(struct ivy_lexer *lex, enum ivy_keyword keyword)
|
||||
{
|
||||
struct ivy_token *tok = malloc(sizeof *tok);
|
||||
@@ -661,13 +708,54 @@ static enum ivy_status read_squote_marker(struct ivy_lexer *lex)
|
||||
|
||||
static enum ivy_status read_dquote_marker(struct ivy_lexer *lex)
|
||||
{
|
||||
struct lexer_state *state = get_lexer_state(lex);
|
||||
|
||||
if (state->s_type == STATE_STRING) {
|
||||
/* already within a string */
|
||||
pop_lexer_state(lex);
|
||||
return IVY_OK;
|
||||
}
|
||||
|
||||
/* start of a new string */
|
||||
if (!push_lexer_state(lex, STATE_STRING)) {
|
||||
return IVY_ERR_NO_MEMORY;
|
||||
}
|
||||
|
||||
return IVY_OK;
|
||||
}
|
||||
|
||||
static enum ivy_status read_atom(struct ivy_lexer *lex)
|
||||
{
|
||||
b_string *str = get_temp_string(lex);
|
||||
|
||||
while (true) {
|
||||
int c = peek(lex);
|
||||
|
||||
if (c == IVY_ERR_EOF) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (c < 0) {
|
||||
return c;
|
||||
}
|
||||
|
||||
if (!isalnum(c) && c != ':' && c != '_') {
|
||||
break;
|
||||
}
|
||||
|
||||
char s[] = {c, 0};
|
||||
b_string_append_cstr(str, s);
|
||||
advance(lex);
|
||||
}
|
||||
|
||||
char *s = b_string_steal(str);
|
||||
return push_atom(lex, s);
|
||||
}
|
||||
|
||||
static enum ivy_status read_string_content(struct ivy_lexer *lex)
|
||||
{
|
||||
int c;
|
||||
b_string *str = b_string_create();
|
||||
b_string *str = get_temp_string(lex);
|
||||
struct lexer_state *state = get_lexer_state(lex);
|
||||
|
||||
if (!str) {
|
||||
@@ -691,12 +779,10 @@ static enum ivy_status read_string_content(struct ivy_lexer *lex)
|
||||
}
|
||||
|
||||
if (b_string_get_size(str, B_STRLEN_NORMAL) == 0) {
|
||||
b_string_release(str);
|
||||
return IVY_OK;
|
||||
}
|
||||
|
||||
char *s = b_string_steal(str);
|
||||
b_string_release(str);
|
||||
|
||||
enum ivy_status status = push_string_content(lex, s);
|
||||
if (status != IVY_OK) {
|
||||
@@ -736,6 +822,8 @@ static enum ivy_status read_symbol(struct ivy_lexer *lex)
|
||||
return read_block_comment(lex);
|
||||
case IVY_SYM_DOUBLE_HYPHEN:
|
||||
return read_line_comment(lex);
|
||||
case IVY_SYM_HASH:
|
||||
return read_atom(lex);
|
||||
case IVY_SYM_LEFT_BRACE:
|
||||
push_symbol(lex, node->s_id);
|
||||
lex->lex_brace_depth++;
|
||||
@@ -759,9 +847,91 @@ static enum ivy_status read_symbol(struct ivy_lexer *lex)
|
||||
}
|
||||
}
|
||||
|
||||
static enum ivy_status read_number(struct ivy_lexer *lex)
|
||||
{
|
||||
int token_len = 0;
|
||||
int base = 10;
|
||||
b_string *str = get_temp_string(lex);
|
||||
|
||||
while (true) {
|
||||
int c = peek(lex);
|
||||
if (c == IVY_ERR_EOF) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (c < 0) {
|
||||
return c;
|
||||
}
|
||||
|
||||
if (c == '_') {
|
||||
token_len++;
|
||||
advance(lex);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isspace(c) || ispunct(c)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (c == '0' && token_len == 0) {
|
||||
base = 7;
|
||||
token_len++;
|
||||
advance(lex);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (c == 'x' && token_len == 1) {
|
||||
base = 16;
|
||||
token_len++;
|
||||
advance(lex);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (c == 'b' && token_len == 1) {
|
||||
base = 2;
|
||||
token_len++;
|
||||
advance(lex);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (base == 2 && c != '0' && c != '1') {
|
||||
return IVY_ERR_BAD_SYNTAX;
|
||||
}
|
||||
|
||||
if (base == 10 && !isdigit(c)) {
|
||||
return IVY_ERR_BAD_SYNTAX;
|
||||
}
|
||||
|
||||
if (base == 16 && !isxdigit(c)) {
|
||||
return IVY_ERR_BAD_SYNTAX;
|
||||
}
|
||||
|
||||
char s[] = {c, 0};
|
||||
b_string_append_cstr(str, s);
|
||||
advance(lex);
|
||||
}
|
||||
|
||||
if (token_len == 1 && base == 7) {
|
||||
return push_number(lex, 0);
|
||||
}
|
||||
|
||||
const char *s = b_string_ptr(str);
|
||||
char *ep;
|
||||
|
||||
/* negative numbers will be lexed as a hyphen followed by a positive
|
||||
* number. */
|
||||
unsigned long long v = strtoull(s, &ep, base);
|
||||
|
||||
if (*ep != '\0') {
|
||||
return IVY_ERR_BAD_SYNTAX;
|
||||
}
|
||||
|
||||
return push_number(lex, v);
|
||||
}
|
||||
|
||||
static enum ivy_status read_ident(struct ivy_lexer *lex)
|
||||
{
|
||||
b_string *str = b_string_create();
|
||||
b_string *str = get_temp_string(lex);
|
||||
bool label = false;
|
||||
|
||||
while (true) {
|
||||
@@ -788,20 +958,17 @@ static enum ivy_status read_ident(struct ivy_lexer *lex)
|
||||
|
||||
const char *s = b_string_ptr(str);
|
||||
if (!strcmp(s, "_")) {
|
||||
b_string_release(str);
|
||||
push_symbol(lex, IVY_SYM_UNDERSCORE);
|
||||
}
|
||||
|
||||
enum ivy_keyword keyword = IVY_KW_NONE;
|
||||
if (!label && (keyword = find_keyword_by_name(s)) != IVY_KW_NONE) {
|
||||
b_string_release(str);
|
||||
return push_keyword(lex, keyword);
|
||||
}
|
||||
|
||||
struct ivy_token *tok
|
||||
= create_token(label ? IVY_TOK_LABEL : IVY_TOK_IDENT);
|
||||
tok->t_str = b_string_steal(str);
|
||||
b_string_release(str);
|
||||
|
||||
return push_token(lex, tok);
|
||||
}
|
||||
@@ -825,8 +992,8 @@ static enum ivy_status pump_tokens(struct ivy_lexer *lex)
|
||||
return read_string_content(lex);
|
||||
}
|
||||
|
||||
/* `state` is invalid past this point, as the read_* functions may
|
||||
* perform state transitions. */
|
||||
/* `state` is invalid past this point, as the read_* functions
|
||||
* may perform state transitions. */
|
||||
state = NULL;
|
||||
|
||||
if (c == '\n') {
|
||||
@@ -855,6 +1022,10 @@ static enum ivy_status pump_tokens(struct ivy_lexer *lex)
|
||||
return read_ident(lex);
|
||||
}
|
||||
|
||||
if (isnumber(c)) {
|
||||
return read_number(lex);
|
||||
}
|
||||
|
||||
return IVY_ERR_BAD_SYNTAX;
|
||||
}
|
||||
|
||||
@@ -1002,9 +1173,10 @@ const char *ivy_symbol_to_string(enum ivy_symbol sym)
|
||||
ENUM_STR(IVY_SYM_CARET);
|
||||
ENUM_STR(IVY_SYM_UNDERSCORE);
|
||||
ENUM_STR(IVY_SYM_COMMA);
|
||||
ENUM_STR(IVY_SYM_SEMICOLON);
|
||||
ENUM_STR(IVY_SYM_DOLLAR);
|
||||
ENUM_STR(IVY_SYM_RIGHT_ARROW);
|
||||
ENUM_STR(IVY_SYM_BIG_RIGHT_ARROW);
|
||||
ENUM_STR(IVY_SYM_HYPHEN_RIGHT_ANGLE);
|
||||
ENUM_STR(IVY_SYM_EQUAL_RIGHT_ANGLE);
|
||||
ENUM_STR(IVY_SYM_FORWARD_SLASH_ASTERISK);
|
||||
ENUM_STR(IVY_SYM_ASTERISK_FORWARD_SLASH);
|
||||
default:
|
||||
|
||||
Reference in New Issue
Block a user