#include "lex.h"
#include "line-source.h"
#include "token.h"

/*
 * NOTE(review): the targets of the angle-bracket includes were missing from
 * the copy of this file that was reviewed. The standard headers required by
 * the code below are listed here; the headers that declare b_queue, b_string
 * and b_wchar must be restored as well unless "lex.h"/"line-source.h" already
 * pull them in -- confirm against the build.
 */
#include <ctype.h>
#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

#define LINEBUF_DEFAULT_CAPACITY 1024

#define LEX_TOKEN_DEF(i, n) {.id = (i), .name = (n)}

/* NOTE: these macros evaluate `c` more than once; pass plain variables only. */
#define IS_VALID_IDENT_CHAR(c) \
	(b_wchar_is_alnum(c) || (c) == '.' || (c) == '-' || (c) == '_')
#define IS_VALID_IDENT_START_CHAR(c) \
	(b_wchar_is_alpha(c) || (c) == '.' || (c) == '_')
#define IS_VALID_REG_START_CHAR(c) \
	(b_wchar_is_alnum(c) || (c) == '.' || (c) == '_')

/* Every punctuation symbol the lexer recognises. */
static struct lex_token_def symbols[] = {
	LEX_TOKEN_DEF(SYM_COMMA, ","),
	LEX_TOKEN_DEF(SYM_HYPHEN, "-"),
	LEX_TOKEN_DEF(SYM_LEFT_BRACKET, "["),
	LEX_TOKEN_DEF(SYM_RIGHT_BRACKET, "]"),
	LEX_TOKEN_DEF(SYM_LEFT_BRACE, "{"),
	LEX_TOKEN_DEF(SYM_RIGHT_BRACE, "}"),
	LEX_TOKEN_DEF(SYM_LEFT_PAREN, "("),
	LEX_TOKEN_DEF(SYM_RIGHT_PAREN, ")"),
	LEX_TOKEN_DEF(SYM_SEMICOLON, ";"),
	LEX_TOKEN_DEF(SYM_COLON, ":"),
	LEX_TOKEN_DEF(SYM_HYPHEN_RIGHT_ANGLE, "->"),
};
static const size_t nr_symbols = sizeof symbols / sizeof symbols[0];

/* Identifiers that are lexed as keywords rather than words/names. */
static struct lex_token_def keywords[] = {
	LEX_TOKEN_DEF(KW_INTERFACE, "interface"),
	LEX_TOKEN_DEF(KW_FUNC, "func"),
};
static const size_t nr_keywords = sizeof keywords / sizeof keywords[0];

/*
 * Character classification helpers.
 *
 * The <ctype.h> functions have undefined behaviour for arguments that are
 * not representable as unsigned char (other than EOF). Characters read from
 * the line source are b_wchar values that may be negative status codes or
 * wide characters, so plain range checks are used instead.
 */
static bool is_ascii(b_wchar c)
{
	return c >= 0 && c <= 0x7F;
}

static bool is_ascii_digit(b_wchar c)
{
	return c >= '0' && c <= '9';
}

static bool is_ascii_xdigit(b_wchar c)
{
	return is_ascii_digit(c) || (c >= 'a' && c <= 'f')
	       || (c >= 'A' && c <= 'F');
}

/* Return the child of `node` labelled with `c`, or NULL if there is none. */
static struct lex_symbol_node *get_symbol_node(
	struct lex_symbol_node *node, char c)
{
	b_queue_entry *entry = b_queue_first(&node->s_children);
	while (entry) {
		struct lex_symbol_node *child
			= b_unbox(struct lex_symbol_node, entry, s_entry);
		if (child->s_char == c) {
			return child;
		}

		entry = b_queue_next(entry);
	}

	return NULL;
}

/* Return the lexer's scratch string, created on first use and cleared. */
static b_string *get_temp_string(struct lex *lex)
{
	if (!lex->lex_temp) {
		lex->lex_temp = b_string_create();
	}

	b_string_clear(lex->lex_temp);
	return lex->lex_temp;
}

/*
 * Insert `sym` into the symbol tree rooted at `tree`, creating one node per
 * character of the symbol's name that is not already present.
 *
 * Returns SUCCESS, or ERR_NO_MEMORY if a node could not be allocated. Nodes
 * created before the failure stay linked into the tree, so destroying the
 * root releases them.
 */
static enum status put_symbol(
	struct lex_symbol_node *tree, struct lex_token_def *sym)
{
	for (size_t i = 0; sym->name[i]; i++) {
		char c = sym->name[i];
		struct lex_symbol_node *child = get_symbol_node(tree, c);
		if (child) {
			tree = child;
			continue;
		}

		child = calloc(1, sizeof *child);
		if (!child) {
			return ERR_NO_MEMORY;
		}

		child->s_def = NULL;
		child->s_char = c;
		b_queue_push_back(&tree->s_children, &child->s_entry);
		tree = child;
	}

	tree->s_def = sym;
	return SUCCESS;
}

/* Recursively free `tree` and every node below it. */
static void destroy_symbol_tree(struct lex_symbol_node *tree)
{
	b_queue_entry *entry = b_queue_first(&tree->s_children);
	while (entry) {
		struct lex_symbol_node *node
			= b_unbox(struct lex_symbol_node, entry, s_entry);
		/* fetch the successor before the entry is unlinked */
		b_queue_entry *next = b_queue_next(entry);
		b_queue_delete(&tree->s_children, entry);

		destroy_symbol_tree(node);

		entry = next;
	}

	free(tree);
}

/* Build the tree of all entries in `symbols`. Returns NULL on failure. */
static struct lex_symbol_node *build_symbol_tree(void)
{
	struct lex_symbol_node *root = calloc(1, sizeof *root);
	if (!root) {
		return NULL;
	}

	root->s_def = NULL;

	for (size_t i = 0; i < nr_symbols; i++) {
		if (put_symbol(root, &symbols[i]) != SUCCESS) {
			destroy_symbol_tree(root);
			return NULL;
		}
	}

	return root;
}

/*
 * Create a lexer that reads from `src`.
 *
 * Returns NULL if the lexer or its symbol tree cannot be allocated.
 */
struct lex *lex_create(struct line_source *src)
{
	struct lex *lex = calloc(1, sizeof *lex);
	if (!lex) {
		return NULL;
	}

	lex->lex_status = SUCCESS;
	lex->lex_source = src;
	lex->lex_sym_tree = build_symbol_tree();
	if (!lex->lex_sym_tree) {
		lex_destroy(lex);
		return NULL;
	}

	return lex;
}

/*
 * Destroy `lex`: every token still queued, the symbol tree, the scratch
 * string and the lexer itself. The line source is not touched. Passing NULL
 * is a no-op.
 */
void lex_destroy(struct lex *lex)
{
	if (!lex) {
		return;
	}

	b_queue_entry *entry = b_queue_first(&lex->lex_queue);
	while (entry) {
		struct token *tok = b_unbox(struct token, entry, tok_entry);
		b_queue_entry *next = b_queue_next(entry);
		b_queue_delete(&lex->lex_queue, entry);

		token_destroy(tok);

		entry = next;
	}

	if (lex->lex_sym_tree) {
		destroy_symbol_tree(lex->lex_sym_tree);
	}

	if (lex->lex_temp) {
		b_string_unref(lex->lex_temp);
	}

	free(lex);
}

/* Return the status recorded by the most recent failing peek/advance. */
enum status lex_get_status(const struct lex *lex)
{
	return lex->lex_status;
}

/* Return the line source the lexer reads from. */
struct line_source *lex_get_line_source(const struct lex *lex)
{
	return lex->lex_source;
}

/* Return a pointer to the line source's cursor. */
const struct file_cell *lex_get_cursor(const struct lex *lex)
{
	return &lex->lex_source->s_cursor;
}

/* Return true if any known symbol starts with `c`. */
static bool char_can_begin_symbol(char c)
{
	for (size_t i = 0; i < nr_symbols; i++) {
		if (symbols[i].name[0] == c) {
			return true;
		}
	}

	return false;
}

/* Allocate a zeroed token of the given type. Returns NULL on failure. */
static struct token *create_token(enum token_type type)
{
	struct token *tok = calloc(1, sizeof *tok);
	if (!tok) {
		return NULL;
	}

	tok->tok_type = type;
	return tok;
}

/* Record the current cursor as the start of the token being read. */
static void set_token_start(struct lex *lex)
{
	lex->lex_token_start = *line_source_get_cursor(lex->lex_source);
}

/* Record the current cursor as the end of the token being read. */
static void set_token_end(struct lex *lex)
{
	lex->lex_token_end = *line_source_get_cursor(lex->lex_source);
}

/* Stamp `tok` with the recorded start/end and append it to the queue. */
static enum status push_token(struct lex *lex, struct token *tok)
{
	tok->tok_location.s_start = lex->lex_token_start;
	tok->tok_location.s_end = lex->lex_token_end;
	b_queue_push_back(&lex->lex_queue, &tok->tok_entry);
	return SUCCESS;
}

/* Queue a symbol token. Returns ERR_NO_MEMORY if allocation fails. */
static enum status push_symbol(struct lex *lex, enum token_symbol sym)
{
	struct token *tok = create_token(TOK_SYMBOL);
	if (!tok) {
		return ERR_NO_MEMORY;
	}

	tok->tok_value_type = TOK_V_SYMBOL;
	tok->tok_sym = sym;
	return push_token(lex, tok);
}

/* Queue a keyword token. Returns ERR_NO_MEMORY if allocation fails. */
static enum status push_keyword(struct lex *lex, enum token_keyword kw)
{
	struct token *tok = create_token(TOK_KEYWORD);
	if (!tok) {
		return ERR_NO_MEMORY;
	}

	tok->tok_value_type = TOK_V_KEYWORD;
	tok->tok_kw = kw;
	return push_token(lex, tok);
}

/*
 * Queue a token of type `type` whose value comes from the text `s`.
 *
 * If the whole of `s` parses as a base-10 integer that fits in a long long,
 * the token carries the integer value; otherwise it carries the text.
 *
 * This function takes ownership of `s`: the text is either stored in the
 * token or released with free(), including on failure.
 *
 * Returns ERR_NO_MEMORY if `s` is NULL or the token cannot be allocated.
 */
static enum status push_string_token(
	struct lex *lex, enum token_type type, char *s)
{
	if (!s) {
		return ERR_NO_MEMORY;
	}

	struct token *tok = create_token(type);
	if (!tok) {
		free(s);
		return ERR_NO_MEMORY;
	}

	char *ep = NULL;
	errno = 0;
	long long v = strtoll(s, &ep, 10);

	/* `ep != s` rejects empty text, which would otherwise look like a
	 * fully consumed number; ERANGE rejects values that were clamped. */
	bool is_int = ep != s && *ep == '\0' && errno != ERANGE;

	if (is_int) {
		tok->tok_int = v;
		tok->tok_value_type = TOK_V_INT;
		free(s);
	} else {
		tok->tok_str = s;
		tok->tok_value_type = TOK_V_STRING;
	}

	return push_token(lex, tok);
}

/* Queue an integer token. Returns ERR_NO_MEMORY if allocation fails. */
static enum status push_int(struct lex *lex, unsigned long long v)
{
	struct token *tok = create_token(TOK_INT);
	if (!tok) {
		return ERR_NO_MEMORY;
	}

	tok->tok_value_type = TOK_V_INT;
	tok->tok_int = v;
	return push_token(lex, tok);
}

/*
 * Consume characters up to and including the next linefeed, or until the
 * source reports end of input. Any other negative value read from the source
 * is returned as a (positive) status.
 */
static enum status read_line_comment(struct lex *lex)
{
	while (true) {
		b_wchar c = line_source_getc(lex->lex_source);

		if (c == -ERR_EOF || c == '\n') {
			break;
		}

		if (c < 0) {
			return -c;
		}
	}

	return SUCCESS;
}

/*
 * Read an integer literal and queue it as an integer token.
 *
 * Accepted forms are decimal digits, `0x` followed by hex digits and `0b`
 * followed by binary digits; `_` may be used as a digit separator. The
 * literal ends at whitespace, punctuation or end of input.
 *
 * When `negate` is true the caller has already consumed a leading hyphen
 * (and recorded the token start), and the value is negated.
 *
 * Returns ERR_BAD_SYNTAX for malformed or out-of-range literals, or the
 * source's status if reading fails.
 */
static enum status read_number(struct lex *lex, bool negate)
{
	int token_len = 0;
	int nr_digits = 0;
	int base = 10;
	bool leading_zero = false;
	b_string *str = get_temp_string(lex);

	if (!negate) {
		set_token_start(lex);
	}

	while (true) {
		b_wchar c = line_source_peekc(lex->lex_source);

		if (c == -ERR_EOF) {
			break;
		}

		if (c < 0) {
			return -c;
		}

		if (c == '_') {
			token_len++;
			set_token_end(lex);
			line_source_getc(lex->lex_source);
			continue;
		}

		if (c == '.') {
			return ERR_BAD_SYNTAX;
		}

		if (b_wchar_is_space(c) || b_wchar_is_punct(c)) {
			break;
		}

		/* a base prefix is only valid directly after a leading zero */
		if ((c == 'x' || c == 'b') && token_len == 1 && leading_zero) {
			base = (c == 'x') ? 16 : 2;
			nr_digits = 0;
			token_len++;
			set_token_end(lex);
			line_source_getc(lex->lex_source);
			continue;
		}

		if (base == 2 && c != '0' && c != '1') {
			return ERR_BAD_SYNTAX;
		}

		if (base == 10 && !is_ascii_digit(c)) {
			return ERR_BAD_SYNTAX;
		}

		if (base == 16 && !is_ascii_xdigit(c)) {
			return ERR_BAD_SYNTAX;
		}

		if (token_len == 0 && c == '0') {
			leading_zero = true;
		}

		b_string_append_wc(str, c);
		set_token_end(lex);
		line_source_getc(lex->lex_source);
		token_len++;
		nr_digits++;
	}

	/* covers a bare prefix such as "0x" with nothing after it */
	if (nr_digits == 0) {
		return ERR_BAD_SYNTAX;
	}

	const char *s = b_string_ptr(str);
	char *ep = NULL;

	/* negative numbers will be lexed as a hyphen followed by a positive
	 * number. */
	errno = 0;
	unsigned long long v = strtoull(s, &ep, base);

	if (ep == s || *ep != '\0' || errno == ERANGE) {
		return ERR_BAD_SYNTAX;
	}

	if (negate) {
		/* unsigned negation wraps; well-defined in C */
		v = 0ULL - v;
	}

	return push_int(lex, v);
}

/* Return the keyword spelled `s`, or KW_NONE if `s` is not a keyword. */
static enum token_keyword find_keyword(const char *s)
{
	for (size_t i = 0; i < nr_keywords; i++) {
		if (!strcmp(keywords[i].name, s)) {
			return keywords[i].id;
		}
	}

	return KW_NONE;
}

/*
 * Read an identifier and queue it as a keyword token or a string token.
 *
 * When `type` is TOK_NONE the token start is recorded here and the type is
 * chosen from the text: TOK_NAME if it contains a '.', TOK_WORD otherwise.
 *
 * Returns ERR_BAD_SYNTAX if the identifier contains ".." or "--".
 */
static enum status read_ident(struct lex *lex, enum token_type type)
{
	int dots = 0;
	b_string *str = get_temp_string(lex);
	b_wchar prev = 0;

	if (type == TOK_NONE) {
		set_token_start(lex);
	}

	while (true) {
		b_wchar c = line_source_peekc(lex->lex_source);

		if ((c == '.' || c == '-') && prev == c) {
			return ERR_BAD_SYNTAX;
		}

		if (c == '.') {
			dots++;
		}

		if (!IS_VALID_IDENT_CHAR(c)) {
			break;
		}

		prev = c;
		b_string_append_wc(str, c);
		set_token_end(lex);
		line_source_getc(lex->lex_source);
	}

	if (type == TOK_NONE) {
		type = dots > 0 ? TOK_NAME : TOK_WORD;
	}

	char *s = b_string_steal(str);
	if (!s) {
		/* presumably an allocation failure inside b_string_steal */
		return ERR_NO_MEMORY;
	}

	enum token_keyword kw = find_keyword(s);
	if (kw != KW_NONE) {
		free(s);
		return push_keyword(lex, kw);
	}

	/* push_string_token takes ownership of `s` */
	return push_string_token(lex, type, s);
}

/*
 * Read a double-quoted string and queue it as a TOK_STRING token.
 *
 * The only escape sequences accepted are `\\` and `\"`. The token location
 * spans from the opening quote to the closing quote.
 *
 * Returns ERR_BAD_SYNTAX if the next character is not a quote, if an
 * unknown escape is used, or if the input ends before the closing quote.
 */
static enum status read_string(struct lex *lex)
{
	b_string *str = get_temp_string(lex);
	bool esc = false;

	if (line_source_peekc(lex->lex_source) != '"') {
		return ERR_BAD_SYNTAX;
	}

	set_token_start(lex);
	line_source_getc(lex->lex_source);

	while (true) {
		b_wchar c = line_source_peekc(lex->lex_source);

		if (c < 0) {
			/* unterminated string: never spin on end of input */
			return c == -ERR_EOF ? ERR_BAD_SYNTAX : -c;
		}

		if (esc) {
			switch (c) {
			case '\\':
			case '"':
				b_string_append_wc(str, c);
				break;
			default:
				return ERR_BAD_SYNTAX;
			}

			esc = false;
			line_source_getc(lex->lex_source);
			continue;
		}

		if (c == '\\') {
			esc = true;
			line_source_getc(lex->lex_source);
			continue;
		}

		if (c == '"') {
			set_token_end(lex);
			line_source_getc(lex->lex_source);
			break;
		}

		b_string_append_wc(str, c);
		line_source_getc(lex->lex_source);
	}

	/* push_string_token takes ownership of the stolen buffer */
	return push_string_token(lex, TOK_STRING, b_string_steal(str));
}

/*
 * Read the longest symbol that matches the input and queue it.
 *
 * A lone hyphen immediately followed by a digit is treated as the sign of a
 * negative number, which is then read by read_number().
 *
 * Returns ERR_BAD_SYNTAX if the consumed characters do not form a symbol.
 */
static enum status read_symbol(struct lex *lex)
{
	struct lex_symbol_node *node = lex->lex_sym_tree;
	set_token_start(lex);
	b_wchar prev = 0;

	while (true) {
		b_wchar c = line_source_peekc(lex->lex_source);
		if (c < 0) {
			break;
		}

		prev = c;

		/* symbols are ASCII; narrowing a wider character to char
		 * could make it compare equal to an unrelated symbol */
		if (!is_ascii(c)) {
			break;
		}

		struct lex_symbol_node *next = get_symbol_node(node, (char)c);
		if (!next) {
			break;
		}

		node = next;
		set_token_end(lex);
		line_source_getc(lex->lex_source);
	}

	if (!node || node->s_def == NULL) {
		return ERR_BAD_SYNTAX;
	}

	if (node->s_def->id == SYM_HYPHEN && is_ascii_digit(prev)) {
		return read_number(lex, true);
	}

	return push_symbol(lex, node->s_def->id);
}

/* Consume whitespace (including linefeeds) from the source. */
static void skip_whitespace(struct lex *lex)
{
	b_wchar c = line_source_peekc(lex->lex_source);

	while (c >= 0 && b_wchar_is_space(c)) {
		line_source_getc(lex->lex_source);
		c = line_source_peekc(lex->lex_source);
	}
}

/*
 * Return true if `c` is whitespace that should be skipped. Linefeeds only
 * count when `skip_linefeeds` is set.
 */
static bool should_skip(b_wchar c, bool skip_linefeeds)
{
	if (c < 0 || !b_wchar_is_space(c)) {
		return false;
	}

	return skip_linefeeds || c != '\n';
}

/*
 * Consume whitespace and `#` comments.
 *
 * Linefeeds are only treated as whitespace when `include_linefeeds` is set,
 * but the linefeed that terminates a comment is always consumed. A comment
 * that runs to the end of the input simply ends there.
 */
static void skip_ignored_chars(struct lex *lex, bool include_linefeeds)
{
	b_wchar c = line_source_peekc(lex->lex_source);

	while (true) {
		while (should_skip(c, include_linefeeds)) {
			line_source_getc(lex->lex_source);
			c = line_source_peekc(lex->lex_source);
		}

		if (c != '#') {
			break;
		}

		/* consume the '#' and the rest of the comment text */
		line_source_getc(lex->lex_source);
		c = line_source_peekc(lex->lex_source);

		/* stop on a negative value (end of input or error) so an
		 * unterminated final comment cannot loop forever */
		while (c >= 0 && c != '\n') {
			line_source_getc(lex->lex_source);
			c = line_source_peekc(lex->lex_source);
		}

		if (c == '\n') {
			line_source_getc(lex->lex_source);
			c = line_source_peekc(lex->lex_source);
		}
	}
}

/*
 * Read from the source until one token has been queued, a run of linefeeds
 * has been consumed, or an error occurs.
 *
 * Returns SUCCESS, ERR_BAD_SYNTAX for input no reader accepts, or the
 * source's status (negated peek value) when no character is available.
 */
static enum status pump_tokens(struct lex *lex)
{
	b_wchar c = line_source_peekc(lex->lex_source);

	if (c < 0) {
		return -c;
	}

	while (c == '#' || (c >= 0 && b_wchar_is_space(c) && c != '\n')) {
		skip_ignored_chars(lex, false);
		c = line_source_peekc(lex->lex_source);
	}

	/* a backslash continues the logical line across linefeeds */
	if (c == '\\') {
		line_source_getc(lex->lex_source);
		skip_ignored_chars(lex, true);
		c = line_source_peekc(lex->lex_source);
	}

	/* whitespace or a comment may have been the last thing in the input;
	 * report the source's status rather than a syntax error */
	if (c < 0) {
		return -c;
	}

	if (c == '\n') {
		set_token_start(lex);
		set_token_end(lex);

		while (c == '\n') {
			line_source_getc(lex->lex_source);

			if (!line_source_input_available(lex->lex_source)) {
				break;
			}

			c = line_source_peekc(lex->lex_source);
		}

		if (c < 0) {
			return -c;
		}

		return SUCCESS;
	}

	while (c >= 0 && b_wchar_is_space(c) && c != '\n') {
		line_source_getc(lex->lex_source);
		c = line_source_peekc(lex->lex_source);
	}

	if (c < 0) {
		return -c;
	}

	if (IS_VALID_IDENT_START_CHAR(c)) {
		return read_ident(lex, TOK_NONE);
	}

	if (is_ascii(c) && char_can_begin_symbol((char)c)) {
		return read_symbol(lex);
	}

	if (c == '"') {
		return read_string(lex);
	}

	if (is_ascii_digit(c)) {
		return read_number(lex, false);
	}

	return ERR_BAD_SYNTAX;
}

/* Pump the source until the token queue is non-empty or pumping fails. */
static enum status fill_token_queue(struct lex *lex)
{
	while (b_queue_empty(&lex->lex_queue)) {
		enum status status = pump_tokens(lex);
		if (status != SUCCESS) {
			return status;
		}
	}

	return SUCCESS;
}

/*
 * Return the next token without removing it from the queue.
 *
 * Returns NULL if no token could be produced; the reason is then available
 * from lex_get_status(). The token remains owned by the lexer.
 */
struct token *lex_peek(struct lex *lex)
{
	enum status status = fill_token_queue(lex);

	lex->lex_status = status;
	if (status != SUCCESS) {
		return NULL;
	}

	b_queue_entry *entry = b_queue_first(&lex->lex_queue);
	return b_unbox(struct token, entry, tok_entry);
}

/*
 * Discard the next token, reading one first if the queue is empty.
 *
 * If no token could be produced the failure is recorded in the lexer's
 * status and nothing is discarded.
 */
void lex_advance(struct lex *lex)
{
	enum status status = fill_token_queue(lex);
	if (status != SUCCESS) {
		lex->lex_status = status;
		return;
	}

	b_queue_entry *entry = b_queue_pop_front(&lex->lex_queue);
	struct token *tok = b_unbox(struct token, entry, tok_entry);
	token_destroy(tok);
}

/* Return true if a token is queued or the source has more input. */
bool lex_tokens_available(struct lex *lex)
{
	if (!b_queue_empty(&lex->lex_queue)) {
		return true;
	}

	return line_source_input_available(lex->lex_source);
}