/* ivy/lang/lex.c */

#include <blue/core/hash.h>
#include <blue/core/queue.h>
#include <blue/object/string.h>
#include <ctype.h>
#include <ivy/lang/lex.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Size in bytes of the line buffer allocated by ivy_lexer_init(). */
#define LINEBUF_DEFAULT_CAPACITY 1024
/*
 * Brace initialiser for a struct lex_token_def: sets `id` and `name`.
 * `name_hash` is not mentioned, so it is zero-initialised.
 */
#define LEX_TOKEN_DEF(i, n) \
{ \
.id = (i), .name = (n) \
}
/*
 * One node of the symbol tree built by build_symbol_tree(). Each node
 * stands for one character of a symbol spelling; walking from the root
 * through matching children spells out a symbol.
 */
struct ivy_lexer_symbol_node {
/* character this node matches (left zero on the root) */
char s_char;
/* symbol that ends at this node, or IVY_SYM_NONE if none does */
enum ivy_symbol s_id;
/* links this node into its parent's s_children queue */
b_queue_entry s_entry;
/* child nodes, one per possible next character */
b_queue s_children;
};
/* Table entry that maps a source spelling to a keyword or symbol id. */
struct lex_token_def {
/* enum ivy_keyword or enum ivy_symbol value, depending on the table */
int id;
/* spelling as it appears in source text */
const char *name;
/* b_hash_string(name); only filled in for keywords, by init_keywords() */
uint64_t name_hash;
};
/*
 * Reserved words recognised by find_keyword_by_name(). The table is not
 * const because init_keywords() writes each entry's name_hash at runtime.
 */
static struct lex_token_def keywords[] = {
LEX_TOKEN_DEF(IVY_KW_PACKAGE, "package"),
LEX_TOKEN_DEF(IVY_KW_USE, "use"),
LEX_TOKEN_DEF(IVY_KW_CLASS, "class"),
LEX_TOKEN_DEF(IVY_KW_PROTOCOL, "protocol"),
LEX_TOKEN_DEF(IVY_KW_TRY, "try"),
LEX_TOKEN_DEF(IVY_KW_THROW, "throw"),
LEX_TOKEN_DEF(IVY_KW_CATCH, "catch"),
LEX_TOKEN_DEF(IVY_KW_IF, "if"),
LEX_TOKEN_DEF(IVY_KW_AND, "and"),
LEX_TOKEN_DEF(IVY_KW_OR, "or"),
LEX_TOKEN_DEF(IVY_KW_IS, "is"),
LEX_TOKEN_DEF(IVY_KW_NOT, "not"),
LEX_TOKEN_DEF(IVY_KW_ELSE, "else"),
LEX_TOKEN_DEF(IVY_KW_WHILE, "while"),
LEX_TOKEN_DEF(IVY_KW_FOR, "for"),
LEX_TOKEN_DEF(IVY_KW_MATCH, "match"),
LEX_TOKEN_DEF(IVY_KW_UNLESS, "unless"),
LEX_TOKEN_DEF(IVY_KW_IN, "in"),
LEX_TOKEN_DEF(IVY_KW_DO, "do"),
LEX_TOKEN_DEF(IVY_KW_END, "end"),
};
/* Number of entries in keywords[]. */
static const size_t nr_keywords = sizeof keywords / sizeof keywords[0];
/*
 * Punctuation and operator spellings. build_symbol_tree() inserts every
 * entry into the symbol tree, and char_can_begin_symbol() scans the first
 * character of each name. read_symbol() treats "/*" and "--" as comment
 * openers rather than emitting a token for them.
 */
static struct lex_token_def symbols[] = {
LEX_TOKEN_DEF(IVY_SYM_DOT, "."),
LEX_TOKEN_DEF(IVY_SYM_LEFT_BRACE, "{"),
LEX_TOKEN_DEF(IVY_SYM_RIGHT_BRACE, "}"),
LEX_TOKEN_DEF(IVY_SYM_LEFT_BRACKET, "["),
LEX_TOKEN_DEF(IVY_SYM_RIGHT_BRACKET, "]"),
LEX_TOKEN_DEF(IVY_SYM_LEFT_PAREN, "("),
LEX_TOKEN_DEF(IVY_SYM_RIGHT_PAREN, ")"),
LEX_TOKEN_DEF(IVY_SYM_LEFT_ANGLE, "<"),
LEX_TOKEN_DEF(IVY_SYM_RIGHT_ANGLE, ">"),
LEX_TOKEN_DEF(IVY_SYM_COLON, ":"),
LEX_TOKEN_DEF(IVY_SYM_DOUBLE_COLON, "::"),
LEX_TOKEN_DEF(IVY_SYM_PLUS, "+"),
LEX_TOKEN_DEF(IVY_SYM_HYPHEN, "-"),
LEX_TOKEN_DEF(IVY_SYM_DOUBLE_HYPHEN, "--"),
LEX_TOKEN_DEF(IVY_SYM_FORWARD_SLASH, "/"),
LEX_TOKEN_DEF(IVY_SYM_ASTERISK, "*"),
LEX_TOKEN_DEF(IVY_SYM_FORWARD_SLASH_ASTERISK, "/*"),
LEX_TOKEN_DEF(IVY_SYM_ASTERISK_FORWARD_SLASH, "*/"),
LEX_TOKEN_DEF(IVY_SYM_PERCENT, "%"),
LEX_TOKEN_DEF(IVY_SYM_AMPERSAND, "&"),
LEX_TOKEN_DEF(IVY_SYM_EQUAL, "="),
LEX_TOKEN_DEF(IVY_SYM_DOUBLE_EQUAL, "=="),
LEX_TOKEN_DEF(IVY_SYM_DOUBLE_LEFT_ANGLE, "<<"),
LEX_TOKEN_DEF(IVY_SYM_DOUBLE_RIGHT_ANGLE, ">>"),
LEX_TOKEN_DEF(IVY_SYM_DOUBLE_LEFT_ANGLE_EQUAL, "<<="),
LEX_TOKEN_DEF(IVY_SYM_DOUBLE_RIGHT_ANGLE_EQUAL, ">>="),
LEX_TOKEN_DEF(IVY_SYM_PLUS_EQUAL, "+="),
LEX_TOKEN_DEF(IVY_SYM_HYPHEN_EQUAL, "-="),
LEX_TOKEN_DEF(IVY_SYM_FORWARD_SLASH_EQUAL, "/="),
LEX_TOKEN_DEF(IVY_SYM_ASTERISK_EQUAL, "*="),
LEX_TOKEN_DEF(IVY_SYM_AMPERSAND_EQUAL, "&="),
LEX_TOKEN_DEF(IVY_SYM_PIPE_EQUAL, "|="),
LEX_TOKEN_DEF(IVY_SYM_PERCENT_EQUAL, "%="),
LEX_TOKEN_DEF(IVY_SYM_CARET_EQUAL, "^="),
LEX_TOKEN_DEF(IVY_SYM_BANG, "!"),
LEX_TOKEN_DEF(IVY_SYM_PIPE, "|"),
LEX_TOKEN_DEF(IVY_SYM_CARET, "^"),
LEX_TOKEN_DEF(IVY_SYM_UNDERSCORE, "_"),
LEX_TOKEN_DEF(IVY_SYM_COMMA, ","),
LEX_TOKEN_DEF(IVY_SYM_DOLLAR, "$"),
LEX_TOKEN_DEF(IVY_SYM_RIGHT_ARROW, "->"),
LEX_TOKEN_DEF(IVY_SYM_BIG_RIGHT_ARROW, "=>"),
};
/* Number of entries in symbols[]. */
static const size_t nr_symbols = sizeof symbols / sizeof symbols[0];
/*
 * Look up the direct child of `node` that matches character `c`.
 *
 * Returns the matching child, or NULL when `node` has no child for `c`.
 */
static struct ivy_lexer_symbol_node *get_symbol_node(
	struct ivy_lexer_symbol_node *node, char c)
{
	b_queue_iterator cursor;

	b_queue_foreach (&cursor, &node->s_children) {
		struct ivy_lexer_symbol_node *candidate = b_unbox(
			struct ivy_lexer_symbol_node, cursor.entry, s_entry);

		if (c == candidate->s_char) {
			return candidate;
		}
	}

	/* no child carries this character */
	return NULL;
}
/*
 * Insert the spelling of `sym` into the symbol tree rooted at `tree`,
 * creating one node per character where no matching child exists yet,
 * and record sym->id on the node reached by the final character.
 *
 * Returns IVY_OK, or IVY_ERR_NO_MEMORY if a node could not be allocated.
 * Nodes created before a failure stay linked into the tree.
 */
static enum ivy_status put_symbol(
	struct ivy_lexer_symbol_node *tree, struct lex_token_def *sym)
{
	for (size_t i = 0; sym->name[i]; i++) {
		char c = sym->name[i];

		struct ivy_lexer_symbol_node *child = get_symbol_node(tree, c);
		if (child) {
			tree = child;
			continue;
		}

		/*
		 * Zero the whole node: s_children is iterated by
		 * get_symbol_node() on later insertions and must start out
		 * as an empty queue, exactly like the memset() root node.
		 */
		child = calloc(1, sizeof *child);
		if (!child) {
			return IVY_ERR_NO_MEMORY;
		}

		child->s_id = IVY_SYM_NONE;
		child->s_char = c;
		b_queue_push_back(&tree->s_children, &child->s_entry);
		tree = child;
	}

	tree->s_id = sym->id;
	return IVY_OK;
}
/*
 * Free `tree` and, recursively, every node below it. Each child is
 * unlinked from its parent's queue before being destroyed.
 */
static void destroy_symbol_tree(struct ivy_lexer_symbol_node *tree)
{
	b_queue_iterator cursor;

	/* the erase call is what moves the iterator forward */
	for (b_queue_iterator_begin(&tree->s_children, &cursor);
	     b_queue_iterator_is_valid(&cursor);) {
		struct ivy_lexer_symbol_node *subtree = b_unbox(
			struct ivy_lexer_symbol_node, cursor.entry, s_entry);

		b_queue_iterator_erase(&cursor);
		destroy_symbol_tree(subtree);
	}

	free(tree);
}
/*
 * Allocate a zeroed root node and insert every entry of symbols[] below it.
 *
 * Returns the root, or NULL on allocation failure (any partially built
 * tree is destroyed first).
 */
static struct ivy_lexer_symbol_node *build_symbol_tree(void)
{
	struct ivy_lexer_symbol_node *root = calloc(1, sizeof *root);
	if (root == NULL) {
		return NULL;
	}

	root->s_id = IVY_SYM_NONE;

	for (size_t idx = 0; idx < nr_symbols; idx++) {
		if (put_symbol(root, &symbols[idx]) != IVY_OK) {
			destroy_symbol_tree(root);
			return NULL;
		}
	}

	return root;
}
/*
 * Debug helper: write `node` and its descendants to stdout, one node per
 * line, indented by one space per level of `depth`. Nodes that complete
 * a symbol are followed by the symbol's name in parentheses.
 */
static void print_symbol_node(struct ivy_lexer_symbol_node *node, int depth)
{
	for (int pad = depth; pad > 0; pad--) {
		putchar(' ');
	}

	putchar(node->s_char);

	if (node->s_id != IVY_SYM_NONE) {
		printf(" (%s)", ivy_symbol_to_string(node->s_id));
	}

	putchar('\n');

	b_queue_iterator cursor;
	b_queue_foreach (&cursor, &node->s_children) {
		struct ivy_lexer_symbol_node *sub = b_unbox(
			struct ivy_lexer_symbol_node, cursor.entry, s_entry);

		print_symbol_node(sub, depth + 1);
	}
}
static void init_keywords(void)
{
for (size_t i = 0; i < nr_keywords; i++) {
keywords[i].name_hash = b_hash_string(keywords[i].name);
}
}
static enum ivy_keyword find_keyword_by_name(const char *s)
{
uint64_t s_hash = b_hash_string(s);
for (size_t i = 0; i < nr_keywords; i++) {
struct lex_token_def *def = &keywords[i];
if (s_hash != def->name_hash) {
continue;
}
if (strcmp(s, def->name) != 0) {
continue;
}
return def->id;
}
return IVY_KW_NONE;
}
enum ivy_status ivy_lexer_init(struct ivy_lexer *lex)
{
memset(lex, 0x0, sizeof *lex);
lex->lex_status = IVY_OK;
lex->lex_prev_token = IVY_TOK_NONE;
lex->lex_linebuf = malloc(LINEBUF_DEFAULT_CAPACITY);
lex->lex_linebuf_cap = LINEBUF_DEFAULT_CAPACITY;
lex->lex_sym_tree = build_symbol_tree();
print_symbol_node(lex->lex_sym_tree, 0);
/* TODO only do keyword initialisation once */
init_keywords();
return IVY_OK;
}
void ivy_lexer_finish(struct ivy_lexer *lex)
{
while (lex->lex_queue) {
struct ivy_token *next = lex->lex_queue->t_next;
ivy_token_destroy(lex->lex_queue);
lex->lex_queue = next;
}
if (lex->lex_linebuf) {
free(lex->lex_linebuf);
}
if (lex->lex_sym_tree) {
destroy_symbol_tree(lex->lex_sym_tree);
}
memset(lex, 0x0, sizeof *lex);
}
static enum ivy_status refill_linebuf(struct ivy_lexer *lex)
{
if (!lex->lex_source) {
return IVY_ERR_EOF;
}
enum ivy_status status = ivy_line_source_readline(
lex->lex_source, lex->lex_linebuf, lex->lex_linebuf_cap,
&lex->lex_linebuf_len, NULL);
if (status == IVY_OK) {
lex->lex_linebuf_ptr = 0;
}
return status;
}
/*
 * Return the character at the current read position without consuming
 * it, refilling the line buffer first if it has been used up.
 *
 * On failure the refill status (or IVY_ERR_EOF for an empty buffer) is
 * returned in place of a character; callers tell the two apart with a
 * `< 0` test. The byte is therefore converted through unsigned char so
 * that bytes >= 0x80 in a plain-char buffer can never come back negative
 * and be mistaken for a status code.
 */
static int peek(struct ivy_lexer *lex)
{
	if (lex->lex_linebuf_ptr >= lex->lex_linebuf_len) {
		enum ivy_status status = refill_linebuf(lex);
		if (status != IVY_OK) {
			return status;
		}
	}

	if (lex->lex_linebuf_len == 0) {
		return IVY_ERR_EOF;
	}

	return (unsigned char)lex->lex_linebuf[lex->lex_linebuf_ptr];
}
/*
 * Return the character one past the current read position without
 * consuming anything, refilling the line buffer first if it has been
 * used up.
 *
 * The lookahead never crosses a buffer boundary: if the current
 * character is the last one buffered, IVY_ERR_EOF is returned. On
 * failure a status code is returned in place of a character; callers
 * tell the two apart with a `< 0` test, so the byte is converted through
 * unsigned char to keep bytes >= 0x80 from looking like a status code.
 */
static int peek_next(struct ivy_lexer *lex)
{
	if (lex->lex_linebuf_ptr >= lex->lex_linebuf_len) {
		enum ivy_status status = refill_linebuf(lex);
		if (status != IVY_OK) {
			return status;
		}
	}

	if (lex->lex_linebuf_len == 0) {
		return IVY_ERR_EOF;
	}

	if (lex->lex_linebuf_ptr + 1 >= lex->lex_linebuf_len) {
		return IVY_ERR_EOF;
	}

	return (unsigned char)lex->lex_linebuf[lex->lex_linebuf_ptr + 1];
}
/*
 * Consume and return the character at the current read position,
 * refilling the line buffer first if it has been used up.
 *
 * On failure the refill status (or IVY_ERR_EOF for an empty buffer) is
 * returned in place of a character and nothing is consumed; callers tell
 * the two apart with a `< 0` test. The byte is converted through
 * unsigned char so that bytes >= 0x80 (for example non-ASCII text inside
 * a comment) are never mistaken for a status code.
 */
static int advance(struct ivy_lexer *lex)
{
	if (lex->lex_linebuf_ptr >= lex->lex_linebuf_len) {
		enum ivy_status status = refill_linebuf(lex);
		if (status != IVY_OK) {
			return status;
		}
	}

	if (lex->lex_linebuf_len == 0) {
		return IVY_ERR_EOF;
	}

	return (unsigned char)lex->lex_linebuf[lex->lex_linebuf_ptr++];
}
static bool char_can_begin_symbol(char c)
{
for (size_t i = 0; i < nr_symbols; i++) {
if (symbols[i].name[0] == c) {
return true;
}
}
return false;
}
/*
 * Allocate a zero-filled token whose t_type is `type`.
 *
 * Returns NULL if the allocation fails.
 */
static struct ivy_token *create_token(enum ivy_token_type type)
{
	struct ivy_token *fresh = calloc(1, sizeof *fresh);

	if (fresh != NULL) {
		fresh->t_type = type;
	}

	return fresh;
}
/*
 * Append `tok` to the end of the lexer's token queue and remember its
 * type as the most recently pushed token type. Always returns IVY_OK.
 */
static enum ivy_status push_token(struct ivy_lexer *lex, struct ivy_token *tok)
{
	if (lex->lex_queue == NULL) {
		lex->lex_queue = tok;
	} else {
		struct ivy_token *tail = lex->lex_queue;

		while (tail->t_next != NULL) {
			tail = tail->t_next;
		}

		tail->t_next = tok;
	}

	lex->lex_prev_token = tok->t_type;
	return IVY_OK;
}
/*
 * Queue an IVY_TOK_LINEFEED token, unless the previously pushed token
 * was already a linefeed (in which case nothing is queued).
 *
 * Returns IVY_OK, or IVY_ERR_NO_MEMORY if the token cannot be allocated.
 */
static enum ivy_status push_linefeed(struct ivy_lexer *lex)
{
	if (lex->lex_prev_token == IVY_TOK_LINEFEED) {
		/* collapse consecutive linefeeds into one token */
		return IVY_OK;
	}

	struct ivy_token *linefeed = calloc(1, sizeof *linefeed);
	if (linefeed == NULL) {
		return IVY_ERR_NO_MEMORY;
	}

	linefeed->t_type = IVY_TOK_LINEFEED;
	return push_token(lex, linefeed);
}
/*
 * Queue an IVY_TOK_SYMBOL token carrying `sym`.
 *
 * Returns IVY_OK, or IVY_ERR_NO_MEMORY if the token cannot be allocated.
 */
static enum ivy_status push_symbol(struct ivy_lexer *lex, enum ivy_symbol sym)
{
	struct ivy_token *symbol_tok = calloc(1, sizeof *symbol_tok);
	if (symbol_tok == NULL) {
		return IVY_ERR_NO_MEMORY;
	}

	symbol_tok->t_type = IVY_TOK_SYMBOL;
	symbol_tok->t_symbol = sym;

	return push_token(lex, symbol_tok);
}
/*
 * Queue an IVY_TOK_KEYWORD token carrying `keyword`.
 *
 * Returns IVY_OK, or IVY_ERR_NO_MEMORY if the token cannot be allocated.
 */
static enum ivy_status push_keyword(struct ivy_lexer *lex, enum ivy_keyword keyword)
{
	struct ivy_token *keyword_tok = calloc(1, sizeof *keyword_tok);
	if (keyword_tok == NULL) {
		return IVY_ERR_NO_MEMORY;
	}

	keyword_tok->t_type = IVY_TOK_KEYWORD;
	keyword_tok->t_keyword = keyword;

	return push_token(lex, keyword_tok);
}
/*
 * Discard input up to and including the next '\n'.
 *
 * Reaching IVY_ERR_EOF also ends the comment and counts as success; any
 * other negative value returned by advance() is passed back as the
 * status.
 */
static enum ivy_status read_line_comment(struct ivy_lexer *lex)
{
	for (;;) {
		int ch = advance(lex);

		if (ch == '\n' || ch == IVY_ERR_EOF) {
			return IVY_OK;
		}

		if (ch < 0) {
			return ch;
		}
	}
}
/*
 * Discard the body of a block comment whose opening delimiter has already
 * been consumed. Block comments nest: every further opening delimiter
 * raises the depth and every closing delimiter lowers it, and the
 * function returns once the depth reaches zero.
 *
 * Returns IVY_OK when the comment is closed, or the negative status from
 * peek()/advance() if input ends or fails first.
 */
static enum ivy_status read_block_comment(struct ivy_lexer *lex)
{
	int depth = 1;
	/* previous character, or 0 when it must not pair with the next one */
	int prev = 0;

	while (depth > 0) {
		int c = peek(lex);
		if (c < 0) {
			return c;
		}

		/*
		 * A character that completes a delimiter is used up by it:
		 * clearing `prev` stops it from also starting the next
		 * delimiter. Without this, at depth >= 2 a closing delimiter
		 * directly followed by '*' would close and then wrongly
		 * reopen a level, and an opening delimiter directly followed
		 * by '/' would open and then wrongly close one.
		 */
		if (prev == '/' && c == '*') {
			depth++;
			prev = 0;
		} else if (prev == '*' && c == '/') {
			depth--;
			prev = 0;
		} else {
			prev = c;
		}

		int consumed = advance(lex);
		if (consumed < 0) {
			return consumed;
		}
	}

	return IVY_OK;
}
/*
 * Consume the longest run of characters that follows a path through the
 * symbol tree, then act on the symbol reached:
 *   - the block-comment opener hands over to read_block_comment();
 *   - the double hyphen hands over to read_line_comment();
 *   - anything else is queued as an IVY_TOK_SYMBOL token.
 *
 * Returns IVY_ERR_BAD_SYNTAX if the consumed characters do not end on a
 * node that completes a symbol (or if no symbol tree is available);
 * otherwise the status of the comment reader or of push_symbol().
 */
static enum ivy_status read_symbol(struct ivy_lexer *lex)
{
	struct ivy_lexer_symbol_node *node = lex->lex_sym_tree;
	if (!node) {
		return IVY_ERR_BAD_SYNTAX;
	}

	while (true) {
		int c = peek(lex);
		if (c < 0) {
			/*
			 * End of input or a read error: stop extending the
			 * match instead of looking up a status code as if it
			 * were a character.
			 */
			break;
		}

		struct ivy_lexer_symbol_node *next = get_symbol_node(node, c);
		if (!next) {
			break;
		}

		node = next;
		advance(lex);
	}

	if (node->s_id == IVY_SYM_NONE) {
		return IVY_ERR_BAD_SYNTAX;
	}

	if (node->s_id == IVY_SYM_FORWARD_SLASH_ASTERISK) {
		return read_block_comment(lex);
	} else if (node->s_id == IVY_SYM_DOUBLE_HYPHEN) {
		return read_line_comment(lex);
	}

	/* pass allocation failures on to the caller */
	return push_symbol(lex, node->s_id);
}
static enum ivy_status read_ident(struct ivy_lexer *lex)
{
b_string *str = b_string_create();
bool label = false;
while (true) {
int c = peek(lex);
if (c < 0) {
break;
}
if (c == ':' && peek_next(lex) != ':') {
advance(lex);
label = true;
break;
}
if (!isalnum(c) && c != '_') {
break;
}
char s[2] = {c, 0};
b_string_append_cstr(str, s);
advance(lex);
}
const char *s = b_string_ptr(str);
if (!strcmp(s, "_")) {
b_string_release(str);
push_symbol(lex, IVY_SYM_UNDERSCORE);
}
enum ivy_keyword keyword = IVY_KW_NONE;
if (!label && (keyword = find_keyword_by_name(s)) != IVY_KW_NONE) {
b_string_release(str);
return push_keyword(lex, keyword);
}
struct ivy_token *tok
= create_token(label ? IVY_TOK_LABEL : IVY_TOK_IDENT);
tok->t_str = b_string_steal(str);
b_string_release(str);
return push_token(lex, tok);
}
/*
 * Lex one step of input: skip blanks, then either collapse a run of
 * newlines into a linefeed token or hand over to read_symbol() /
 * read_ident(). A successful call may queue no token at all (comments,
 * duplicate linefeeds); callers loop until the queue is non-empty.
 *
 * Returns IVY_OK on success, the negative status from peek() when input
 * ends or fails, or IVY_ERR_BAD_SYNTAX for a character that can start
 * neither a symbol nor an identifier.
 */
static enum ivy_status pump_tokens(struct ivy_lexer *lex)
{
	int c = peek(lex);

	/*
	 * Skip whitespace but stop at '\n': newlines are significant and
	 * must reach the linefeed handling below. Skipping them here would
	 * lose the linefeed after trailing blanks or a '\r'. The `c >= 0`
	 * test keeps status codes away from isspace().
	 */
	while (c >= 0 && c != '\n' && isspace(c)) {
		advance(lex);
		c = peek(lex);
	}

	if (c < 0) {
		/* report the real status (e.g. end of input), not bad syntax */
		return c;
	}

	if (c == '\n') {
		while (c == '\n') {
			advance(lex);
			c = peek(lex);
		}

		if (c < 0) {
			return c;
		}

		return push_linefeed(lex);
	}

	if (char_can_begin_symbol(c)) {
		return read_symbol(lex);
	}

	if (isalpha(c) || c == '_') {
		return read_ident(lex);
	}

	return IVY_ERR_BAD_SYNTAX;
}
struct ivy_token *ivy_lexer_peek(struct ivy_lexer *lex)
{
enum ivy_status status = IVY_OK;
while (!lex->lex_queue) {
status = pump_tokens(lex);
if (status != IVY_OK) {
lex->lex_status = status;
return NULL;
}
}
lex->lex_status = status;
struct ivy_token *tok = lex->lex_queue;
return tok;
}
/*
 * Remove and return the token at the head of the queue, lexing more
 * input until at least one token is queued.
 *
 * On success lex_status is set to IVY_OK, matching ivy_lexer_peek(), so
 * an earlier failure does not stay visible after a successful read. If
 * lexing fails, lex_status receives the failing status and NULL is
 * returned.
 *
 * NOTE(review): the returned token is presumably owned by the caller
 * and released with ivy_token_destroy(); confirm against the header.
 */
struct ivy_token *ivy_lexer_read(struct ivy_lexer *lex)
{
	enum ivy_status status = IVY_OK;

	while (!lex->lex_queue) {
		status = pump_tokens(lex);
		if (status != IVY_OK) {
			lex->lex_status = status;
			return NULL;
		}
	}

	lex->lex_status = status;

	struct ivy_token *tok = lex->lex_queue;
	lex->lex_queue = lex->lex_queue->t_next;

	return tok;
}
void ivy_token_destroy(struct ivy_token *tok)
{
switch (tok->t_type) {
case IVY_TOK_ATOM:
case IVY_TOK_STRING:
case IVY_TOK_IDENT:
free(tok->t_str);
break;
default:
break;
}
free(tok);
}
/*
 * Expands to a `case` label for the enumerator `x` that returns the
 * enumerator's own spelling as a string literal. For use inside the
 * switch statements of the *_to_string() functions.
 */
#define ENUM_STR(x) \
case x: \
return #x
/*
 * Return the enumerator name of a token type as a static string, or ""
 * for a value that is not listed below.
 */
const char *ivy_lex_token_type_to_string(enum ivy_token_type type)
{
	switch (type) {
	case IVY_TOK_NONE: return "IVY_TOK_NONE";
	case IVY_TOK_KEYWORD: return "IVY_TOK_KEYWORD";
	case IVY_TOK_SYMBOL: return "IVY_TOK_SYMBOL";
	case IVY_TOK_ATOM: return "IVY_TOK_ATOM";
	case IVY_TOK_NUMBER: return "IVY_TOK_NUMBER";
	case IVY_TOK_LABEL: return "IVY_TOK_LABEL";
	case IVY_TOK_IDENT: return "IVY_TOK_IDENT";
	case IVY_TOK_STRING: return "IVY_TOK_STRING";
	case IVY_TOK_STR_START: return "IVY_TOK_STR_START";
	case IVY_TOK_STR_END: return "IVY_TOK_STR_END";
	case IVY_TOK_LINEFEED: return "IVY_TOK_LINEFEED";
	default: return "";
	}
}
/*
 * Return the enumerator name of a keyword as a static string, or "" for
 * a value that is not listed below.
 */
const char *ivy_keyword_to_string(enum ivy_keyword keyword)
{
	switch (keyword) {
	case IVY_KW_NONE: return "IVY_KW_NONE";
	case IVY_KW_PACKAGE: return "IVY_KW_PACKAGE";
	case IVY_KW_USE: return "IVY_KW_USE";
	case IVY_KW_CLASS: return "IVY_KW_CLASS";
	case IVY_KW_PROTOCOL: return "IVY_KW_PROTOCOL";
	case IVY_KW_TRY: return "IVY_KW_TRY";
	case IVY_KW_THROW: return "IVY_KW_THROW";
	case IVY_KW_CATCH: return "IVY_KW_CATCH";
	case IVY_KW_IF: return "IVY_KW_IF";
	case IVY_KW_AND: return "IVY_KW_AND";
	case IVY_KW_OR: return "IVY_KW_OR";
	case IVY_KW_IS: return "IVY_KW_IS";
	case IVY_KW_NOT: return "IVY_KW_NOT";
	case IVY_KW_ELSE: return "IVY_KW_ELSE";
	case IVY_KW_WHILE: return "IVY_KW_WHILE";
	case IVY_KW_FOR: return "IVY_KW_FOR";
	case IVY_KW_MATCH: return "IVY_KW_MATCH";
	case IVY_KW_UNLESS: return "IVY_KW_UNLESS";
	case IVY_KW_IN: return "IVY_KW_IN";
	case IVY_KW_DO: return "IVY_KW_DO";
	case IVY_KW_END: return "IVY_KW_END";
	default: return "";
	}
}
/*
 * Return the enumerator name of a symbol as a static string, or "" for a
 * value that is not listed below.
 */
const char *ivy_symbol_to_string(enum ivy_symbol sym)
{
	switch (sym) {
	case IVY_SYM_NONE: return "IVY_SYM_NONE";
	case IVY_SYM_DOT: return "IVY_SYM_DOT";
	case IVY_SYM_LEFT_BRACE: return "IVY_SYM_LEFT_BRACE";
	case IVY_SYM_RIGHT_BRACE: return "IVY_SYM_RIGHT_BRACE";
	case IVY_SYM_LEFT_BRACKET: return "IVY_SYM_LEFT_BRACKET";
	case IVY_SYM_RIGHT_BRACKET: return "IVY_SYM_RIGHT_BRACKET";
	case IVY_SYM_LEFT_PAREN: return "IVY_SYM_LEFT_PAREN";
	case IVY_SYM_RIGHT_PAREN: return "IVY_SYM_RIGHT_PAREN";
	case IVY_SYM_LEFT_ANGLE: return "IVY_SYM_LEFT_ANGLE";
	case IVY_SYM_RIGHT_ANGLE: return "IVY_SYM_RIGHT_ANGLE";
	case IVY_SYM_COLON: return "IVY_SYM_COLON";
	case IVY_SYM_DOUBLE_COLON: return "IVY_SYM_DOUBLE_COLON";
	case IVY_SYM_PLUS: return "IVY_SYM_PLUS";
	case IVY_SYM_HYPHEN: return "IVY_SYM_HYPHEN";
	case IVY_SYM_DOUBLE_HYPHEN: return "IVY_SYM_DOUBLE_HYPHEN";
	case IVY_SYM_FORWARD_SLASH: return "IVY_SYM_FORWARD_SLASH";
	case IVY_SYM_ASTERISK: return "IVY_SYM_ASTERISK";
	case IVY_SYM_PERCENT: return "IVY_SYM_PERCENT";
	case IVY_SYM_AMPERSAND: return "IVY_SYM_AMPERSAND";
	case IVY_SYM_EQUAL: return "IVY_SYM_EQUAL";
	case IVY_SYM_DOUBLE_EQUAL: return "IVY_SYM_DOUBLE_EQUAL";
	case IVY_SYM_DOUBLE_LEFT_ANGLE: return "IVY_SYM_DOUBLE_LEFT_ANGLE";
	case IVY_SYM_DOUBLE_RIGHT_ANGLE: return "IVY_SYM_DOUBLE_RIGHT_ANGLE";
	case IVY_SYM_DOUBLE_LEFT_ANGLE_EQUAL: return "IVY_SYM_DOUBLE_LEFT_ANGLE_EQUAL";
	case IVY_SYM_DOUBLE_RIGHT_ANGLE_EQUAL: return "IVY_SYM_DOUBLE_RIGHT_ANGLE_EQUAL";
	case IVY_SYM_PLUS_EQUAL: return "IVY_SYM_PLUS_EQUAL";
	case IVY_SYM_HYPHEN_EQUAL: return "IVY_SYM_HYPHEN_EQUAL";
	case IVY_SYM_FORWARD_SLASH_EQUAL: return "IVY_SYM_FORWARD_SLASH_EQUAL";
	case IVY_SYM_ASTERISK_EQUAL: return "IVY_SYM_ASTERISK_EQUAL";
	case IVY_SYM_AMPERSAND_EQUAL: return "IVY_SYM_AMPERSAND_EQUAL";
	case IVY_SYM_PIPE_EQUAL: return "IVY_SYM_PIPE_EQUAL";
	case IVY_SYM_PERCENT_EQUAL: return "IVY_SYM_PERCENT_EQUAL";
	case IVY_SYM_CARET_EQUAL: return "IVY_SYM_CARET_EQUAL";
	case IVY_SYM_BANG: return "IVY_SYM_BANG";
	case IVY_SYM_PIPE: return "IVY_SYM_PIPE";
	case IVY_SYM_CARET: return "IVY_SYM_CARET";
	case IVY_SYM_UNDERSCORE: return "IVY_SYM_UNDERSCORE";
	case IVY_SYM_COMMA: return "IVY_SYM_COMMA";
	case IVY_SYM_DOLLAR: return "IVY_SYM_DOLLAR";
	case IVY_SYM_RIGHT_ARROW: return "IVY_SYM_RIGHT_ARROW";
	case IVY_SYM_BIG_RIGHT_ARROW: return "IVY_SYM_BIG_RIGHT_ARROW";
	case IVY_SYM_FORWARD_SLASH_ASTERISK: return "IVY_SYM_FORWARD_SLASH_ASTERISK";
	case IVY_SYM_ASTERISK_FORWARD_SLASH: return "IVY_SYM_ASTERISK_FORWARD_SLASH";
	default: return "";
	}
}