Files
rosetta/toolchain/xpcg/lex.c

744 lines
14 KiB
C
Raw Normal View History

#include "lex.h"
#include "line-source.h"
#include "token.h"
#include <blue/core/hash.h>
#include <blue/core/misc.h>
#include <blue/core/queue.h>
#include <blue/ds/dict.h>
#include <blue/ds/number.h>
#include <blue/ds/string.h>
#include <ctype.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wctype.h>
/* Default line-buffer capacity (not referenced in the visible part of this
 * file). */
#define LINEBUF_DEFAULT_CAPACITY 1024

/* Builds a `struct lex_token_def` initialiser from a token id `i` and its
 * spelling `n`. */
#define LEX_TOKEN_DEF(i, n) {.id = (i), .name = (n)}

/* The character-class macros below parenthesise their argument so that an
 * expression argument (e.g. a conditional expression) expands with the
 * intended precedence. The argument is still evaluated more than once, so do
 * not pass expressions with side effects. */

/* True if `c` may appear inside an identifier: alphanumeric, '.', '-', '_'. */
#define IS_VALID_IDENT_CHAR(c) \
    (b_wchar_is_alnum(c) || (c) == '.' || (c) == '-' || (c) == '_')

/* True if `c` may begin an identifier: alphabetic, '.', '_'. */
#define IS_VALID_IDENT_START_CHAR(c) \
    (b_wchar_is_alpha(c) || (c) == '.' || (c) == '_')

/* True if `c` is alphanumeric, '.' or '_'. */
#define IS_VALID_REG_START_CHAR(c) \
    (b_wchar_is_alnum(c) || (c) == '.' || (c) == '_')
/* Table of symbol tokens: each entry pairs a SYM_* id with its literal
 * spelling. build_symbol_tree() inserts every entry into the symbol tree via
 * put_symbol(), and char_can_begin_symbol() scans the first character of each
 * spelling. */
static struct lex_token_def symbols[] = {
LEX_TOKEN_DEF(SYM_COMMA, ","),
LEX_TOKEN_DEF(SYM_HYPHEN, "-"),
LEX_TOKEN_DEF(SYM_LEFT_BRACKET, "["),
LEX_TOKEN_DEF(SYM_RIGHT_BRACKET, "]"),
LEX_TOKEN_DEF(SYM_LEFT_BRACE, "{"),
LEX_TOKEN_DEF(SYM_RIGHT_BRACE, "}"),
LEX_TOKEN_DEF(SYM_LEFT_PAREN, "("),
LEX_TOKEN_DEF(SYM_RIGHT_PAREN, ")"),
LEX_TOKEN_DEF(SYM_SEMICOLON, ";"),
LEX_TOKEN_DEF(SYM_COLON, ":"),
/* Shares its first character with SYM_HYPHEN. */
LEX_TOKEN_DEF(SYM_HYPHEN_RIGHT_ANGLE, "->"),
};
/* Number of entries in `symbols`. */
static const size_t nr_symbols = sizeof symbols / sizeof symbols[0];
/* Table of reserved words: each entry pairs a KW_* id with its spelling.
 * find_keyword() compares identifier text against these names. */
static struct lex_token_def keywords[] = {
LEX_TOKEN_DEF(KW_INTERFACE, "interface"),
LEX_TOKEN_DEF(KW_FUNC, "func"),
};
/* Number of entries in `keywords`. */
static const size_t nr_keywords = sizeof keywords / sizeof keywords[0];
static struct lex_symbol_node *get_symbol_node(
struct lex_symbol_node *node,
char c)
{
b_queue_entry *entry = b_queue_first(&node->s_children);
while (entry) {
struct lex_symbol_node *child
= b_unbox(struct lex_symbol_node, entry, s_entry);
if (child->s_char == c) {
return child;
}
entry = b_queue_next(entry);
}
return NULL;
}
/*
 * Returns the lexer's scratch string, emptied and ready for reuse.
 *
 * The string is created on first use and cached in `lex->lex_temp`; every
 * call clears it before handing it back.
 */
static b_string *get_temp_string(struct lex *lex)
{
    b_string *scratch = lex->lex_temp;

    if (!scratch) {
        scratch = b_string_create();
        lex->lex_temp = scratch;
    }

    b_string_clear(scratch);
    return scratch;
}
/*
 * Inserts the spelling of `sym` into the symbol tree rooted at `tree`,
 * creating one node per character where no matching child exists yet, and
 * records `sym` on the node reached by the last character.
 *
 * Returns SUCCESS, or ERR_NO_MEMORY if a node cannot be allocated.
 */
static enum status put_symbol(
    struct lex_symbol_node *tree,
    struct lex_token_def *sym)
{
    struct lex_symbol_node *cursor = tree;

    for (const char *p = sym->name; *p != '\0'; p++) {
        struct lex_symbol_node *next = get_symbol_node(cursor, *p);

        if (!next) {
            /* No child for this character yet: add a zeroed node. */
            next = calloc(1, sizeof *next);
            if (!next) {
                return ERR_NO_MEMORY;
            }

            next->s_def = NULL;
            next->s_char = *p;
            b_queue_push_back(&cursor->s_children, &next->s_entry);
        }

        cursor = next;
    }

    cursor->s_def = sym;
    return SUCCESS;
}
static void destroy_symbol_tree(struct lex_symbol_node *tree)
{
b_queue_entry *entry = b_queue_first(&tree->s_children);
while (entry) {
struct lex_symbol_node *node
= b_unbox(struct lex_symbol_node, entry, s_entry);
b_queue_entry *next = b_queue_next(entry);
b_queue_delete(&tree->s_children, entry);
destroy_symbol_tree(node);
entry = next;
}
free(tree);
}
/*
 * Allocates a root node and inserts every entry of `symbols` beneath it.
 *
 * Returns the root, or NULL if the root or any node could not be allocated
 * (a partially built tree is destroyed first).
 */
static struct lex_symbol_node *build_symbol_tree(void)
{
    struct lex_symbol_node *root = calloc(1, sizeof *root);
    if (!root) {
        return NULL;
    }

    root->s_def = NULL;

    size_t idx = 0;
    while (idx < nr_symbols) {
        if (put_symbol(root, &symbols[idx]) != SUCCESS) {
            destroy_symbol_tree(root);
            return NULL;
        }
        idx++;
    }

    return root;
}
/*
 * Creates a lexer that reads from `src`.
 *
 * The lexer starts zero-initialised with status SUCCESS and a freshly built
 * symbol tree. Returns NULL if the lexer or its symbol tree cannot be
 * allocated.
 */
struct lex *lex_create(struct line_source *src)
{
    struct lex *self = calloc(1, sizeof *self);
    if (self == NULL) {
        return NULL;
    }

    self->lex_status = SUCCESS;
    self->lex_source = src;
    self->lex_sym_tree = build_symbol_tree();

    if (self->lex_sym_tree != NULL) {
        return self;
    }

    /* Tree construction failed: release the half-built lexer. */
    lex_destroy(self);
    return NULL;
}
void lex_destroy(struct lex *lex)
{
b_queue_entry *entry = b_queue_first(&lex->lex_queue);
while (entry) {
struct token *tok = b_unbox(struct token, entry, tok_entry);
b_queue_entry *next = b_queue_next(entry);
b_queue_delete(&lex->lex_queue, entry);
token_destroy(tok);
entry = next;
}
if (lex->lex_sym_tree) {
destroy_symbol_tree(lex->lex_sym_tree);
}
if (lex->lex_temp) {
b_string_unref(lex->lex_temp);
}
free(lex);
}
enum status lex_get_status(const struct lex *lex)
{
return lex->lex_status;
}
/* Returns the line source this lexer was created with. */
struct line_source *lex_get_line_source(const struct lex *lex)
{
    struct line_source *src = lex->lex_source;
    return src;
}
const struct file_cell *lex_get_cursor(const struct lex *lex)
{
return &lex->lex_source->s_cursor;
}
/*
 * Reports whether `c` is the first character of any entry in `symbols`.
 *
 * NOTE(review): the parameter is a plain `char`, but pump_tokens() passes a
 * `b_wchar`; a wider value is narrowed on the way in — confirm callers only
 * pass values that fit.
 */
static bool char_can_begin_symbol(char c)
{
    size_t idx = 0;

    while (idx < nr_symbols) {
        if (c == symbols[idx].name[0]) {
            return true;
        }
        idx++;
    }

    return false;
}
/*
 * Allocates a zero-initialised token of the given type.
 *
 * Returns the new token, or NULL if allocation fails.
 */
static struct token *create_token(enum token_type type)
{
    struct token *fresh = calloc(1, sizeof *fresh);

    if (fresh != NULL) {
        fresh->tok_type = type;
    }

    return fresh;
}
static void set_token_start(struct lex *lex)
{
lex->lex_token_start = *line_source_get_cursor(lex->lex_source);
}
static void set_token_end(struct lex *lex)
{
lex->lex_token_end = *line_source_get_cursor(lex->lex_source);
}
/*
 * Stamps `tok` with the start/end positions last recorded on the lexer and
 * appends it to the back of the token queue. Always returns SUCCESS.
 */
static enum status push_token(struct lex *lex, struct token *tok)
{
    tok->tok_location.s_end = lex->lex_token_end;
    tok->tok_location.s_start = lex->lex_token_start;

    b_queue_push_back(&lex->lex_queue, &tok->tok_entry);

    return SUCCESS;
}
/*
 * Queues a symbol token carrying `sym`.
 *
 * Returns the result of push_token(), or ERR_NO_MEMORY if the token cannot
 * be allocated.
 */
static enum status push_symbol(struct lex *lex, enum token_symbol sym)
{
    struct token *fresh = calloc(1, sizeof *fresh);
    if (fresh == NULL) {
        return ERR_NO_MEMORY;
    }

    fresh->tok_type = TOK_SYMBOL;
    fresh->tok_value_type = TOK_V_SYMBOL;
    fresh->tok_sym = sym;

    return push_token(lex, fresh);
}
/*
 * Queues a keyword token carrying `kw`.
 *
 * Returns the result of push_token(), or ERR_NO_MEMORY if the token cannot
 * be allocated.
 */
static enum status push_keyword(struct lex *lex, enum token_keyword kw)
{
    struct token *fresh = calloc(1, sizeof *fresh);
    if (fresh == NULL) {
        return ERR_NO_MEMORY;
    }

    fresh->tok_type = TOK_KEYWORD;
    fresh->tok_value_type = TOK_V_KEYWORD;
    fresh->tok_kw = kw;

    return push_token(lex, fresh);
}
/*
 * Queues a token of type `type` whose value comes from the heap string `s`.
 *
 * Ownership of `s` passes to this function on every path: it is either
 * stored in the token (string value) or freed (integer value, or failure).
 *
 * If the whole of `s` parses as a base-10 integer that fits in `long long`,
 * the token carries that integer (TOK_V_INT); otherwise it carries the text
 * (TOK_V_STRING). An empty string is kept as a string rather than being
 * classified as the integer 0.
 *
 * Returns the result of push_token(), or ERR_NO_MEMORY if `s` is NULL or the
 * token cannot be allocated.
 */
static enum status push_string_token(
    struct lex *lex,
    enum token_type type,
    char *s)
{
    /* NOTE(review): a NULL string is assumed to mean an upstream allocation
     * failure — confirm against b_string_steal(). Passing NULL on to
     * strtoll() would be undefined behaviour. */
    if (!s) {
        return ERR_NO_MEMORY;
    }

    struct token *tok = malloc(sizeof *tok);
    if (!tok) {
        /* We own `s`; do not leak it on failure. */
        free(s);
        return ERR_NO_MEMORY;
    }

    memset(tok, 0x0, sizeof *tok);
    tok->tok_type = type;

    char *ep = NULL;
    errno = 0;
    long long v = strtoll(s, &ep, 10);

    /* Integer only if at least one character was consumed, nothing is left
     * over, and the value did not overflow. */
    if (ep != s && *ep == '\0' && errno != ERANGE) {
        tok->tok_int = v;
        tok->tok_value_type = TOK_V_INT;
        free(s);
    } else {
        tok->tok_str = s;
        tok->tok_value_type = TOK_V_STRING;
    }

    return push_token(lex, tok);
}
/*
 * Queues an integer token (TOK_INT) carrying the value `v`.
 *
 * Returns the result of push_token(), or ERR_NO_MEMORY if the token cannot
 * be allocated.
 */
static enum status push_int(struct lex *lex, unsigned long long v)
{
    struct token *fresh = calloc(1, sizeof *fresh);
    if (fresh == NULL) {
        return ERR_NO_MEMORY;
    }

    fresh->tok_type = TOK_INT;
    fresh->tok_value_type = TOK_V_INT;
    fresh->tok_int = v;

    return push_token(lex, fresh);
}
/*
 * Consumes characters up to and including the next '\n', or until the
 * source reports -ERR_EOF.
 *
 * Returns SUCCESS in both of those cases; any other negative value read
 * from the source is negated and returned as the status.
 */
static enum status read_line_comment(struct lex *lex)
{
    for (;;) {
        b_wchar ch = line_source_getc(lex->lex_source);

        if (ch == '\n' || ch == -ERR_EOF) {
            return SUCCESS;
        }

        if (ch < 0) {
            return -ch;
        }
    }
}
/*
 * Lexes an integer literal at the cursor and queues it as a TOK_INT token.
 *
 * Accepted forms: decimal digits, `0x` + hex digits, `0b` + binary digits.
 * Underscores are skipped as digit separators. The literal ends at
 * whitespace, punctuation or end of input; a '.' is rejected outright.
 *
 * `negate` is set by read_symbol() when a '-' has already been consumed; the
 * token start recorded for that '-' is kept and the value is negated.
 *
 * Returns SUCCESS / ERR_NO_MEMORY from push_int(), ERR_BAD_SYNTAX for a
 * malformed or out-of-range literal, or the negated source error code.
 */
static enum status read_number(struct lex *lex, bool negate)
{
    int token_len = 0;
    int base = 10;
    bool leading_zero = false; /* first character was '0' */
    bool need_digit = false;   /* a base prefix was read but no digit yet */
    b_string *str = get_temp_string(lex);

    if (!negate) {
        set_token_start(lex);
    }

    while (true) {
        b_wchar c = line_source_peekc(lex->lex_source);
        if (c == -ERR_EOF) {
            break;
        }

        if (c < 0) {
            return -c;
        }

        if (c == '_') {
            token_len++;
            set_token_end(lex);
            line_source_getc(lex->lex_source);
            continue;
        }

        if (c == '.') {
            return ERR_BAD_SYNTAX;
        }

        if (b_wchar_is_space(c) || b_wchar_is_punct(c)) {
            break;
        }

        /* A base prefix is only valid directly after a leading '0', so
         * that e.g. "1x10" is rejected instead of being read as hex. */
        if (token_len == 1 && leading_zero && (c == 'x' || c == 'b')) {
            base = (c == 'x') ? 16 : 2;
            need_digit = true;
            token_len++;
            set_token_end(lex);
            line_source_getc(lex->lex_source);
            continue;
        }

        /* <ctype.h> classifiers are only defined for unsigned-char values
         * and EOF; every valid digit is ASCII, so reject anything wider
         * before calling them. */
        bool valid = false;
        if (c <= 0x7F) {
            switch (base) {
            case 2:
                valid = (c == '0' || c == '1');
                break;
            case 16:
                valid = isxdigit((int)c) != 0;
                break;
            default:
                valid = isdigit((int)c) != 0;
                break;
            }
        }

        if (!valid) {
            return ERR_BAD_SYNTAX;
        }

        if (token_len == 0 && c == '0') {
            leading_zero = true;
        }

        need_digit = false;
        b_string_append_wc(str, c);
        set_token_end(lex);
        line_source_getc(lex->lex_source);
        token_len++;
    }

    /* "0x" or "0b" with nothing after it is not a number. */
    if (need_digit) {
        return ERR_BAD_SYNTAX;
    }

    const char *s = b_string_ptr(str);
    char *ep = NULL;

    /* negative numbers will be lexed as a hyphen followed by a positive
     * number. */
    errno = 0;
    unsigned long long v = strtoull(s, &ep, base);
    if (ep == s || *ep != '\0' || errno == ERANGE) {
        return ERR_BAD_SYNTAX;
    }

    if (negate) {
        /* Unsigned negation wraps modulo 2^N, yielding the two's-complement
         * bit pattern of the negative value. */
        v = 0ULL - v;
    }

    return push_int(lex, v);
}
/*
 * Looks `s` up in the `keywords` table using an exact, case-sensitive
 * comparison.
 *
 * Returns the matching keyword id, or KW_NONE when `s` is not a keyword.
 */
static enum token_keyword find_keyword(const char *s)
{
    size_t idx = 0;

    while (idx < nr_keywords) {
        if (strcmp(keywords[idx].name, s) == 0) {
            return keywords[idx].id;
        }
        idx++;
    }

    return KW_NONE;
}
/*
 * Lexes an identifier-like run of characters and queues the resulting token.
 *
 * Characters are consumed while IS_VALID_IDENT_CHAR() holds. Two '.' or two
 * '-' in a row yield ERR_BAD_SYNTAX. If the text matches a keyword, a
 * keyword token is queued; otherwise the text is handed to
 * push_string_token().
 *
 * When `type` is TOK_NONE the token start is recorded here and the type is
 * chosen from the text: TOK_NAME if it contains a '.', TOK_WORD otherwise.
 * Any other `type` is used as given and the token start is left untouched.
 */
static enum status read_ident(struct lex *lex, enum token_type type)
{
    int dot_count = 0;
    b_wchar last = 0;
    b_string *buf = get_temp_string(lex);

    if (type == TOK_NONE) {
        set_token_start(lex);
    }

    for (;;) {
        b_wchar ch = line_source_peekc(lex->lex_source);
        bool is_separator = (ch == '.' || ch == '-');

        /* A doubled separator ("..", "--") is never valid. */
        if (is_separator && ch == last) {
            return ERR_BAD_SYNTAX;
        }

        if (!IS_VALID_IDENT_CHAR(ch)) {
            break;
        }

        if (ch == '.') {
            dot_count++;
        }

        last = ch;
        b_string_append_wc(buf, ch);
        set_token_end(lex);
        line_source_getc(lex->lex_source);
    }

    if (type == TOK_NONE) {
        type = (dot_count > 0) ? TOK_NAME : TOK_WORD;
    }

    char *text = b_string_steal(buf);
    enum token_keyword kw = find_keyword(text);

    if (kw == KW_NONE) {
        /* push_string_token() receives the stolen text. */
        return push_string_token(lex, type, text);
    }

    free(text);
    return push_keyword(lex, kw);
}
static enum status read_string(struct lex *lex)
{
b_string *str = get_temp_string(lex);
b_wchar c = line_source_peekc(lex->lex_source);
bool esc = false;
if (c != '"') {
return ERR_BAD_SYNTAX;
}
line_source_getc(lex->lex_source);
while (1) {
b_wchar c = line_source_peekc(lex->lex_source);
if (esc) {
switch (c) {
case '\\':
case '"':
b_string_append_wc(str, c);
break;
default:
return ERR_BAD_SYNTAX;
}
esc = false;
line_source_getc(lex->lex_source);
continue;
}
if (c == '\\') {
esc = true;
line_source_getc(lex->lex_source);
continue;
}
if (c == '"') {
line_source_getc(lex->lex_source);
break;
}
b_string_append_wc(str, c);
line_source_getc(lex->lex_source);
}
char *s = b_string_steal(str);
return push_string_token(lex, TOK_STRING, s);
}
/*
 * Lexes the longest symbol that matches at the cursor by walking the symbol
 * tree one character at a time, then queues it as a symbol token.
 *
 * If the matched symbol is a lone '-' and the next character is an ASCII
 * digit, lexing is handed to read_number() with `negate` set, so the pair is
 * lexed as a single negative integer.
 *
 * Returns ERR_BAD_SYNTAX if the consumed characters do not form a complete
 * symbol.
 */
static enum status read_symbol(struct lex *lex)
{
    struct lex_symbol_node *node = lex->lex_sym_tree;
    set_token_start(lex);
    b_wchar prev = 0;

    while (true) {
        b_wchar c = line_source_peekc(lex->lex_source);
        if (c < 0) {
            break;
        }

        /* get_symbol_node() takes a plain char. Every symbol is ASCII, so
         * treat a wider character as "no match" instead of letting it be
         * narrowed into some unrelated ASCII character. */
        struct lex_symbol_node *next = NULL;
        if (c <= 0x7F) {
            next = get_symbol_node(node, (char)c);
        }

        prev = c;
        if (!next) {
            break;
        }

        node = next;
        set_token_end(lex);
        line_source_getc(lex->lex_source);
    }

    if (!node || node->s_def == NULL) {
        return ERR_BAD_SYNTAX;
    }

    /* Explicit range test rather than isdigit(): `prev` may hold a wide
     * character, for which isdigit() is undefined. */
    if (node->s_def->id == SYM_HYPHEN && prev >= '0' && prev <= '9') {
        return read_number(lex, true);
    }

    return push_symbol(lex, node->s_def->id);
}
/*
 * Consumes characters from the source for as long as the next one is
 * whitespace according to b_wchar_is_space().
 */
static void skip_whitespace(struct lex *lex)
{
    for (;;) {
        b_wchar ch = line_source_peekc(lex->lex_source);

        if (!b_wchar_is_space(ch)) {
            return;
        }

        line_source_getc(lex->lex_source);
    }
}
/*
 * Decides whether `c` is ignorable whitespace.
 *
 * Any whitespace character qualifies when `skip_linefeeds` is true; when it
 * is false, '\n' is excluded.
 */
static bool should_skip(b_wchar c, bool skip_linefeeds)
{
    if (!b_wchar_is_space(c)) {
        return false;
    }

    return skip_linefeeds || c != '\n';
}
/*
 * Skips whitespace and '#' line comments at the cursor.
 *
 * Newlines count as skippable whitespace only when `include_linefeeds` is
 * true. A comment runs from '#' to the end of the line, and the newline that
 * terminates it is consumed as part of the comment regardless of
 * `include_linefeeds`. Skipping stops at the first character that is neither
 * whitespace nor the start of a comment, or when the source returns a
 * negative value (end of input or an error).
 */
static void skip_ignored_chars(struct lex *lex, bool include_linefeeds)
{
    b_wchar c = line_source_peekc(lex->lex_source);

    while (1) {
        while (should_skip(c, include_linefeeds)) {
            line_source_getc(lex->lex_source);
            c = line_source_peekc(lex->lex_source);
        }

        if (c != '#') {
            break;
        }

        /* Consume the '#' and the rest of the line. */
        line_source_getc(lex->lex_source);
        c = line_source_peekc(lex->lex_source);

        /* Stop on a negative value too: a comment on the last line with no
         * trailing newline would otherwise loop forever. */
        while (c >= 0 && c != '\n') {
            line_source_getc(lex->lex_source);
            c = line_source_peekc(lex->lex_source);
        }

        if (c < 0) {
            break;
        }

        /* Consume the newline that ended the comment. */
        line_source_getc(lex->lex_source);
        c = line_source_peekc(lex->lex_source);
    }
}
/*
 * Advances the lexer by one step: skips ignorable input, then dispatches on
 * the next character to the matching reader.
 *
 * - '#' comments and non-newline whitespace are skipped.
 * - A '\\' acts as a line continuation: it is consumed together with all
 *   following whitespace, including newlines.
 * - A run of '\n' is consumed without queueing a token.
 * - Otherwise an identifier, symbol, string or number is lexed and queued.
 *
 * Returns SUCCESS (possibly with nothing queued), ERR_BAD_SYNTAX for an
 * unexpected character, the negated source code when the source returns a
 * negative value, or whatever the reader returns.
 */
static enum status pump_tokens(struct lex *lex)
{
    b_wchar c = line_source_peekc(lex->lex_source);
    if (c < 0) {
        return -c;
    }

    while (c == '#' || (b_wchar_is_space(c) && c != '\n')) {
        skip_ignored_chars(lex, false);
        c = line_source_peekc(lex->lex_source);
    }

    if (c == '\\') {
        line_source_getc(lex->lex_source);
        skip_ignored_chars(lex, true);
        c = line_source_peekc(lex->lex_source);
    }

    if (c == '\n') {
        set_token_start(lex);
        set_token_end(lex);

        while (c == '\n') {
            line_source_getc(lex->lex_source);

            if (!line_source_input_available(lex->lex_source)) {
                break;
            }

            c = line_source_peekc(lex->lex_source);
        }

        if (c < 0) {
            return -c;
        }

        return SUCCESS;
    }

    /* Skipping may have reached end of input or hit an error; report that
     * instead of falling through to ERR_BAD_SYNTAX. */
    if (c < 0) {
        return -c;
    }

    if (IS_VALID_IDENT_START_CHAR(c)) {
        return read_ident(lex, TOK_NONE);
    }

    /* Symbols, quotes and digits are all ASCII. Restricting the tests below
     * to that range stops a wide character from being narrowed by
     * char_can_begin_symbol()'s char parameter. */
    if (c <= 0x7F) {
        if (char_can_begin_symbol((char)c)) {
            return read_symbol(lex);
        }

        if (c == '"') {
            return read_string(lex);
        }

        if (c >= '0' && c <= '9') {
            return read_number(lex, false);
        }
    }

    return ERR_BAD_SYNTAX;
}
struct token *lex_peek(struct lex *lex)
{
enum status status = SUCCESS;
while (b_queue_empty(&lex->lex_queue)) {
status = pump_tokens(lex);
if (status != SUCCESS) {
lex->lex_status = status;
return NULL;
}
}
lex->lex_status = status;
b_queue_entry *entry = b_queue_first(&lex->lex_queue);
struct token *tok = b_unbox(struct token, entry, tok_entry);
return tok;
}
void lex_advance(struct lex *lex)
{
enum status status = SUCCESS;
while (b_queue_empty(&lex->lex_queue)) {
status = pump_tokens(lex);
if (status != SUCCESS) {
lex->lex_status = status;
return;
}
}
b_queue_entry *entry = b_queue_pop_front(&lex->lex_queue);
struct token *tok = b_unbox(struct token, entry, tok_entry);
token_destroy(tok);
}
/*
 * Reports whether more tokens may be obtained: true if a token is already
 * queued or if the line source reports that input is available.
 */
bool lex_tokens_available(struct lex *lex)
{
    if (b_queue_empty(&lex->lex_queue)
        && !line_source_input_available(lex->lex_source)) {
        return false;
    }

    return true;
}