lang: implement identifier support in the lexer

This commit is contained in:
2024-11-13 21:37:00 +00:00
parent cbd8639605
commit 6b02a86ba4
3 changed files with 455 additions and 0 deletions

View File

@@ -2,3 +2,4 @@ file(GLOB_RECURSE lang_sources *.c *.h include/ivy/lang/*.h)
add_library(ivy-lang SHARED ${lang_sources})
target_include_directories(ivy-lang PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/)
target_link_libraries(ivy-lang ivy-common Bluelib::Core Bluelib::Object)

View File

@@ -0,0 +1,119 @@
#ifndef IVY_LANG_LEX_H_
#define IVY_LANG_LEX_H_
#include <ivy/line-source.h>
#include <ivy/status.h>
/*
 * Discriminator stored in ivy_token.t_type; it selects which member of the
 * token's anonymous union is meaningful.
 *
 * ivy_token_destroy() frees t_str for ATOM, STRING and IDENT tokens only.
 */
enum ivy_token_type {
IVY_TOK_NONE = 0,
IVY_TOK_KEYWORD,
IVY_TOK_SYMBOL, /* t_symbol holds the enum ivy_symbol value */
IVY_TOK_ATOM,
IVY_TOK_NUMBER,
IVY_TOK_LABEL,
IVY_TOK_IDENT, /* t_str holds the identifier text */
IVY_TOK_STRING,
IVY_TOK_STR_START,
IVY_TOK_STR_END,
IVY_TOK_LINEFEED, /* emitted once for a run of consecutive '\n' characters */
};
/*
 * Reserved words of the language. Each value other than IVY_KW_NONE has a
 * matching spelling in the keywords[] table in lex.c.
 */
enum ivy_keyword {
IVY_KW_NONE = 0,
IVY_KW_PACKAGE,
IVY_KW_USE,
IVY_KW_CLASS,
IVY_KW_PROTOCOL,
IVY_KW_TRY,
IVY_KW_THROW,
IVY_KW_CATCH,
IVY_KW_IF,
IVY_KW_AND,
IVY_KW_OR,
IVY_KW_IS,
IVY_KW_NOT,
IVY_KW_ELSE,
IVY_KW_WHILE,
IVY_KW_FOR,
IVY_KW_MATCH,
IVY_KW_UNLESS,
IVY_KW_IN,
IVY_KW_DO,
IVY_KW_END,
};
enum ivy_symbol {
IVY_SYM_NONE = 0,
IVY_SYM_DOT,
IVY_SYM_LEFT_BRACE,
IVY_SYM_RIGHT_BRACE,
IVY_SYM_LEFT_BRACKET,
IVY_SYM_RIGHT_BRACKET,
IVY_SYM_LEFT_PAREN,
IVY_SYM_RIGHT_PAREN,
IVY_SYM_LEFT_ANGLE,
IVY_SYM_RIGHT_ANGLE,
IVY_SYM_COLON,
IVY_SYM_DOUBLE_COLON,
IVY_SYM_PLUS,
IVY_SYM_MINUS,
IVY_SYM_FORWARD_SLASH,
IVY_SYM_ASTERISK,
IVY_SYM_PERCENT,
IVY_SYM_AMPERSAND,
IVY_SYM_EQUAL,
IVY_SYM_DOUBLE_EQUAL,
IVY_SYM_DOUBLE_LEFT_ANGLE_EQUAL,
IVY_SYM_DOUBLE_RIGHT_ANGLE_EQUAL,
IVY_SYM_PLUS_EQUAL,
IVY_SYM_MINUS_EQUAL,
IVY_SYM_FORWARD_SLASH_EQUAL,
IVY_SYM_ASTERISK_EQUAL,
IVY_SYM_AMPERSAND_EQUAL,
IVY_SYM_PIPE_EQUAL,
IVY_SYM_PERCENT_EQUAL,
IVY_SYM_CARET_EQUAL,
IVY_SYM_BANG,
IVY_SYM_PIPE,
IVY_SYM_CARET,
IVY_SYM_UNDERSCORE,
IVY_SYM_COMMA,
IVY_SYM_DOLLAR,
IVY_SYM_RIGHT_ARROW,
IVY_SYM_BIG_RIGHT_ARROW,
IVY_SYM_FORWARD_SLASH_ASTERISK,
IVY_SYM_ASTERISK_FORWARD_SLASH,
};
/*
 * A single lexed token. Tokens are heap-allocated and released with
 * ivy_token_destroy().
 */
struct ivy_token {
/* selects the active member of the union below */
enum ivy_token_type t_type;
/* next token in the lexer's pending queue (singly-linked list) */
struct ivy_token *t_next;
union {
/* presumably valid when t_type == IVY_TOK_KEYWORD -- confirm */
enum ivy_keyword t_keyword;
/* valid when t_type == IVY_TOK_SYMBOL */
enum ivy_symbol t_symbol;
/* presumably valid when t_type == IVY_TOK_NUMBER -- confirm */
signed long long t_number;
/* heap string; freed by ivy_token_destroy() for ATOM, STRING and IDENT */
char *t_str;
};
};
/*
 * Lexer state. Initialise with ivy_lexer_init() and release with
 * ivy_lexer_finish().
 */
struct ivy_lexer {
/* line source the lexer reads from; ivy_lexer_init() leaves it NULL, and
 * while it is NULL every read reports IVY_ERR_EOF */
struct ivy_line_source *lex_source;
/* status recorded by ivy_lexer_peek()/ivy_lexer_read() when they return NULL */
enum ivy_status lex_status;
/* head of the FIFO of tokens that have been lexed but not yet read */
struct ivy_token *lex_queue;
/* buffer holding the current input line */
char *lex_linebuf;
/* number of valid bytes in lex_linebuf, as reported by the line source */
size_t lex_linebuf_len;
/* allocated size of lex_linebuf in bytes */
size_t lex_linebuf_cap;
/* index into lex_linebuf of the next unread byte */
size_t lex_linebuf_ptr;
};
/*
 * Zero `lex` and allocate its line buffer. Returns IVY_OK on success.
 * lex_source is left NULL and must be set before tokens can be read.
 */
extern enum ivy_status ivy_lexer_init(struct ivy_lexer *lex);
/*
 * Destroy every token still queued in `lex`, free the line buffer and zero
 * the structure.
 */
extern void ivy_lexer_finish(struct ivy_lexer *lex);
/*
 * Return the token at the head of the queue without removing it, lexing
 * more input first if the queue is empty. On failure returns NULL and
 * stores the status in lex->lex_status.
 */
extern struct ivy_token *ivy_lexer_peek(struct ivy_lexer *lex);
/*
 * Like ivy_lexer_peek(), but also removes the token from the queue. The
 * lexer no longer tracks the returned token, so ivy_lexer_finish() will not
 * free it; release it with ivy_token_destroy().
 */
extern struct ivy_token *ivy_lexer_read(struct ivy_lexer *lex);
/* Free `tok`, including t_str for ATOM, STRING and IDENT tokens. */
extern void ivy_token_destroy(struct ivy_token *tok);
#endif

335
lang/lex.c Normal file
View File

@@ -0,0 +1,335 @@
#include <blue/object/string.h>
#include <ctype.h>
#include <ivy/lang/lex.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
/* Size in bytes of the line buffer allocated by ivy_lexer_init(). */
#define LINEBUF_DEFAULT_CAPACITY 1024
/* Initializer for one struct lex_token_def table entry: id `i`, spelling `n`. */
#define LEX_TOKEN_DEF(i, n) \
{ \
.id = (i), .name = (n) \
}
/* Associates a keyword or symbol enum value with its source spelling. */
struct lex_token_def {
/* an enum ivy_keyword or enum ivy_symbol value, depending on the table */
int id;
/* NUL-terminated spelling as written in source text */
const char *name;
};
/*
 * Spelling of every reserved word, keyed by its enum ivy_keyword value.
 *
 * The table is read-only data, so it is declared const.
 * NOTE(review): nothing in this file consults the table yet, so reserved
 * words are currently lexed as plain identifiers.
 */
static const struct lex_token_def keywords[] = {
	LEX_TOKEN_DEF(IVY_KW_PACKAGE, "package"),
	LEX_TOKEN_DEF(IVY_KW_USE, "use"),
	LEX_TOKEN_DEF(IVY_KW_CLASS, "class"),
	LEX_TOKEN_DEF(IVY_KW_PROTOCOL, "protocol"),
	LEX_TOKEN_DEF(IVY_KW_TRY, "try"),
	LEX_TOKEN_DEF(IVY_KW_THROW, "throw"),
	LEX_TOKEN_DEF(IVY_KW_CATCH, "catch"),
	LEX_TOKEN_DEF(IVY_KW_IF, "if"),
	LEX_TOKEN_DEF(IVY_KW_AND, "and"),
	LEX_TOKEN_DEF(IVY_KW_OR, "or"),
	LEX_TOKEN_DEF(IVY_KW_IS, "is"),
	LEX_TOKEN_DEF(IVY_KW_NOT, "not"),
	LEX_TOKEN_DEF(IVY_KW_ELSE, "else"),
	LEX_TOKEN_DEF(IVY_KW_WHILE, "while"),
	LEX_TOKEN_DEF(IVY_KW_FOR, "for"),
	LEX_TOKEN_DEF(IVY_KW_MATCH, "match"),
	LEX_TOKEN_DEF(IVY_KW_UNLESS, "unless"),
	LEX_TOKEN_DEF(IVY_KW_IN, "in"),
	LEX_TOKEN_DEF(IVY_KW_DO, "do"),
	LEX_TOKEN_DEF(IVY_KW_END, "end"),
};
/* Number of entries in keywords[]. */
static const size_t nr_keywords = sizeof keywords / sizeof keywords[0];
/*
 * Spelling of each punctuation/operator token, keyed by its enum ivy_symbol
 * value. char_can_begin_symbol() scans the first character of every entry.
 *
 * The table is read-only data, so it is declared const.
 */
static const struct lex_token_def symbols[] = {
	LEX_TOKEN_DEF(IVY_SYM_DOT, "."),
	LEX_TOKEN_DEF(IVY_SYM_LEFT_BRACE, "{"),
	LEX_TOKEN_DEF(IVY_SYM_RIGHT_BRACE, "}"),
	LEX_TOKEN_DEF(IVY_SYM_LEFT_BRACKET, "["),
	LEX_TOKEN_DEF(IVY_SYM_RIGHT_BRACKET, "]"),
	LEX_TOKEN_DEF(IVY_SYM_LEFT_PAREN, "("),
	LEX_TOKEN_DEF(IVY_SYM_RIGHT_PAREN, ")"),
	LEX_TOKEN_DEF(IVY_SYM_LEFT_ANGLE, "<"),
	LEX_TOKEN_DEF(IVY_SYM_RIGHT_ANGLE, ">"),
	LEX_TOKEN_DEF(IVY_SYM_COLON, ":"),
	LEX_TOKEN_DEF(IVY_SYM_DOUBLE_COLON, "::"),
	LEX_TOKEN_DEF(IVY_SYM_PLUS, "+"),
	LEX_TOKEN_DEF(IVY_SYM_MINUS, "-"),
	LEX_TOKEN_DEF(IVY_SYM_FORWARD_SLASH, "/"),
	LEX_TOKEN_DEF(IVY_SYM_ASTERISK, "*"),
	LEX_TOKEN_DEF(IVY_SYM_PERCENT, "%"),
	LEX_TOKEN_DEF(IVY_SYM_AMPERSAND, "&"),
	LEX_TOKEN_DEF(IVY_SYM_EQUAL, "="),
	LEX_TOKEN_DEF(IVY_SYM_DOUBLE_EQUAL, "=="),
	LEX_TOKEN_DEF(IVY_SYM_DOUBLE_LEFT_ANGLE_EQUAL, "<<="),
	LEX_TOKEN_DEF(IVY_SYM_DOUBLE_RIGHT_ANGLE_EQUAL, ">>="),
	LEX_TOKEN_DEF(IVY_SYM_PLUS_EQUAL, "+="),
	LEX_TOKEN_DEF(IVY_SYM_MINUS_EQUAL, "-="),
	LEX_TOKEN_DEF(IVY_SYM_FORWARD_SLASH_EQUAL, "/="),
	LEX_TOKEN_DEF(IVY_SYM_ASTERISK_EQUAL, "*="),
	LEX_TOKEN_DEF(IVY_SYM_AMPERSAND_EQUAL, "&="),
	LEX_TOKEN_DEF(IVY_SYM_PIPE_EQUAL, "|="),
	LEX_TOKEN_DEF(IVY_SYM_PERCENT_EQUAL, "%="),
	LEX_TOKEN_DEF(IVY_SYM_CARET_EQUAL, "^="),
	LEX_TOKEN_DEF(IVY_SYM_BANG, "!"),
	LEX_TOKEN_DEF(IVY_SYM_PIPE, "|"),
	LEX_TOKEN_DEF(IVY_SYM_CARET, "^"),
	LEX_TOKEN_DEF(IVY_SYM_UNDERSCORE, "_"),
	LEX_TOKEN_DEF(IVY_SYM_COMMA, ","),
	LEX_TOKEN_DEF(IVY_SYM_DOLLAR, "$"),
	LEX_TOKEN_DEF(IVY_SYM_RIGHT_ARROW, "->"),
	LEX_TOKEN_DEF(IVY_SYM_BIG_RIGHT_ARROW, "=>"),
};
/* Number of entries in symbols[]. */
static const size_t nr_symbols = sizeof symbols / sizeof symbols[0];
enum ivy_status ivy_lexer_init(struct ivy_lexer *lex)
{
memset(lex, 0x0, sizeof *lex);
lex->lex_status = IVY_OK;
lex->lex_linebuf = malloc(LINEBUF_DEFAULT_CAPACITY);
lex->lex_linebuf_cap = LINEBUF_DEFAULT_CAPACITY;
return IVY_OK;
}
void ivy_lexer_finish(struct ivy_lexer *lex)
{
while (lex->lex_queue) {
struct ivy_token *next = lex->lex_queue->t_next;
ivy_token_destroy(lex->lex_queue);
lex->lex_queue = next;
}
if (lex->lex_linebuf) {
free(lex->lex_linebuf);
}
memset(lex, 0x0, sizeof *lex);
}
static enum ivy_status refill_linebuf(struct ivy_lexer *lex)
{
if (!lex->lex_source) {
return IVY_ERR_EOF;
}
return ivy_line_source_readline(
lex->lex_source, lex->lex_linebuf, lex->lex_linebuf_cap,
&lex->lex_linebuf_len, NULL);
}
/*
 * Return the next unread input byte without consuming it, refilling the
 * line buffer first when it is exhausted.
 *
 * On success the result is the byte value in the range 0..UCHAR_MAX. On
 * failure the result is an ivy_status value; callers tell the two apart
 * with `< 0`, which assumes error statuses are negative (TODO confirm
 * against ivy/status.h).
 */
static int peek(struct ivy_lexer *lex)
{
	if (lex->lex_linebuf_ptr >= lex->lex_linebuf_len) {
		enum ivy_status status = refill_linebuf(lex);
		if (status != IVY_OK) {
			return status;
		}
	}

	/* nothing readable (empty line, or cursor beyond the data): never
	 * index past the valid part of the buffer */
	if (lex->lex_linebuf_ptr >= lex->lex_linebuf_len) {
		return IVY_ERR_EOF;
	}

	/*
	 * Convert through unsigned char: plain char may be signed, and a
	 * sign-extended byte >= 0x80 would be mistaken for an error status
	 * and is not a valid argument for the <ctype.h> functions.
	 */
	return (unsigned char)lex->lex_linebuf[lex->lex_linebuf_ptr];
}
/*
 * Consume and return the next input byte, refilling the line buffer first
 * when it is exhausted.
 *
 * On success the result is the byte value in the range 0..UCHAR_MAX. On
 * failure the result is an ivy_status value; callers tell the two apart
 * with `< 0`, which assumes error statuses are negative (TODO confirm
 * against ivy/status.h).
 */
static int advance(struct ivy_lexer *lex)
{
	if (lex->lex_linebuf_ptr >= lex->lex_linebuf_len) {
		enum ivy_status status = refill_linebuf(lex);
		if (status != IVY_OK) {
			return status;
		}
	}

	/* nothing readable (empty line, or cursor beyond the data): never
	 * index past the valid part of the buffer */
	if (lex->lex_linebuf_ptr >= lex->lex_linebuf_len) {
		return IVY_ERR_EOF;
	}

	/*
	 * Convert through unsigned char: plain char may be signed, and a
	 * sign-extended byte >= 0x80 would be mistaken for an error status.
	 */
	return (unsigned char)lex->lex_linebuf[lex->lex_linebuf_ptr++];
}
/*
 * Report whether `c` is the first character of any spelling listed in the
 * symbols[] table.
 */
static bool char_can_begin_symbol(char c)
{
	size_t idx = 0;

	/* stop at the first entry whose spelling starts with c */
	while (idx < nr_symbols && symbols[idx].name[0] != c) {
		idx++;
	}

	return idx < nr_symbols;
}
/*
 * Allocate a zero-filled token whose t_type is `type`.
 * Returns NULL if the allocation fails.
 */
static struct ivy_token *create_token(enum ivy_token_type type)
{
	/* calloc hands back zeroed storage, so t_next and the union start cleared */
	struct ivy_token *fresh = calloc(1, sizeof *fresh);

	if (fresh) {
		fresh->t_type = type;
	}

	return fresh;
}
/*
 * Append `tok` to the end of the lexer's pending-token queue.
 * Always returns IVY_OK.
 */
static enum ivy_status push_token(struct ivy_lexer *lex, struct ivy_token *tok)
{
	/* empty queue: the new token becomes the head */
	if (!lex->lex_queue) {
		lex->lex_queue = tok;
		return IVY_OK;
	}

	/* otherwise walk to the last node and link the token after it */
	struct ivy_token *tail = lex->lex_queue;
	while (tail->t_next) {
		tail = tail->t_next;
	}
	tail->t_next = tok;

	return IVY_OK;
}
/*
 * Queue a new IVY_TOK_LINEFEED token.
 * Returns IVY_ERR_NO_MEMORY if the token cannot be allocated, otherwise the
 * result of push_token().
 */
static enum ivy_status push_linefeed(struct ivy_lexer *lex)
{
	/* create_token() performs the same allocate-zero-tag sequence */
	struct ivy_token *lf = create_token(IVY_TOK_LINEFEED);

	if (lf == NULL) {
		return IVY_ERR_NO_MEMORY;
	}

	return push_token(lex, lf);
}
/*
 * Queue a new IVY_TOK_SYMBOL token carrying `sym`.
 * Returns IVY_ERR_NO_MEMORY if the token cannot be allocated, otherwise the
 * result of push_token().
 */
static enum ivy_status push_symbol(struct ivy_lexer *lex, enum ivy_symbol sym)
{
	/* create_token() performs the same allocate-zero-tag sequence */
	struct ivy_token *symtok = create_token(IVY_TOK_SYMBOL);

	if (symtok == NULL) {
		return IVY_ERR_NO_MEMORY;
	}

	symtok->t_symbol = sym;
	return push_token(lex, symtok);
}
static enum ivy_status read_ident(struct ivy_lexer *lex)
{
b_string *str = b_string_create();
int c = peek(lex);
while (true) {
if (c < 0) {
break;
}
if (!isalnum(c) && c != '_') {
break;
}
char s[2] = {c, 0};
b_string_append_cstr(str, s);
}
const char *s = b_string_ptr(str);
if (!strcmp(s, "_")) {
b_string_release(str);
push_symbol(lex, IVY_SYM_UNDERSCORE);
}
struct ivy_token *tok = create_token(IVY_TOK_IDENT);
tok->t_str = b_string_steal(str);
b_string_release(str);
push_token(lex, tok);
return IVY_OK;
}
static enum ivy_status pump_tokens(struct ivy_lexer *lex)
{
enum ivy_status status;
int c = peek(lex);
if (c < 0) {
return c;
}
if (c == '\n') {
while (c == '\n') {
advance(lex);
c = peek(lex);
}
if (c < 0) {
return c;
}
return push_linefeed(lex);
}
if (isalpha(c) || c == '_') {
return read_ident(lex);
}
return IVY_ERR_BAD_SYNTAX;
}
struct ivy_token *ivy_lexer_peek(struct ivy_lexer *lex)
{
enum ivy_status status = IVY_OK;
if (!lex->lex_queue) {
status = pump_tokens(lex);
}
if (status != IVY_OK) {
lex->lex_status = status;
return NULL;
}
struct ivy_token *tok = lex->lex_queue;
return tok;
}
struct ivy_token *ivy_lexer_read(struct ivy_lexer *lex)
{
enum ivy_status status = IVY_OK;
if (!lex->lex_queue) {
status = pump_tokens(lex);
}
if (status != IVY_OK) {
lex->lex_status = status;
return NULL;
}
struct ivy_token *tok = lex->lex_queue;
lex->lex_queue = lex->lex_queue->t_next;
return tok;
}
/*
 * Free `tok` and, for token types that own a string (ATOM, STRING, IDENT),
 * the string in t_str. Passing NULL is a no-op, mirroring free().
 *
 * Only the token itself is released; tokens linked through t_next are not
 * touched.
 */
void ivy_token_destroy(struct ivy_token *tok)
{
	if (!tok) {
		return;
	}

	switch (tok->t_type) {
	case IVY_TOK_ATOM:
	case IVY_TOK_STRING:
	case IVY_TOK_IDENT:
		free(tok->t_str);
		break;
	default:
		break;
	}

	free(tok);
}