From 36dbdcee3228fe0f5dab1affe0d28b60a2ef45bb Mon Sep 17 00:00:00 2001
From: "Michael D. Lowis"
Date: Sat, 16 Mar 2019 21:14:55 -0400
Subject: [PATCH] added custom handcoded lexer

---
Review notes (text between "---" and the diffstat is ignored by git-am):
 * This patch arrived with its line breaks collapsed and its <...> spans
   stripped. Header names in #include lines and the whitespace of context
   lines are reconstructed by inference -- TODO confirm, and regenerate the
   patch against the tree before applying it. The blob id on the lex.c
   "index" line predates the edits below.
 * source/lex.c differs from the original submission as follows:
   - lexfile() allocated sizeof(pointer) bytes instead of sizeof(LexFile).
   - lookup tables are indexed through unsigned char (plain char may be
     signed, making bytes >= 0x80 negative indices).
   - comment and string scanning stop at the NUL terminator; an
     unterminated string is reported instead of scanning past the buffer.
   - true/false now get value.integer set (the T_BOOL case was unreachable
     because the type is only assigned inside the T_ID case).
   - lex() skips files whose contents failed to load (fpos == NULL).
   - file_load() closes descriptor 0 as well and does not read into a
     failed allocation.
   - strtoll/memcpy replace strtol/strncpy.

 Makefile        |   1 +
 source/lex.c    | 248 ++++++++++++++++++++++++++++++++++++++++++++++++
 source/lexer.l  |  23 ------
 source/main.c   |   6 +-
 source/pprint.c |  12 +--
 source/sclpl.h  |  13 ++-
 6 files changed, 268 insertions(+), 35 deletions(-)
 create mode 100644 source/lex.c

diff --git a/Makefile b/Makefile
index efe80dd..63881f1 100644
--- a/Makefile
+++ b/Makefile
@@ -23,6 +23,7 @@ OBJS = source/main.o \
        source/pprint.o \
        source/parser.o \
        source/lexer.o \
+       source/lex.o \
        source/ast.o \
        source/types.o \
        source/syms.o \
diff --git a/source/lex.c b/source/lex.c
new file mode 100644
index 0000000..b3ccd11
--- /dev/null
+++ b/source/lex.c
@@ -0,0 +1,248 @@
+#include <sclpl.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+/* Associates a reserved word with the token type reported for it. */
+typedef struct {
+    char* keyword;
+    int type;
+} KeywordDef;
+
+/* Class of a token's first byte; readtok() switches on it to pick a scanning
+ * rule. Bytes without an entry stay 0 and are reported as errors. */
+static const char FirstChar[256] = {
+    /* Whitespace */
+    [' '] = 1, ['\t'] = 1, ['\r'] = 1, ['\n'] = 1,
+    /* comment start */
+    ['#'] = 2,
+    /* number or op */
+    ['+'] = 3, ['-'] = 3,
+    /* number digits */
+    ['0'] = 4, ['1'] = 4, ['2'] = 4, ['3'] = 4, ['4'] = 4,
+    ['5'] = 4, ['6'] = 4, ['7'] = 4, ['8'] = 4, ['9'] = 4,
+    /* alpha characters */
+    ['A'] = 5, ['B'] = 5, ['C'] = 5, ['D'] = 5, ['E'] = 5,
+    ['F'] = 5, ['G'] = 5, ['H'] = 5, ['I'] = 5, ['J'] = 5,
+    ['K'] = 5, ['L'] = 5, ['M'] = 5, ['N'] = 5, ['O'] = 5,
+    ['P'] = 5, ['Q'] = 5, ['R'] = 5, ['S'] = 5, ['T'] = 5,
+    ['U'] = 5, ['V'] = 5, ['W'] = 5, ['X'] = 5, ['Y'] = 5,
+    ['Z'] = 5, ['a'] = 5, ['b'] = 5, ['c'] = 5, ['d'] = 5,
+    ['e'] = 5, ['f'] = 5, ['g'] = 5, ['h'] = 5, ['i'] = 5,
+    ['j'] = 5, ['k'] = 5, ['l'] = 5, ['m'] = 5, ['n'] = 5,
+    ['o'] = 5, ['p'] = 5, ['q'] = 5, ['r'] = 5, ['s'] = 5,
+    ['t'] = 5, ['u'] = 5, ['v'] = 5, ['w'] = 5, ['x'] = 5,
+    ['y'] = 5, ['z'] = 5,
+    /* punctuation */
+    ['('] = 6, [')'] = 6, ['['] = 6, [']'] = 6, ['{'] = 6, ['}'] = 6,
+    ['.'] = 6, [','] = 6, [':'] = 6, ['&'] = 6, ['='] = 6, [';'] = 6,
+    ['*'] = 6, ['\''] = 6,
+    /* strings */
+    ['"'] = 7
+};
+
+/* Membership tables indexed by byte value (0 = not a member). */
+char SPACE[256] = {
+    [' '] = 1, ['\t'] = 1, ['\r'] = 1, ['\n'] = 1,
+};
+
+char DIGIT[256] = {
+    ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
+    ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
+};
+
+char ALNUM_[256] = {
+    ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
+    ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
+    ['A'] = 1, ['B'] = 1, ['C'] = 1, ['D'] = 1, ['E'] = 1,
+    ['F'] = 1, ['G'] = 1, ['H'] = 1, ['I'] = 1, ['J'] = 1,
+    ['K'] = 1, ['L'] = 1, ['M'] = 1, ['N'] = 1, ['O'] = 1,
+    ['P'] = 1, ['Q'] = 1, ['R'] = 1, ['S'] = 1, ['T'] = 1,
+    ['U'] = 1, ['V'] = 1, ['W'] = 1, ['X'] = 1, ['Y'] = 1,
+    ['Z'] = 1, ['a'] = 1, ['b'] = 1, ['c'] = 1, ['d'] = 1,
+    ['e'] = 1, ['f'] = 1, ['g'] = 1, ['h'] = 1, ['i'] = 1,
+    ['j'] = 1, ['k'] = 1, ['l'] = 1, ['m'] = 1, ['n'] = 1,
+    ['o'] = 1, ['p'] = 1, ['q'] = 1, ['r'] = 1, ['s'] = 1,
+    ['t'] = 1, ['u'] = 1, ['v'] = 1, ['w'] = 1, ['x'] = 1,
+    ['y'] = 1, ['z'] = 1, ['_'] = 1,
+};
+
+/* Reserved words. Keep sorted in strcmp() order: convert_value() looks
+ * entries up with bsearch(). */
+#define NUM_KEYWORDS (sizeof(Keywords) / sizeof(Keywords[0]))
+KeywordDef Keywords[] = {
+    { "else", T_ELSE },
+    { "false", T_BOOL },
+    { "fun", T_FUN },
+    { "if", T_IF },
+    { "let", T_LET },
+    { "provide", T_PROVIDES },
+    { "require", T_REQUIRES },
+    { "return", T_RETURN },
+    { "struct", T_STRUCT },
+    { "true", T_BOOL },
+    { "type", T_TYPE },
+    { "union", T_UNION },
+    { "var", T_VAR },
+};
+
+/* bsearch() comparator ordering KeywordDef entries by keyword text. */
+static int keywcmp(const void* a, const void* b) {
+    return strcmp(((const KeywordDef*)a)->keyword, ((const KeywordDef*)b)->keyword);
+}
+
+/* Reads the whole file at path into a freshly allocated, NUL-terminated
+ * buffer. Returns NULL when the file cannot be opened or stat'ed, is empty,
+ * or the buffer cannot be allocated. */
+static inline char* file_load(char* path) {
+    int fd = -1, nread = 0, length = 0;
+    struct stat sb = {0};
+    char* contents = NULL;
+    if (((fd = open(path, O_RDONLY, 0)) >= 0) && (fstat(fd, &sb) >= 0) && (sb.st_size > 0)) {
+        contents = calloc(sb.st_size + 1u, 1u);
+        while (contents && sb.st_size && (nread = read(fd, contents+length, sb.st_size)) > 0)
+            length += nread, sb.st_size -= nread;
+    }
+    /* 0 is a valid descriptor, so test >= 0 to avoid leaking it */
+    if (fd >= 0) close(fd);
+    return contents;
+}
+
+/* Post-processes a freshly scanned token in place: strips the quotes from
+ * string literals, decodes integer values, and reclassifies identifiers that
+ * are reserved words. */
+static inline void convert_value(Tok* tok) {
+    switch (tok->type) {
+        case T_STRING: {
+            /* text is "...": drop the opening quote, then overwrite the
+             * closing quote with the terminator */
+            size_t len = strlen(tok->text+1);
+            char* strtext = malloc(len);
+            memcpy(strtext, tok->text+1, len);
+            strtext[len-1] = '\0';
+            free(tok->text), tok->text = strtext;
+            break;
+        }
+
+        case T_INT: {
+            /* value.integer is a long long, so parse at full width */
+            tok->value.integer = strtoll(tok->text, NULL, 0);
+            break;
+        }
+
+        case T_ID: {
+            KeywordDef key = { .keyword = tok->text };
+            KeywordDef* match = bsearch(
+                &key, Keywords, NUM_KEYWORDS, sizeof(KeywordDef), keywcmp);
+            if (match) tok->type = match->type;
+            /* true/false only become T_BOOL here, after the switch has
+             * already dispatched, so their value must be set here too */
+            if (tok->type == T_BOOL)
+                tok->value.integer = (tok->text[0] == 't');
+            break;
+        }
+
+        default:
+            break;
+    }
+}
+
+/* Scans one lexical element starting at the current file position and
+ * advances past it. Whitespace and comments leave tok->type untouched (lex()
+ * presets it to T_NONE); anything else fills in tok->type and tok->text and
+ * is passed through convert_value(). Exits the process on an illegal
+ * character or an unterminated string. */
+static inline void readtok(Parser* ctx) {
+    Tok* tok = &(ctx->tok);
+    char *beg = ctx->file->fpos, *curr = ctx->file->fpos;
+    tok->offset = (beg - ctx->file->fbeg);
+    /* index the tables through unsigned char: plain char may be signed and
+     * bytes >= 0x80 would otherwise read before the start of the table */
+    switch (FirstChar[(unsigned char)*curr++]) {
+        case 1: /* skip whitespace */
+            for (; SPACE[(unsigned char)*curr]; curr++);
+            break;
+
+        case 2: /* skip comments */
+            /* stop at the terminator too: the last line may lack a newline */
+            for (; *curr && *curr != '\n'; curr++);
+            break;
+
+        case 3: /* +/- as ops or number signs */
+            tok->type = *(curr-1);
+            if (!DIGIT[(unsigned char)*curr]) break;
+            /* fallthrough to number parsing */
+
+        case 4:
+            tok->type = T_INT;
+            for (; DIGIT[(unsigned char)*curr]; curr++);
+            break;
+
+        case 5:
+            tok->type = T_ID;
+            for (; ALNUM_[(unsigned char)*curr]; curr++);
+            break;
+
+        case 6: /* single char tokens */
+            tok->type = *(curr-1);
+            break;
+
+        case 7: /* string parsing */
+            tok->type = T_STRING;
+            for (; *curr && *curr != '"'; curr++);
+            if (!*curr) {
+                fprintf(stderr, "Unterminated string literal\n");
+                exit(1);
+            }
+            curr++;
+            break;
+
+        case 0: /* error handling */
+        default:
+            fprintf(stderr, "Failed to parse token '%c'\n", *(curr-1));
+            exit(1);
+    }
+
+    if (tok->type) {
+        size_t sz = (curr - beg);
+        tok->text = malloc(sz+1);
+        memcpy(tok->text, beg, sz);
+        tok->text[sz] = '\0';
+        convert_value(tok);
+    }
+
+    ctx->file->fpos = curr;
+}
+
+/* Pushes the file at path onto the front of ctx's list of files to lex. */
+void lexfile(Parser* ctx, char* path) {
+    /* size of the struct, not of the pointer to it */
+    LexFile* file = calloc(1u, sizeof(*file));
+    file->path = strdup(path);
+    file->fbeg = file->fpos = file_load(path);
+    file->next = ctx->file;
+    ctx->file = file;
+}
+
+/* Stores the next token in ctx->tok. Files whose input is exhausted are moved
+ * from ctx->file to ctx->done; once none remain the type is T_END_FILE. */
+void lex(Parser* ctx) {
+    ctx->tok.type = T_NONE;
+    while (ctx->tok.type == T_NONE) {
+        if (!ctx->file) {
+            /* no more files left to process */
+            ctx->tok.type = T_END_FILE;
+            return;
+        } else if (!ctx->file->fpos || !*(ctx->file->fpos)) {
+            /* grab the next file to process; fpos is NULL when the file
+             * was empty or could not be loaded */
+            LexFile* f = ctx->file;
+            ctx->file = f->next;
+            f->next = ctx->done;
+            ctx->done = f;
+        } else {
+            /* parse out a token */
+            readtok(ctx);
+        }
+    }
+}
diff --git a/source/lexer.l b/source/lexer.l
index 994adcd..4958cb4 100644
--- a/source/lexer.l
+++ b/source/lexer.l
@@ -1,7 +1,5 @@
 %{
 #include <sclpl.h>
-#include <sys/stat.h>
-#include <fcntl.h>
 
 static union {
     char* text;
@@ -126,27 +124,6 @@ false {
 
 %%
 
-static char* file_load(char* path) {
-    int fd = -1, nread = 0, length = 0;
-    struct stat sb = {0};
-    char* contents = NULL;
-    if (((fd = open(path, O_RDONLY, 0)) >= 0) && (fstat(fd, &sb) >= 0) && (sb.st_size > 0)) {
-        contents = calloc(sb.st_size + 1u, 1u);
-        while (sb.st_size && (nread = read(fd, contents+length, sb.st_size)) > 0)
-            length += nread, sb.st_size -= nread;
-    }
-    if (fd > 0) close(fd);
-    return contents;
-}
-
-void lexfile(Parser* ctx, char* path) {
-    LexFile* file = calloc(sizeof(file), 1u);
-    file->path = strdup(path);
-    file->fbeg = file->fpos = file_load(path);
-    file->next = ctx->file;
-    ctx->file = file;
-}
-
 void gettoken(Parser* ctx) {
     ctx->tok.line = yylineno;
     ctx->tok.type = yylex();
diff --git a/source/main.c b/source/main.c
index 5bb8c93..beb1f94 100644
--- a/source/main.c
+++ b/source/main.c
@@ -1,14 +1,16 @@
 #include <sclpl.h>
 
 char* ARGV0;
-char* Artifact = "bin";
+char* Artifact = "tok";
 
 /* Driver Modes
  *****************************************************************************/
 static int emit_tokens(int argc, char **argv) {
     Parser ctx = {0};
+    for (; argc; argc--,argv++)
+        lexfile(&ctx, *argv);
     while (1) {
-        gettoken(&ctx);
+        lex(&ctx);
         if (ctx.tok.type == T_END_FILE)
             break;
         else
diff --git a/source/pprint.c b/source/pprint.c
index 7eb8bba..f39eed0 100644
--- a/source/pprint.c
+++ b/source/pprint.c
@@ -8,7 +8,7 @@ static void print_indent(FILE* file, int depth) {
 static const char* token_type_to_string(int type) {
     #define TOK(name) case (name): return #name
     switch(type) {
-        TOK(T_NONE); TOK(T_ERROR); TOK(T_END_FILE); TOK(T_PACKAGE);
+        TOK(T_NONE); TOK(T_ERROR); TOK(T_END_FILE);
         TOK(T_REQUIRES); TOK(T_PROVIDES); TOK(T_LET); TOK(T_VAR); TOK(T_FUN);
         TOK(T_TYPE); TOK(T_STRUCT); TOK(T_UNION); TOK(T_RETURN); TOK(T_IF);
         TOK(T_ELSE); TOK(T_ID);
@@ -59,15 +59,15 @@ void pprint_token_value(FILE* file, Tok* token) {
     #define TOK(name) case (name): fprintf(file, "%s", #name); break
     switch(token->type) {
         /* value tokens */
-        case T_STRING: fprintf(file, "\"%s\"", token->value.text); break;
-        case T_ID: fprintf(file, "%s", token->value.text); break;
-        case T_CHAR: print_char(file, token->value.integer); break;
-        case T_INT: fprintf(file, "%lld", token->value.integer); break;
+        case T_STRING: fprintf(file, "\"%s\"", token->text); break;
+        case T_ID: fprintf(file, "%s", token->text); break;
+        case T_CHAR: print_char(file, token->value.integer); break;
+        case T_INT: fprintf(file, "%lld", token->value.integer); break;
         case T_FLOAT: fprintf(file, "%f", token->value.floating); break;
         case T_BOOL: fprintf(file, "%s", (token->value.integer)?"true":"false"); break;
 
         /* keyword tokens */
-        TOK(T_NONE); TOK(T_ERROR); TOK(T_END_FILE); TOK(T_PACKAGE);
+        TOK(T_NONE); TOK(T_ERROR); TOK(T_END_FILE);
         TOK(T_REQUIRES); TOK(T_PROVIDES); TOK(T_LET); TOK(T_VAR); TOK(T_FUN);
         TOK(T_TYPE); TOK(T_STRUCT); TOK(T_UNION); TOK(T_RETURN); TOK(T_IF);
         TOK(T_ELSE);
diff --git a/source/sclpl.h b/source/sclpl.h
index d70e568..156324c 100644
--- a/source/sclpl.h
+++ b/source/sclpl.h
@@ -23,10 +23,12 @@ static void* emalloc(size_t size) {
 /* Token Types
  *****************************************************************************/
 typedef enum {
-    T_NONE = 0, T_ERROR = 256, T_END_FILE,
-    T_PACKAGE, T_REQUIRES, T_PROVIDES, T_LET, T_VAR, T_FUN, T_TYPE, T_STRUCT,
+    T_NONE = 0,
+    T_STRING = 256, T_ID, T_INT, T_BOOL, T_CHAR, T_FLOAT,
+    T_REQUIRES, T_PROVIDES, T_LET, T_VAR, T_FUN, T_TYPE, T_STRUCT,
     T_UNION, T_RETURN, T_IF, T_ELSE,
-    T_ID, T_CHAR, T_INT, T_FLOAT, T_BOOL, T_STRING,
+    T_ERROR = -2,
+    T_END_FILE = -1
 } TokType;
 
 typedef struct {
@@ -34,7 +36,9 @@ typedef struct {
     size_t line;
     size_t col;
     TokType type;
-    union {
+    char* text;
+    long offset;
+    union {
         char* text;
         long long integer;
         double floating;
@@ -195,6 +199,7 @@ typedef struct LexFile {
 } LexFile;
 
 typedef struct {
+    LexFile* done;
     LexFile* file;
     Tok tok;
     SymTable syms;
-- 
2.52.0