From 36dbdcee3228fe0f5dab1affe0d28b60a2ef45bb Mon Sep 17 00:00:00 2001
From: "Michael D. Lowis" <mike@mdlowis.com>
Date: Sat, 16 Mar 2019 21:14:55 -0400
Subject: [PATCH] added custom handcoded lexer

---
 Makefile        |   1 +
 source/lex.c    | 215 ++++++++++++++++++++++++++++++++++++++++++++++++
 source/lexer.l  |  23 ------
 source/main.c   |   6 +-
 source/pprint.c |  12 +--
 source/sclpl.h  |  13 ++-
 6 files changed, 235 insertions(+), 35 deletions(-)
 create mode 100644 source/lex.c

diff --git a/Makefile b/Makefile
index efe80dd..63881f1 100644
--- a/Makefile
+++ b/Makefile
@@ -23,6 +23,7 @@ OBJS = source/main.o    \
        source/pprint.o  \
        source/parser.o  \
        source/lexer.o   \
+       source/lex.o   \
        source/ast.o     \
        source/types.o   \
        source/syms.o    \
diff --git a/source/lex.c b/source/lex.c
new file mode 100644
index 0000000..b3ccd11
--- /dev/null
+++ b/source/lex.c
@@ -0,0 +1,215 @@
+#include <sclpl.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+typedef struct {   /* one entry of the reserved-word table (see Keywords) */
+    char* keyword; /* spelling of the reserved word; the key keywcmp() compares */
+    int type;      /* token type assigned to an identifier with this spelling */
+} KeywordDef;
+
+static const char FirstChar[256] = { /* token class of each possible first byte; readtok() switches on it, 0 = error */
+    /* class 1: whitespace */
+    [' '] = 1, ['\t'] = 1, ['\r'] = 1, ['\n'] = 1,
+    /* class 2: comment start */
+    ['#'] = 2,
+    /* class 3: sign of a number, or an operator when no digit follows */
+    ['+'] = 3, ['-'] = 3,
+    /* class 4: number digits */
+    ['0'] = 4, ['1'] = 4, ['2'] = 4, ['3'] = 4, ['4'] = 4,
+    ['5'] = 4, ['6'] = 4, ['7'] = 4, ['8'] = 4, ['9'] = 4,
+    /* class 5: alpha characters, the start of an identifier or keyword */
+    ['A'] = 5, ['B'] = 5, ['C'] = 5, ['D'] = 5, ['E'] = 5,
+    ['F'] = 5, ['G'] = 5, ['H'] = 5, ['I'] = 5, ['J'] = 5,
+    ['K'] = 5, ['L'] = 5, ['M'] = 5, ['N'] = 5, ['O'] = 5,
+    ['P'] = 5, ['Q'] = 5, ['R'] = 5, ['S'] = 5, ['T'] = 5,
+    ['U'] = 5, ['V'] = 5, ['W'] = 5, ['X'] = 5, ['Y'] = 5,
+    ['Z'] = 5, ['a'] = 5, ['b'] = 5, ['c'] = 5, ['d'] = 5,
+    ['e'] = 5, ['f'] = 5, ['g'] = 5, ['h'] = 5, ['i'] = 5,
+    ['j'] = 5, ['k'] = 5, ['l'] = 5, ['m'] = 5, ['n'] = 5,
+    ['o'] = 5, ['p'] = 5, ['q'] = 5, ['r'] = 5, ['s'] = 5,
+    ['t'] = 5, ['u'] = 5, ['v'] = 5, ['w'] = 5, ['x'] = 5,
+    ['y'] = 5, ['z'] = 5,
+    /* class 6: punctuation, each lexed as a single-character token */
+    ['('] = 6, [')'] = 6, ['['] = 6, [']'] = 6, ['{'] = 6, ['}'] = 6,
+    ['.'] = 6, [','] = 6, [':'] = 6, ['&'] = 6, ['='] = 6, [';'] = 6,
+    ['*'] = 6, ['\''] = 6,
+    /* class 7: opening quote of a string */
+    ['"'] = 7
+};
+
+char SPACE[256] = { /* nonzero for the bytes readtok() skips as whitespace */
+    ['\t'] = 1, ['\n'] = 1, ['\r'] = 1, [' '] = 1,
+};
+
+char DIGIT[256] = { /* nonzero for the decimal digits '0' through '9' */
+    ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
+    ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
+};
+
+char ALNUM_[256] = { /* nonzero for bytes that may continue an identifier */
+    ['_'] = 1,
+    ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
+    ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
+    ['a'] = 1, ['b'] = 1, ['c'] = 1, ['d'] = 1, ['e'] = 1, ['f'] = 1,
+    ['g'] = 1, ['h'] = 1, ['i'] = 1, ['j'] = 1, ['k'] = 1,
+    ['l'] = 1, ['m'] = 1, ['n'] = 1, ['o'] = 1, ['p'] = 1,
+    ['q'] = 1, ['r'] = 1, ['s'] = 1, ['t'] = 1, ['u'] = 1,
+    ['v'] = 1, ['w'] = 1, ['x'] = 1, ['y'] = 1, ['z'] = 1,
+    ['A'] = 1, ['B'] = 1, ['C'] = 1, ['D'] = 1, ['E'] = 1, ['F'] = 1,
+    ['G'] = 1, ['H'] = 1, ['I'] = 1, ['J'] = 1, ['K'] = 1,
+    ['L'] = 1, ['M'] = 1, ['N'] = 1, ['O'] = 1, ['P'] = 1,
+    ['Q'] = 1, ['R'] = 1, ['S'] = 1, ['T'] = 1, ['U'] = 1,
+    ['V'] = 1, ['W'] = 1, ['X'] = 1, ['Y'] = 1, ['Z'] = 1,
+};
+
+#define NUM_KEYWORDS (sizeof(Keywords) / sizeof(Keywords[0])) /* entry count of Keywords */
+KeywordDef Keywords[] = { /* reserved words; keep sorted in strcmp() order, convert_value() searches this with bsearch() */
+    { "else",    T_ELSE     },
+    { "false",   T_BOOL     }, /* both boolean literals map to T_BOOL */
+    { "fun",     T_FUN      },
+    { "if",      T_IF       },
+    { "let",     T_LET      },
+    { "provide", T_PROVIDES },
+    { "require", T_REQUIRES },
+    { "return",  T_RETURN   },
+    { "struct",  T_STRUCT   },
+    { "true",    T_BOOL     },
+    { "type",    T_TYPE     },
+    { "union",   T_UNION    },
+    { "var",     T_VAR      },
+};
+
+static int keywcmp(const void* a, const void* b) { /* bsearch() comparator: orders KeywordDefs by keyword spelling */
+    return strcmp(((const KeywordDef*)a)->keyword, ((const KeywordDef*)b)->keyword);
+}
+
+/* Read all of `path` into a NUL-terminated heap buffer; NULL if it cannot be opened/stat'ed or is empty. */
+static inline char* file_load(char* path) {
+    struct stat sb = {0};
+    ssize_t nread = 0, length = 0;
+    char* contents = NULL;
+    int fd = open(path, O_RDONLY, 0);
+    if ((fd >= 0) && (fstat(fd, &sb) >= 0) && (sb.st_size > 0) && (contents = calloc(sb.st_size + 1u, 1u)))
+        while (sb.st_size && (nread = read(fd, contents+length, sb.st_size)) > 0)
+            length += nread, sb.st_size -= nread;
+    if (fd >= 0) close(fd); /* descriptor 0 is valid and must be closed too */
+    return contents;
+}
+
+/* Finalize a scanned token: unquote strings, convert integers, turn reserved identifiers into keywords. */
+static inline void convert_value(Tok* tok) {
+    switch (tok->type) {
+        case T_STRING: {
+            /* text is "...": keep only what lies between the two quotes */
+            size_t len = strlen(tok->text+1);
+            char* strtext = malloc(len);
+            memcpy(strtext, tok->text+1, len);
+            strtext[len-1] = '\0';
+            free(tok->text), tok->text = strtext;
+            break;
+        }
+
+        case T_INT: {
+            tok->value.integer = strtoll(tok->text, NULL, 0); /* the field is a long long */
+            break;
+        }
+
+        case T_ID: {
+            KeywordDef key = { .keyword = tok->text };
+            KeywordDef* match = bsearch(
+                &key, Keywords, NUM_KEYWORDS, sizeof(KeywordDef), keywcmp);
+            if (match) tok->type = match->type;
+            /* "true"/"false" only become T_BOOL here, after the switch has
+             * already dispatched on T_ID, so their value has to be set now */
+            if (tok->type == T_BOOL) tok->value.integer = (tok->text[0] == 't');
+            break;
+        }
+
+        default:
+            break;
+    }
+}
+
+/* Scan one lexeme at ctx->file->fpos into ctx->tok and advance fpos; whitespace/comments leave tok->type untouched. */
+static inline void readtok(Parser* ctx) {
+    Tok* tok = &(ctx->tok);
+    char *beg = ctx->file->fpos, *curr = ctx->file->fpos;
+    tok->offset = (beg - ctx->file->fbeg);
+    switch (FirstChar[(unsigned char)*curr++]) { /* unsigned index: plain char may be negative */
+        case 1: /* skip whitespace */
+            for (; SPACE[(unsigned char)*curr]; curr++);
+            break;
+
+        case 2: /* skip comments; also stop at the NUL when the last line has no newline */
+            for (; *curr && *curr != '\n'; curr++);
+            break;
+
+        case 3: /* +/- as ops or number signs */
+            tok->type = *(curr-1);
+            if (!DIGIT[(unsigned char)*curr]) break;
+            /* fallthrough to number parsing */
+
+        case 4:
+            tok->type = T_INT;
+            for (; DIGIT[(unsigned char)*curr]; curr++);
+            break;
+
+        case 5:
+            tok->type = T_ID;
+            for (; ALNUM_[(unsigned char)*curr]; curr++);
+            break;
+
+        case 6: /* single char tokens */
+            tok->type = *(curr-1);
+            break;
+
+        case 7: /* string parsing */
+            tok->type = T_STRING;
+            for (; *curr && *curr != '"'; curr++);
+            if (!*curr) fprintf(stderr, "Unterminated string literal\n"), exit(1);
+            curr++;
+            break;
+
+        default: /* class 0: a byte that cannot start any token */
+            fprintf(stderr, "Failed to parse token '%c'\n", *(curr-1));
+            exit(1);
+    }
+
+    if (tok->type) {
+        size_t sz = (curr - beg);
+        tok->text = malloc(sz+1);
+        memcpy(tok->text, beg, sz), tok->text[sz] = '\0';
+        convert_value(tok);
+    }
+
+    ctx->file->fpos = curr;
+}
+
+/* Load `path` and push it onto the front of ctx's list of files still to be lexed. */
+void lexfile(Parser* ctx, char* path) {
+    LexFile* file = calloc(1u, sizeof(*file)); /* size of the struct, not of the pointer */
+    file->path = strdup(path);
+    file->fbeg = file->fpos = file_load(path);
+    file->next = ctx->file, ctx->file = file;
+}
+
+/* Fetch the next token into ctx->tok; exhausted files move to ctx->done, T_END_FILE means no input is left. */
+void lex(Parser* ctx) {
+    ctx->tok.type = T_NONE;
+    while (ctx->tok.type == T_NONE) {
+        LexFile* f = ctx->file;
+        if (!f) {
+            /* no more files left to process; T_END_FILE also ends the loop */
+            ctx->tok.type = T_END_FILE;
+        } else if (!f->fpos || !*(f->fpos)) {
+            /* exhausted, or never loaded (fpos is NULL): grab the next file to process */
+            ctx->file = f->next;
+            f->next = ctx->done;
+            ctx->done = f;
+        } else {
+            /* parse out a token */
+            readtok(ctx);
+        }
+    }
+}
diff --git a/source/lexer.l b/source/lexer.l
index 994adcd..4958cb4 100644
--- a/source/lexer.l
+++ b/source/lexer.l
@@ -1,7 +1,5 @@
 %{
 #include <sclpl.h>
-#include <fcntl.h>
-#include <sys/stat.h>
 
 static union {
     char* text;
@@ -126,27 +124,6 @@ false {
 
 %%
 
-static char* file_load(char* path) {
-    int fd = -1, nread = 0, length = 0;
-    struct stat sb = {0};
-    char* contents = NULL;
-    if (((fd = open(path, O_RDONLY, 0)) >= 0) && (fstat(fd, &sb) >= 0) && (sb.st_size > 0)) {
-        contents = calloc(sb.st_size + 1u, 1u);
-        while (sb.st_size && (nread = read(fd, contents+length, sb.st_size)) > 0)
-            length += nread, sb.st_size -= nread;
-    }
-    if (fd > 0) close(fd);
-    return contents;
-}
-
-void lexfile(Parser* ctx, char* path) {
-    LexFile* file = calloc(sizeof(file), 1u);
-    file->path = strdup(path);
-    file->fbeg = file->fpos = file_load(path);
-    file->next = ctx->file;
-    ctx->file = file;
-}
-
 void gettoken(Parser* ctx) {
     ctx->tok.line = yylineno;
     ctx->tok.type = yylex();
diff --git a/source/main.c b/source/main.c
index 5bb8c93..beb1f94 100644
--- a/source/main.c
+++ b/source/main.c
@@ -1,14 +1,16 @@
 #include <sclpl.h>
 
 char* ARGV0;
-char* Artifact = "bin";
+char* Artifact = "tok";
 
 /* Driver Modes
  *****************************************************************************/
 static int emit_tokens(int argc, char **argv) {
     Parser ctx = {0};
+    for (; argc; argc--,argv++)
+        lexfile(&ctx, *argv);
     while (1) {
-        gettoken(&ctx);
+        lex(&ctx);
         if (ctx.tok.type == T_END_FILE)
             break;
         else
diff --git a/source/pprint.c b/source/pprint.c
index 7eb8bba..f39eed0 100644
--- a/source/pprint.c
+++ b/source/pprint.c
@@ -8,7 +8,7 @@ static void print_indent(FILE* file, int depth) {
 static const char* token_type_to_string(int type) {
     #define TOK(name) case (name): return #name
     switch(type) {
-        TOK(T_NONE); TOK(T_ERROR); TOK(T_END_FILE); TOK(T_PACKAGE);
+        TOK(T_NONE); TOK(T_ERROR); TOK(T_END_FILE);
         TOK(T_REQUIRES); TOK(T_PROVIDES); TOK(T_LET); TOK(T_VAR);
         TOK(T_FUN); TOK(T_TYPE); TOK(T_STRUCT); TOK(T_UNION);
         TOK(T_RETURN); TOK(T_IF); TOK(T_ELSE); TOK(T_ID);
@@ -59,15 +59,15 @@ void pprint_token_value(FILE* file, Tok* token) {
     #define TOK(name) case (name): fprintf(file, "%s", #name); break
     switch(token->type) {
         /* value tokens */
-        case T_STRING: fprintf(file, "\"%s\"", token->value.text);                 break;
-        case T_ID:     fprintf(file, "%s", token->value.text);                     break;
-        case T_CHAR:   print_char(file, token->value.integer);                   break;
-        case T_INT:    fprintf(file, "%lld", token->value.integer);                 break;
+        case T_STRING: fprintf(file, "\"%s\"", token->text);                       break;
+        case T_ID:     fprintf(file, "%s", token->text);                           break;
+        case T_CHAR:   print_char(file, token->value.integer);                     break;
+        case T_INT:    fprintf(file, "%lld", token->value.integer);                break;
         case T_FLOAT:  fprintf(file, "%f", token->value.floating);                 break;
         case T_BOOL:   fprintf(file, "%s", (token->value.integer)?"true":"false"); break;
 
         /* keyword tokens */
-        TOK(T_NONE); TOK(T_ERROR); TOK(T_END_FILE); TOK(T_PACKAGE);
+        TOK(T_NONE); TOK(T_ERROR); TOK(T_END_FILE);
         TOK(T_REQUIRES); TOK(T_PROVIDES); TOK(T_LET); TOK(T_VAR);
         TOK(T_FUN); TOK(T_TYPE); TOK(T_STRUCT); TOK(T_UNION); TOK(T_RETURN);
         TOK(T_IF); TOK(T_ELSE);
diff --git a/source/sclpl.h b/source/sclpl.h
index d70e568..156324c 100644
--- a/source/sclpl.h
+++ b/source/sclpl.h
@@ -23,10 +23,12 @@ static void* emalloc(size_t size) {
 /* Token Types
  *****************************************************************************/
 typedef enum {
-    T_NONE = 0, T_ERROR = 256, T_END_FILE,
-    T_PACKAGE, T_REQUIRES, T_PROVIDES, T_LET, T_VAR, T_FUN, T_TYPE, T_STRUCT,
+    T_NONE = 0,
+    T_STRING = 256, T_ID, T_INT, T_BOOL, T_CHAR, T_FLOAT,
+    T_REQUIRES, T_PROVIDES, T_LET, T_VAR, T_FUN, T_TYPE, T_STRUCT,
     T_UNION, T_RETURN, T_IF, T_ELSE,
-    T_ID, T_CHAR, T_INT, T_FLOAT, T_BOOL, T_STRING,
+    T_ERROR = -2,
+    T_END_FILE = -1
 } TokType;
 
 typedef struct {
@@ -34,7 +36,9 @@ typedef struct {
     size_t line;
     size_t col;
     TokType type;
-    union {
+    char* text;
+    long offset;
+  union {
         char* text;
         long long integer;
         double floating;
@@ -195,6 +199,7 @@ typedef struct LexFile {
 } LexFile;
 
 typedef struct {
+    LexFile* done;
     LexFile* file;
     Tok tok;
     SymTable syms;
-- 
2.52.0