added custom handcoded lexer

author Michael D. Lowis <mike@mdlowis.com>

Sun, 17 Mar 2019 01:14:55 +0000 (21:14 -0400)

committer Michael D. Lowis <mike@mdlowis.com>

Sun, 17 Mar 2019 01:14:55 +0000 (21:14 -0400)
author Michael D. Lowis <mike@mdlowis.com>
Sun, 17 Mar 2019 01:14:55 +0000 (21:14 -0400)
committer Michael D. Lowis <mike@mdlowis.com>
Sun, 17 Mar 2019 01:14:55 +0000 (21:14 -0400)
diff --git a/Makefile b/Makefile

index efe80dd16d1e230ea71a65883e308d04c06cbd83..63881f1ac6898481499d92bd28f9788a1d345382 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -23,6 +23,7 @@ OBJS = source/main.o    \
         source/pprint.o  \
         source/parser.o  \
         source/lexer.o   \
+       source/lex.o   \
         source/ast.o     \
         source/types.o   \
         source/syms.o    \
diff --git a/source/lex.c b/source/lex.c

new file mode 100644 (file)

index 0000000..b3ccd11
--- /dev/null
+++ b/source/lex.c
@@ -0,0 +1,215 @@
+#include <sclpl.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+/* Pairs the spelling of a reserved word with the token type it lexes to. */
+typedef struct {
+    char* keyword; /* spelling of the reserved word */
+    int type;      /* token type assigned when an identifier matches it */
+} KeywordDef;
+
+/* Classifies the first byte of a lexeme; readtok() switches on the class:
+ *   0 = unclassified (readtok reports an error and exits)
+ *   1 = whitespace, 2 = comment start, 3 = sign or operator,
+ *   4 = decimal digit, 5 = letter, 6 = single-character token,
+ *   7 = string delimiter.
+ * Bytes without an explicit initializer default to class 0. Note that '_'
+ * has no entry here, so it may continue an identifier (see ALNUM_) but
+ * cannot start one. */
+static const char FirstChar[256] = {
+    /* Whitespace */
+    [' '] = 1, ['\t'] = 1, ['\r'] = 1, ['\n'] = 1,
+    /* comment start */
+    ['#'] = 2,
+    /* number or op */
+    ['+'] = 3, ['-'] = 3,
+    /* number digits */
+    ['0'] = 4, ['1'] = 4, ['2'] = 4, ['3'] = 4, ['4'] = 4,
+    ['5'] = 4, ['6'] = 4, ['7'] = 4, ['8'] = 4, ['9'] = 4,
+    /* alpha characters */
+    ['A'] = 5, ['B'] = 5, ['C'] = 5, ['D'] = 5, ['E'] = 5,
+    ['F'] = 5, ['G'] = 5, ['H'] = 5, ['I'] = 5, ['J'] = 5,
+    ['K'] = 5, ['L'] = 5, ['M'] = 5, ['N'] = 5, ['O'] = 5,
+    ['P'] = 5, ['Q'] = 5, ['R'] = 5, ['S'] = 5, ['T'] = 5,
+    ['U'] = 5, ['V'] = 5, ['W'] = 5, ['X'] = 5, ['Y'] = 5,
+    ['Z'] = 5, ['a'] = 5, ['b'] = 5, ['c'] = 5, ['d'] = 5,
+    ['e'] = 5, ['f'] = 5, ['g'] = 5, ['h'] = 5, ['i'] = 5,
+    ['j'] = 5, ['k'] = 5, ['l'] = 5, ['m'] = 5, ['n'] = 5,
+    ['o'] = 5, ['p'] = 5, ['q'] = 5, ['r'] = 5, ['s'] = 5,
+    ['t'] = 5, ['u'] = 5, ['v'] = 5, ['w'] = 5, ['x'] = 5,
+    ['y'] = 5, ['z'] = 5,
+    /* punctuation */
+    ['('] = 6, [')'] = 6, ['['] = 6, [']'] = 6, ['{'] = 6, ['}'] = 6,
+    ['.'] = 6, [','] = 6, [':'] = 6, ['&'] = 6, ['='] = 6, [';'] = 6,
+    ['*'] = 6, ['\''] = 6,
+    /* strings */
+    ['"'] = 7
+};
+
+/* Membership table: non-zero for the whitespace bytes (space, tab, CR, LF)
+ * that readtok() skips between tokens.
+ * NOTE(review): has external linkage and is writable; only reads are
+ * visible in this file -- confirm no other unit needs it before making it
+ * static const. */
+char SPACE[256] = {
+    [' '] = 1, ['\t'] = 1, ['\r'] = 1, ['\n'] = 1,
+};
+
+/* Membership table: non-zero for the ASCII decimal digits '0'..'9'.
+ * readtok() uses it to extend an integer literal and to decide whether a
+ * leading '+' or '-' begins a number. */
+char DIGIT[256] = {
+    ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
+    ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
+};
+
+/* Membership table: non-zero for ASCII letters, decimal digits and '_'.
+ * readtok() uses it to extend an identifier after its first character. */
+char ALNUM_[256] = {
+    ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
+    ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
+    ['A'] = 1, ['B'] = 1, ['C'] = 1, ['D'] = 1, ['E'] = 1,
+    ['F'] = 1, ['G'] = 1, ['H'] = 1, ['I'] = 1, ['J'] = 1,
+    ['K'] = 1, ['L'] = 1, ['M'] = 1, ['N'] = 1, ['O'] = 1,
+    ['P'] = 1, ['Q'] = 1, ['R'] = 1, ['S'] = 1, ['T'] = 1,
+    ['U'] = 1, ['V'] = 1, ['W'] = 1, ['X'] = 1, ['Y'] = 1,
+    ['Z'] = 1, ['a'] = 1, ['b'] = 1, ['c'] = 1, ['d'] = 1,
+    ['e'] = 1, ['f'] = 1, ['g'] = 1, ['h'] = 1, ['i'] = 1,
+    ['j'] = 1, ['k'] = 1, ['l'] = 1, ['m'] = 1, ['n'] = 1,
+    ['o'] = 1, ['p'] = 1, ['q'] = 1, ['r'] = 1, ['s'] = 1,
+    ['t'] = 1, ['u'] = 1, ['v'] = 1, ['w'] = 1, ['x'] = 1,
+    ['y'] = 1, ['z'] = 1, ['_'] = 1,
+};
+
+/* Number of entries in Keywords[]. */
+#define NUM_KEYWORDS (sizeof(Keywords) / sizeof(Keywords[0]))
+/* Reserved words and the token type each one lexes to. convert_value()
+ * searches this table with bsearch() and keywcmp(), so the entries must
+ * stay sorted in strcmp() order of their spelling. "true" and "false" both
+ * map to T_BOOL. */
+KeywordDef Keywords[] = {
+    { "else",    T_ELSE     },
+    { "false",   T_BOOL     },
+    { "fun",     T_FUN      },
+    { "if",      T_IF       },
+    { "let",     T_LET      },
+    { "provide", T_PROVIDES },
+    { "require", T_REQUIRES },
+    { "return",  T_RETURN   },
+    { "struct",  T_STRUCT   },
+    { "true",    T_BOOL     },
+    { "type",    T_TYPE     },
+    { "union",   T_UNION    },
+    { "var",     T_VAR      },
+};
+
+/* bsearch() comparator: orders two KeywordDef records by the strcmp()
+ * ordering of their keyword spellings. */
+static int keywcmp(const void* a, const void* b) {
+    const KeywordDef* lhs = a;
+    const KeywordDef* rhs = b;
+    return strcmp(lhs->keyword, rhs->keyword);
+}
+
+/* Reads the whole file at 'path' into a freshly allocated buffer that is
+ * one byte larger than the file and zero-filled, so the contents are always
+ * NUL-terminated. Returns NULL when the file cannot be opened or stat'ed,
+ * is empty, or the buffer cannot be allocated; otherwise the caller owns
+ * the returned buffer. A failed or short read is not reported: whatever was
+ * not read simply stays zero. */
+static inline char* file_load(char* path) {
+    struct stat sb = {0};
+    char* contents = NULL;
+    int fd = open(path, O_RDONLY, 0);
+    if ((fd >= 0) && (fstat(fd, &sb) >= 0) && (sb.st_size > 0)) {
+        size_t remaining = (size_t)sb.st_size, length = 0;
+        contents = calloc(remaining + 1u, 1u);
+        /* read() may return fewer bytes than requested, so keep going until
+         * the file is consumed, or stop on EOF/error */
+        while (contents && remaining > 0) {
+            ssize_t nread = read(fd, contents + length, remaining);
+            if (nread <= 0) break;
+            length += (size_t)nread;
+            remaining -= (size_t)nread;
+        }
+    }
+    /* 0 is a valid descriptor; only a failed open() (-1) must be skipped */
+    if (fd >= 0) close(fd);
+    return contents;
+}
+
+/* Post-processes a freshly scanned token in place, based on tok->type:
+ *   T_STRING - replaces tok->text with a copy that has the surrounding
+ *              double quotes removed
+ *   T_INT    - parses tok->text into tok->value.integer (base auto-detected
+ *              from the prefix, as with strtoll base 0)
+ *   T_ID     - if the text is a reserved word, retypes the token to the
+ *              keyword's type; for "true"/"false" also stores the value
+ *   T_BOOL   - stores 1 in tok->value.integer when the text starts with
+ *              't', otherwise 0
+ * Any other type is left untouched. Exits the process if memory for the
+ * unquoted string cannot be allocated. */
+static inline void convert_value(Tok* tok) {
+    switch (tok->type) {
+        case T_STRING: {
+            /* work from the full lexeme so that a lone or unterminated
+             * quote cannot index before the start of the new buffer */
+            size_t len = strlen(tok->text);
+            size_t skip = (len > 0 && tok->text[0] == '"') ? 1 : 0;
+            size_t body = len - skip;
+            if (body > 0 && tok->text[len-1] == '"') body--;
+            char* strtext = malloc(body + 1);
+            if (!strtext) {
+                fprintf(stderr, "Failed to allocate string token\n");
+                exit(1);
+            }
+            memcpy(strtext, tok->text + skip, body);
+            strtext[body] = '\0';
+            free(tok->text), tok->text = strtext;
+            break;
+        }
+
+        case T_INT: {
+            /* value.integer is a long long, so use the matching parser */
+            tok->value.integer = strtoll(tok->text, NULL, 0);
+            break;
+        }
+
+        case T_ID: {
+            KeywordDef key = { .keyword = tok->text };
+            KeywordDef* match = bsearch(
+                &key, Keywords, NUM_KEYWORDS, sizeof(KeywordDef), keywcmp);
+            if (match) tok->type = match->type;
+            /* the switch has already dispatched on T_ID, so the T_BOOL case
+             * below never runs for "true"/"false"; set the value here */
+            if (tok->type == T_BOOL)
+                tok->value.integer = (tok->text[0] == 't');
+            break;
+        }
+
+        case T_BOOL: {
+            tok->value.integer = (tok->text[0] == 't');
+            break;
+        }
+
+        default:
+            break;
+    }
+}
+
+/* Scans one lexeme starting at ctx->file->fpos and advances fpos past it.
+ * The first byte selects the lexeme class through FirstChar[]:
+ *   - whitespace runs and '#' comments (up to, not including, the newline)
+ *     are consumed without touching tok->type, so the caller sees no token
+ *   - '+' / '-' followed by a digit, or a digit, produce T_INT
+ *   - a letter produces T_ID (possibly retyped by convert_value())
+ *   - single-character tokens use the character itself as the type
+ *   - '"' produces T_STRING, extending to the closing quote
+ * When a token type is set, tok->text receives a heap copy of the lexeme
+ * and convert_value() is applied. An unclassified first byte, an
+ * unterminated string, or an allocation failure prints a message to stderr
+ * and exits. The caller must ensure *fpos is not the NUL terminator. */
+static inline void readtok(Parser* ctx) {
+    Tok* tok = &(ctx->tok);
+    char *beg = ctx->file->fpos, *curr = ctx->file->fpos;
+    tok->offset = (beg - ctx->file->fbeg);
+    /* Table subscripts are cast to unsigned char: plain char may be signed,
+     * and a byte >= 0x80 would otherwise index before the table start. */
+    switch (FirstChar[(unsigned char)*curr++]) {
+        case 1: /* skip whitespace */
+            for (; SPACE[(unsigned char)*curr]; curr++);
+            break;
+
+        case 2: /* skip comments; also stop at the end of the buffer */
+            for (; *curr && *curr != '\n'; curr++);
+            break;
+
+        case 3: /* +/- as ops or number signs */
+            tok->type = *(curr-1);
+            if (!DIGIT[(unsigned char)*curr]) break;
+            /* fallthrough to number parsing */
+
+        case 4:
+            tok->type = T_INT;
+            for (; DIGIT[(unsigned char)*curr]; curr++);
+            break;
+
+        case 5:
+            tok->type = T_ID;
+            for (; ALNUM_[(unsigned char)*curr]; curr++);
+            break;
+
+        case 6: /* single char tokens */
+            tok->type = *(curr-1);
+            break;
+
+        case 7: /* string parsing */
+            tok->type = T_STRING;
+            for (; *curr && *curr != '"'; curr++);
+            if (!*curr) {
+                /* hit the end of the buffer before the closing quote */
+                fprintf(stderr, "Unterminated string literal\n");
+                exit(1);
+            }
+            curr++;
+            break;
+
+        case 0: /* error handling */
+        default:
+            fprintf(stderr, "Failed to parse token '%c'\n", *(curr-1));
+            exit(1);
+    }
+
+    if (tok->type) {
+        size_t sz = (curr - beg);
+        tok->text = malloc(sz+1);
+        if (!tok->text) {
+            fprintf(stderr, "Failed to allocate token text\n");
+            exit(1);
+        }
+        memcpy(tok->text, beg, sz);
+        tok->text[sz] = '\0';
+        convert_value(tok);
+    }
+
+    ctx->file->fpos = curr;
+}
+
+/* Queues the source file at 'path' for lexing by pushing a new LexFile
+ * onto the front of ctx->file; the most recently added file is therefore
+ * lexed first. The record keeps its own copy of the path and the loaded
+ * contents. fbeg/fpos are NULL when file_load() could not read the file or
+ * the file is empty. Exits the process if the record cannot be allocated. */
+void lexfile(Parser* ctx, char* path) {
+    /* sizeof(*file), not sizeof(file): the latter is only the size of the
+     * pointer and would under-allocate the record */
+    LexFile* file = calloc(1u, sizeof(*file));
+    if (!file) {
+        fprintf(stderr, "Failed to allocate lexer file record\n");
+        exit(1);
+    }
+    file->path = strdup(path);
+    file->fbeg = file->fpos = file_load(path);
+    file->next = ctx->file;
+    ctx->file = file;
+}
+
+/* Produces the next token in ctx->tok. Loops until readtok() yields a real
+ * token (whitespace and comments leave the type at T_NONE) or every queued
+ * file has been consumed, in which case the type is set to T_END_FILE.
+ * Files that are exhausted, or that have no contents at all, are moved
+ * from ctx->file onto the front of the ctx->done list. */
+void lex(Parser* ctx) {
+    ctx->tok.type = T_NONE;
+    while (ctx->tok.type == T_NONE) {
+        LexFile* f = ctx->file;
+        if (!f) {
+            /* no more files left to process */
+            ctx->tok.type = T_END_FILE;
+            return;
+        } else if (!f->fpos || !*(f->fpos)) {
+            /* fpos is NULL when the file could not be loaded or was empty;
+             * treat that like an exhausted file and grab the next one */
+            ctx->file = f->next;
+            f->next = ctx->done;
+            ctx->done = f;
+        } else {
+            /* parse out a token */
+            readtok(ctx);
+        }
+    }
+}
diff --git a/source/lexer.l b/source/lexer.l

index 994adcdf67e093459fb5089a35d6c860cff9cbe0..4958cb40fca892e0ee8715caf09196a8a863807b 100644 (file)
--- a/source/lexer.l
+++ b/source/lexer.l
@@ -1,7 +1,5 @@
  %{
  #include <sclpl.h>
-#include <fcntl.h>
-#include <sys/stat.h>
  
  static union {
      char* text;
@@ -126,27 +124,6 @@ false {
  
  %%
  
-static char* file_load(char* path) {
-    int fd = -1, nread = 0, length = 0;
-    struct stat sb = {0};
-    char* contents = NULL;
-    if (((fd = open(path, O_RDONLY, 0)) >= 0) && (fstat(fd, &sb) >= 0) && (sb.st_size > 0)) {
-        contents = calloc(sb.st_size + 1u, 1u);
-        while (sb.st_size && (nread = read(fd, contents+length, sb.st_size)) > 0)
-            length += nread, sb.st_size -= nread;
-    }
-    if (fd > 0) close(fd);
-    return contents;
-}
-
-void lexfile(Parser* ctx, char* path) {
-    LexFile* file = calloc(sizeof(file), 1u);
-    file->path = strdup(path);
-    file->fbeg = file->fpos = file_load(path);
-    file->next = ctx->file;
-    ctx->file = file;
-}
-
  void gettoken(Parser* ctx) {
      ctx->tok.line = yylineno;
      ctx->tok.type = yylex();
diff --git a/source/main.c b/source/main.c

index 5bb8c9388010d290855c18202675d912ba769284..beb1f9474a4602d644dfcc1635a3970b364fd78a 100644 (file)
--- a/source/main.c
+++ b/source/main.c
@@ -1,14 +1,16 @@
  #include <sclpl.h>
  
  char* ARGV0;
-char* Artifact = "bin";
+char* Artifact = "tok";
  
  /* Driver Modes
   *****************************************************************************/
  static int emit_tokens(int argc, char **argv) {
      Parser ctx = {0};
+    for (; argc; argc--,argv++)
+        lexfile(&ctx, *argv);
      while (1) {
-        gettoken(&ctx);
+        lex(&ctx);
          if (ctx.tok.type == T_END_FILE)
              break;
          else
diff --git a/source/pprint.c b/source/pprint.c

index 7eb8bbae3277213c2121f72cb1d1e53eff532826..f39eed0738b46a3eace691f549f1757a55e5fd52 100644 (file)
--- a/source/pprint.c
+++ b/source/pprint.c
@@ -8,7 +8,7 @@ static void print_indent(FILE* file, int depth) {
  static const char* token_type_to_string(int type) {
      #define TOK(name) case (name): return #name
      switch(type) {
-        TOK(T_NONE); TOK(T_ERROR); TOK(T_END_FILE); TOK(T_PACKAGE);
+        TOK(T_NONE); TOK(T_ERROR); TOK(T_END_FILE);
          TOK(T_REQUIRES); TOK(T_PROVIDES); TOK(T_LET); TOK(T_VAR);
          TOK(T_FUN); TOK(T_TYPE); TOK(T_STRUCT); TOK(T_UNION);
          TOK(T_RETURN); TOK(T_IF); TOK(T_ELSE); TOK(T_ID);
@@ -59,15 +59,15 @@ void pprint_token_value(FILE* file, Tok* token) {
      #define TOK(name) case (name): fprintf(file, "%s", #name); break
      switch(token->type) {
          /* value tokens */
-        case T_STRING: fprintf(file, "\"%s\"", token->value.text);                 break;
-        case T_ID:     fprintf(file, "%s", token->value.text);                     break;
-        case T_CHAR:   print_char(file, token->value.integer);                   break;
-        case T_INT:    fprintf(file, "%lld", token->value.integer);                 break;
+        case T_STRING: fprintf(file, "\"%s\"", token->text);                       break;
+        case T_ID:     fprintf(file, "%s", token->text);                           break;
+        case T_CHAR:   print_char(file, token->value.integer);                     break;
+        case T_INT:    fprintf(file, "%lld", token->value.integer);                break;
          case T_FLOAT:  fprintf(file, "%f", token->value.floating);                 break;
          case T_BOOL:   fprintf(file, "%s", (token->value.integer)?"true":"false"); break;
  
          /* keyword tokens */
-        TOK(T_NONE); TOK(T_ERROR); TOK(T_END_FILE); TOK(T_PACKAGE);
+        TOK(T_NONE); TOK(T_ERROR); TOK(T_END_FILE);
          TOK(T_REQUIRES); TOK(T_PROVIDES); TOK(T_LET); TOK(T_VAR);
          TOK(T_FUN); TOK(T_TYPE); TOK(T_STRUCT); TOK(T_UNION); TOK(T_RETURN);
          TOK(T_IF); TOK(T_ELSE);
diff --git a/source/sclpl.h b/source/sclpl.h

index d70e568af3f8d4e67eb2b14db1e84240c1c44ff4..156324c409459b8786d8d032ba83730646c00f74 100644 (file)
--- a/source/sclpl.h
+++ b/source/sclpl.h
@@ -23,10 +23,12 @@ static void* emalloc(size_t size) {
  /* Token Types
   *****************************************************************************/
  typedef enum {
-    T_NONE = 0, T_ERROR = 256, T_END_FILE,
-    T_PACKAGE, T_REQUIRES, T_PROVIDES, T_LET, T_VAR, T_FUN, T_TYPE, T_STRUCT,
+    T_NONE = 0,
+    T_STRING = 256, T_ID, T_INT, T_BOOL, T_CHAR, T_FLOAT,
+    T_REQUIRES, T_PROVIDES, T_LET, T_VAR, T_FUN, T_TYPE, T_STRUCT,
      T_UNION, T_RETURN, T_IF, T_ELSE,
-    T_ID, T_CHAR, T_INT, T_FLOAT, T_BOOL, T_STRING,
+    T_ERROR = -2,
+    T_END_FILE = -1
  } TokType;
  
  typedef struct {
@@ -34,7 +36,9 @@ typedef struct {
      size_t line;
      size_t col;
      TokType type;
-    union {
+    char* text;
+    long offset;
+  union {
          char* text;
          long long integer;
          double floating;
@@ -195,6 +199,7 @@ typedef struct LexFile {
  } LexFile;
  
  typedef struct {
+    LexFile* done;
      LexFile* file;
      Tok tok;
      SymTable syms;
author	Michael D. Lowis <mike@mdlowis.com>
	Sun, 17 Mar 2019 01:14:55 +0000 (21:14 -0400)
committer	Michael D. Lowis <mike@mdlowis.com>
	Sun, 17 Mar 2019 01:14:55 +0000 (21:14 -0400)
Makefile		patch \| blob \| history
source/lex.c	[new file with mode: 0644]	patch \| blob
source/lexer.l		patch \| blob \| history
source/main.c		patch \| blob \| history
source/pprint.c		patch \| blob \| history
source/sclpl.h		patch \| blob \| history