From: Michael D. Lowis <mike.lowis@gentex.com>
Date: Fri, 16 Apr 2021 15:49:09 +0000 (-0400)
Subject: optimized lexer table
X-Git-Url: https://git.mdlowis.com/?a=commitdiff_plain;h=2c88efc6e0ac8b72ac40d60fead48928ea6ae4d0;p=proto%2Fobnc.git

optimized lexer table
---

diff --git a/cerise/cerise.h b/cerise/cerise.h
index f4196bb..62fc8fd 100644
--- a/cerise/cerise.h
+++ b/cerise/cerise.h
@@ -75,33 +75,6 @@ typedef struct {
     } value;
 } Tok;
 
-/* Datatype Types
- *****************************************************************************/
-//typedef enum {
-//    VOID, INT, UINT, FLOAT, ARRAY, REF, PTR, FUNC
-//} Kind;
-//
-//typedef struct Type {
-//    Kind kind;
-//    union {
-//        struct Type* type;
-//        size_t bits;
-//        struct {
-//            struct Type* type;
-//            size_t count;
-//        } array;
-//    } value;
-//} Type;
-//
-//Type* VoidType(void);
-//Type* IntType(size_t nbits);
-//Type* UIntType(size_t nbits);
-//Type* FloatType(size_t nbits);
-//Type* ArrayOf(Type* type, size_t count);
-//Type* RefTo(Type* type);
-//Type* PtrTo(Type* type);
-//bool types_equal(Type* type1, Type* type2);
-
 /* Lexer and Parser Types
  *****************************************************************************/
 typedef struct LexFile {
diff --git a/cerise/lex.c b/cerise/lex.c
index 291d415..019ac72 100644
--- a/cerise/lex.c
+++ b/cerise/lex.c
@@ -8,65 +8,55 @@ typedef struct {
     int type;
 } KeywordDef;
 
-static const char FirstChar[256] = {
+enum { /* character classes stored in Chars[]; unlisted bytes are 0, readtok's error case */
+    WSPACE   = 1, /* ' ', '\t', '\r', '\n': skipped as a run */
+    COMMENT  = 2, /* '#': skipped up to the next '\n' */
+    STRINGS  = 3, /* '"': STRING token, scanned to the closing '"' */
+    DBL_OP   = 4, /* '=' and '.': doubled form yields EQ / DOTDOT */
+    MULTI_OP = 5, /* '<', '>', '!': may be followed by '=' */
+    PUNCT    = 6, /* single-char tokens; token type is the character itself */
+    DIGITS   = 7, /* '0'-'9': INT token */
+    ALPHA_   = 8, /* letters and '_': IDENT token; IDENT scan tests >= DIGITS, so keep DIGITS and ALPHA_ as the two highest values */
+};
+
+static const char Chars[256] = {
     /* Whitespace */
     [' '] = 1, ['\t'] = 1, ['\r'] = 1, ['\n'] = 1,
+
     /* comment start */
     ['#'] = 2,
-    /* number or op */
-//    ['+'] = 3, ['-'] = 3,
-    /* number digits */
-    ['0'] = 4, ['1'] = 4, ['2'] = 4, ['3'] = 4, ['4'] = 4,
-    ['5'] = 4, ['6'] = 4, ['7'] = 4, ['8'] = 4, ['9'] = 4,
-    /* alpha characters */
-    ['A'] = 5, ['B'] = 5, ['C'] = 5, ['D'] = 5, ['E'] = 5,
-    ['F'] = 5, ['G'] = 5, ['H'] = 5, ['I'] = 5, ['J'] = 5,
-    ['K'] = 5, ['L'] = 5, ['M'] = 5, ['N'] = 5, ['O'] = 5,
-    ['P'] = 5, ['Q'] = 5, ['R'] = 5, ['S'] = 5, ['T'] = 5,
-    ['U'] = 5, ['V'] = 5, ['W'] = 5, ['X'] = 5, ['Y'] = 5,
-    ['Z'] = 5, ['a'] = 5, ['b'] = 5, ['c'] = 5, ['d'] = 5,
-    ['e'] = 5, ['f'] = 5, ['g'] = 5, ['h'] = 5, ['i'] = 5,
-    ['j'] = 5, ['k'] = 5, ['l'] = 5, ['m'] = 5, ['n'] = 5,
-    ['o'] = 5, ['p'] = 5, ['q'] = 5, ['r'] = 5, ['s'] = 5,
-    ['t'] = 5, ['u'] = 5, ['v'] = 5, ['w'] = 5, ['x'] = 5,
-    ['y'] = 5, ['z'] = 5,
-    /* punctuation */
-    ['('] = 6, [')'] = 6, ['['] = 6, [']'] = 6, ['{'] = 6, ['}'] = 6,
-    ['.'] = 6, [','] = 6, [':'] = 6, ['='] = 6, [';'] = 6, ['^'] = 6,
-    ['+'] = 6, ['-'] = 6, ['*'] = 6, ['/'] = 6, ['<'] = 6, ['>'] = 6, 
-    ['|'] = 6, 
-    ['!'] = 6, 
+
     /* strings */
-    ['"'] = 7
-};
+    ['"'] = 3,
 
-static const char HasSecondChar[256] = {
-    ['<'] = 1, ['>'] = 1, ['!'] = 1
-};
+    /* double character ops */
+    ['='] = 4, ['.'] = 4,
 
-char SPACE[256] = {
-    [' '] = 1, ['\t'] = 1, ['\r'] = 1, ['\n'] = 1,
-};
+    /* potential multi-character ops */
+    ['<'] = 5, ['>'] = 5, ['!'] = 5,
 
-char DIGIT[256] = {
-    ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
-    ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
-};
+    /* punctuation and single-char ops */
+    ['('] = 6, [')'] = 6, ['['] = 6, [']'] = 6, ['{'] = 6,
+    ['}'] = 6, [','] = 6, [':'] = 6, [';'] = 6,
+    ['^'] = 6, ['+'] = 6, ['-'] = 6, ['*'] = 6, ['/'] = 6,
+    ['|'] = 6,
 
-char ALNUM_[256] = {
-    ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
-    ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
-    ['A'] = 1, ['B'] = 1, ['C'] = 1, ['D'] = 1, ['E'] = 1,
-    ['F'] = 1, ['G'] = 1, ['H'] = 1, ['I'] = 1, ['J'] = 1,
-    ['K'] = 1, ['L'] = 1, ['M'] = 1, ['N'] = 1, ['O'] = 1,
-    ['P'] = 1, ['Q'] = 1, ['R'] = 1, ['S'] = 1, ['T'] = 1,
-    ['U'] = 1, ['V'] = 1, ['W'] = 1, ['X'] = 1, ['Y'] = 1,
-    ['Z'] = 1, ['a'] = 1, ['b'] = 1, ['c'] = 1, ['d'] = 1,
-    ['e'] = 1, ['f'] = 1, ['g'] = 1, ['h'] = 1, ['i'] = 1,
-    ['j'] = 1, ['k'] = 1, ['l'] = 1, ['m'] = 1, ['n'] = 1,
-    ['o'] = 1, ['p'] = 1, ['q'] = 1, ['r'] = 1, ['s'] = 1,
-    ['t'] = 1, ['u'] = 1, ['v'] = 1, ['w'] = 1, ['x'] = 1,
-    ['y'] = 1, ['z'] = 1, ['_'] = 1,
+    /* number digits */
+    ['0'] = 7, ['1'] = 7, ['2'] = 7, ['3'] = 7, ['4'] = 7,
+    ['5'] = 7, ['6'] = 7, ['7'] = 7, ['8'] = 7, ['9'] = 7,
+
+    /* alpha and underscore characters */
+    ['A'] = 8, ['B'] = 8, ['C'] = 8, ['D'] = 8, ['E'] = 8,
+    ['F'] = 8, ['G'] = 8, ['H'] = 8, ['I'] = 8, ['J'] = 8,
+    ['K'] = 8, ['L'] = 8, ['M'] = 8, ['N'] = 8, ['O'] = 8,
+    ['P'] = 8, ['Q'] = 8, ['R'] = 8, ['S'] = 8, ['T'] = 8,
+    ['U'] = 8, ['V'] = 8, ['W'] = 8, ['X'] = 8, ['Y'] = 8,
+    ['Z'] = 8, ['a'] = 8, ['b'] = 8, ['c'] = 8, ['d'] = 8,
+    ['e'] = 8, ['f'] = 8, ['g'] = 8, ['h'] = 8, ['i'] = 8,
+    ['j'] = 8, ['k'] = 8, ['l'] = 8, ['m'] = 8, ['n'] = 8,
+    ['o'] = 8, ['p'] = 8, ['q'] = 8, ['r'] = 8, ['s'] = 8,
+    ['t'] = 8, ['u'] = 8, ['v'] = 8, ['w'] = 8, ['x'] = 8,
+    ['y'] = 8, ['z'] = 8, ['_'] = 8,
 };
 
 #define NUM_KEYWORDS (sizeof(Keywords) / sizeof(Keywords[0]))
@@ -104,7 +94,7 @@ KeywordDef Keywords[] = {
     { "type",      TYPE      },
     { "until",     UNTIL     },
     { "var",       VAR       },
-    { "while",     WHILE     }, 
+    { "while",     WHILE     },
 };
 
 static int keywcmp(const void* a, const void* b) {
@@ -167,35 +157,40 @@ static inline void readtok(Parser* ctx) {
     char *beg = ctx->file->fpos;
     char *curr = ctx->file->fpos;
     tok->offset = (beg - ctx->file->fbeg);
-    switch (FirstChar[(int)*curr++]) {
-        case 1: /* skip whitespace */
-            for (; SPACE[(int)*curr]; curr++);
+    switch (Chars[(int)*curr++]) {
+        case WSPACE: /* skip whitespace */
+            for (; Chars[(int)*curr] == WSPACE; curr++);
             break;
 
-        case 2: /* skip comments */
+        case COMMENT: /* skip comments */
             for (; *curr != '\n'; curr++);
             break;
 
-        case 3: /* +/- as ops or number signs */
-            tok->type = *(curr-1);
-            if (!DIGIT[(int)*curr]) break;
-            /* parse it as an int */
-            tok->type = INT;
-            for (; DIGIT[(int)*curr]; curr++);
-            break;
-
-        case 4:
-            tok->type = INT;
-            for (; DIGIT[(int)*curr]; curr++);
+        case STRINGS:
+            tok->type = STRING;
+            for (; *curr != '"'; curr++);
+            curr++;
             break;
 
-        case 5:
-            tok->type = IDENT;
-            for (; ALNUM_[(int)*curr]; curr++);
+        case DBL_OP:
+            if (*(curr-1) == *(curr))
+            {
+                curr++;
+                switch (*(curr-1))
+                {
+                    case '=': tok->type = EQ;     break;
+                    case '.': tok->type = DOTDOT; break;
+                    default:  goto error;         break;
+                }
+            }
+            else
+            {
+                tok->type = *(curr-1);
+            }
             break;
 
-        case 6: /* single/double char tokens */
-            if (HasSecondChar[(int)*(curr-1)] && *(curr) == '=')
+        case MULTI_OP:
+            if ('=' == *(curr))
             {
                 curr++;
                 switch (*(curr-2))
@@ -206,25 +201,24 @@ static inline void readtok(Parser* ctx) {
                     default:  goto error;       break;
                 }
             }
-            else if (*(curr-1) == '.' || *(curr-1) == '=')
-            {
-                tok->type = *(curr-1);
-                if (*(curr) == tok->type)
-                {
-                    curr++;
-                    tok->type = (tok->type == '.' ? DOTDOT : EQ);
-                }
-            }
             else
             {
                 tok->type = *(curr-1);
             }
             break;
 
-        case 7: /* string parsing */
-            tok->type = STRING;
-            for (; *curr != '"'; curr++);
-            curr++;
+        case PUNCT:
+            tok->type = *(curr-1);
+            break;
+
+        case DIGITS:
+            tok->type = INT;
+            for (; Chars[(int)*curr] == DIGITS; curr++);
+            break;
+
+        case ALPHA_:
+            tok->type = IDENT;
+            for (; Chars[(int)*curr] >= DIGITS; curr++);
             break;
 
         case 0: /* error handling */
@@ -368,8 +362,11 @@ TEST_SUITE(Lexer)
         { "type",      TYPE      },
         { "until",     UNTIL     },
         { "var",       VAR       },
-        { "while",     WHILE     }, 
-        { "",          END_FILE  }, 
+        { "while",     WHILE     },
+        { "foo",       IDENT     },
+        { "123",       INT       },
+        { "",          STRING    },
+        { "",          END_FILE  },
     };
 
 
@@ -380,7 +377,7 @@ TEST_SUITE(Lexer)
         for (size_t i = 0; i < sizeof(Tokens)/sizeof(Tokens[0]); i++)
         {
             lex(&ctx);
-            //printf("(%d, '%s') != (%d, '%s')\n", 
+            //printf("(%d, '%s') != (%d, '%s')\n",
             //    ctx.tok.type, ctx.tok.text, Tokens[i].type, Tokens[i].text);
             CHECK(ctx.tok.type == Tokens[i].type);
             CHECK(ctx.tok.text != NULL);
diff --git a/cerise/tests/tokens.txt b/cerise/tests/tokens.txt
index f4301f3..4a967ba 100644
--- a/cerise/tests/tokens.txt
+++ b/cerise/tests/tokens.txt
@@ -56,3 +56,6 @@ type
 until
 var
 while
+foo
+123
+""