From: Michael D. Lowis Date: Fri, 16 Apr 2021 15:49:09 +0000 (-0400) Subject: optimized lexer table X-Git-Url: https://git.mdlowis.com/?a=commitdiff_plain;h=2c88efc6e0ac8b72ac40d60fead48928ea6ae4d0;p=proto%2Fobnc.git optimized lexer table --- diff --git a/cerise/cerise.h b/cerise/cerise.h index f4196bb..62fc8fd 100644 --- a/cerise/cerise.h +++ b/cerise/cerise.h @@ -75,33 +75,6 @@ typedef struct { } value; } Tok; -/* Datatype Types - *****************************************************************************/ -//typedef enum { -// VOID, INT, UINT, FLOAT, ARRAY, REF, PTR, FUNC -//} Kind; -// -//typedef struct Type { -// Kind kind; -// union { -// struct Type* type; -// size_t bits; -// struct { -// struct Type* type; -// size_t count; -// } array; -// } value; -//} Type; -// -//Type* VoidType(void); -//Type* IntType(size_t nbits); -//Type* UIntType(size_t nbits); -//Type* FloatType(size_t nbits); -//Type* ArrayOf(Type* type, size_t count); -//Type* RefTo(Type* type); -//Type* PtrTo(Type* type); -//bool types_equal(Type* type1, Type* type2); - /* Lexer and Parser Types *****************************************************************************/ typedef struct LexFile { diff --git a/cerise/lex.c b/cerise/lex.c index 291d415..019ac72 100644 --- a/cerise/lex.c +++ b/cerise/lex.c @@ -8,65 +8,55 @@ typedef struct { int type; } KeywordDef; -static const char FirstChar[256] = { +enum { + WSPACE = 1, + COMMENT = 2, + STRINGS = 3, + DBL_OP = 4, + MULTI_OP = 5, + PUNCT = 6, + DIGITS = 7, + ALPHA_ = 8, +}; + +static const char Chars[256] = { /* Whitespace */ [' '] = 1, ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, + /* comment start */ ['#'] = 2, - /* number or op */ -// ['+'] = 3, ['-'] = 3, - /* number digits */ - ['0'] = 4, ['1'] = 4, ['2'] = 4, ['3'] = 4, ['4'] = 4, - ['5'] = 4, ['6'] = 4, ['7'] = 4, ['8'] = 4, ['9'] = 4, - /* alpha characters */ - ['A'] = 5, ['B'] = 5, ['C'] = 5, ['D'] = 5, ['E'] = 5, - ['F'] = 5, ['G'] = 5, ['H'] = 5, ['I'] = 5, ['J'] = 5, - ['K'] = 5, ['L'] = 5, ['M'] = 5, ['N'] = 5, ['O'] = 5, - ['P'] = 5, ['Q'] = 5, ['R'] = 5, ['S'] = 5, ['T'] = 5, - ['U'] = 5, ['V'] = 5, ['W'] = 5, ['X'] = 5, ['Y'] = 5, - ['Z'] = 5, ['a'] = 5, ['b'] = 5, ['c'] = 5, ['d'] = 5, - ['e'] = 5, ['f'] = 5, ['g'] = 5, ['h'] = 5, ['i'] = 5, - ['j'] = 5, ['k'] = 5, ['l'] = 5, ['m'] = 5, ['n'] = 5, - ['o'] = 5, ['p'] = 5, ['q'] = 5, ['r'] = 5, ['s'] = 5, - ['t'] = 5, ['u'] = 5, ['v'] = 5, ['w'] = 5, ['x'] = 5, - ['y'] = 5, ['z'] = 5, - /* punctuation */ - ['('] = 6, [')'] = 6, ['['] = 6, [']'] = 6, ['{'] = 6, ['}'] = 6, - ['.'] = 6, [','] = 6, [':'] = 6, ['='] = 6, [';'] = 6, ['^'] = 6, - ['+'] = 6, ['-'] = 6, ['*'] = 6, ['/'] = 6, ['<'] = 6, ['>'] = 6, - ['|'] = 6, - ['!'] = 6, + /* strings */ - ['"'] = 7 -}; + ['"'] = 3, -static const char HasSecondChar[256] = { - ['<'] = 1, ['>'] = 1, ['!'] = 1 -}; + /* double character ops */ + ['='] = 4, ['.'] = 4, -char SPACE[256] = { - [' '] = 1, ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, -}; + /* potential multi-character ops */ + ['<'] = 5, ['>'] = 5, ['!'] = 5, -char DIGIT[256] = { - ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1, - ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1, -}; + /* punctuation and single-char ops */ + ['('] = 6, [')'] = 6, ['['] = 6, [']'] = 6, ['{'] = 6, + ['}'] = 6, [','] = 6, [':'] = 6, [';'] = 6, + ['^'] = 6, ['+'] = 6, ['-'] = 6, ['*'] = 6, ['/'] = 6, + ['|'] = 6, -char ALNUM_[256] = { - ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1, - ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1, - ['A'] = 1, ['B'] = 1, ['C'] = 1, ['D'] = 1, ['E'] = 1, - ['F'] = 1, ['G'] = 1, ['H'] = 1, ['I'] = 1, ['J'] = 1, - ['K'] = 1, ['L'] = 1, ['M'] = 1, ['N'] = 1, ['O'] = 1, - ['P'] = 1, ['Q'] = 1, ['R'] = 1, ['S'] = 1, ['T'] = 1, - ['U'] = 1, ['V'] = 1, ['W'] = 1, ['X'] = 1, ['Y'] = 1, - ['Z'] = 1, ['a'] = 1, ['b'] = 1, ['c'] = 1, ['d'] = 1, - ['e'] = 1, ['f'] = 1, ['g'] = 1, ['h'] = 1, ['i'] = 1, - ['j'] = 1, ['k'] = 1, ['l'] = 1, ['m'] = 1, ['n'] = 1, - ['o'] = 1, ['p'] = 1, ['q'] = 1, ['r'] = 1, ['s'] = 1, - ['t'] = 1, ['u'] = 1, ['v'] = 1, ['w'] = 1, ['x'] = 1, - ['y'] = 1, ['z'] = 1, ['_'] = 1, + /* number digits */ + ['0'] = 7, ['1'] = 7, ['2'] = 7, ['3'] = 7, ['4'] = 7, + ['5'] = 7, ['6'] = 7, ['7'] = 7, ['8'] = 7, ['9'] = 7, + + /* alpha and underscore characters */ + ['A'] = 8, ['B'] = 8, ['C'] = 8, ['D'] = 8, ['E'] = 8, + ['F'] = 8, ['G'] = 8, ['H'] = 8, ['I'] = 8, ['J'] = 8, + ['K'] = 8, ['L'] = 8, ['M'] = 8, ['N'] = 8, ['O'] = 8, + ['P'] = 8, ['Q'] = 8, ['R'] = 8, ['S'] = 8, ['T'] = 8, + ['U'] = 8, ['V'] = 8, ['W'] = 8, ['X'] = 8, ['Y'] = 8, + ['Z'] = 8, ['a'] = 8, ['b'] = 8, ['c'] = 8, ['d'] = 8, + ['e'] = 8, ['f'] = 8, ['g'] = 8, ['h'] = 8, ['i'] = 8, + ['j'] = 8, ['k'] = 8, ['l'] = 8, ['m'] = 8, ['n'] = 8, + ['o'] = 8, ['p'] = 8, ['q'] = 8, ['r'] = 8, ['s'] = 8, + ['t'] = 8, ['u'] = 8, ['v'] = 8, ['w'] = 8, ['x'] = 8, + ['y'] = 8, ['z'] = 8, ['_'] = 8, }; #define NUM_KEYWORDS (sizeof(Keywords) / sizeof(Keywords[0])) @@ -104,7 +94,7 @@ KeywordDef Keywords[] = { { "type", TYPE }, { "until", UNTIL }, { "var", VAR }, - { "while", WHILE }, + { "while", WHILE }, }; static int keywcmp(const void* a, const void* b) { @@ -167,35 +157,40 @@ static inline void readtok(Parser* ctx) { char *beg = ctx->file->fpos; char *curr = ctx->file->fpos; tok->offset = (beg - ctx->file->fbeg); - switch (FirstChar[(int)*curr++]) { - case 1: /* skip whitespace */ - for (; SPACE[(int)*curr]; curr++); + switch (Chars[(int)*curr++]) { + case WSPACE: /* skip whitespace */ + for (; Chars[(int)*curr] == WSPACE; curr++); break; - case 2: /* skip comments */ + case COMMENT: /* skip comments */ for (; *curr != '\n'; curr++); break; - case 3: /* +/- as ops or number signs */ - tok->type = *(curr-1); - if (!DIGIT[(int)*curr]) break; - /* parse it as an int */ - tok->type = INT; - for (; DIGIT[(int)*curr]; curr++); - break; - - case 4: - tok->type = INT; - for (; DIGIT[(int)*curr]; curr++); + case STRINGS: + tok->type = STRING; + for (; *curr != '"'; curr++); + curr++; break; - case 5: - tok->type = IDENT; - for (; ALNUM_[(int)*curr]; curr++); + case DBL_OP: + if (*(curr-1) == *(curr)) + { + curr++; + switch (*(curr-1)) + { + case '=': tok->type = EQ; break; + case '.': tok->type = DOTDOT; break; + default: goto error; break; + } + } + else + { + tok->type = *(curr-1); + } break; - case 6: /* single/double char tokens */ - if (HasSecondChar[(int)*(curr-1)] && *(curr) == '=') + case MULTI_OP: + if ('=' == *(curr)) { curr++; switch (*(curr-2)) @@ -206,25 +201,24 @@ static inline void readtok(Parser* ctx) { default: goto error; break; } } - else if (*(curr-1) == '.' || *(curr-1) == '=') - { - tok->type = *(curr-1); - if (*(curr) == tok->type) - { - curr++; - tok->type = (tok->type == '.' ? DOTDOT : EQ); - } - } else { tok->type = *(curr-1); } break; - case 7: /* string parsing */ - tok->type = STRING; - for (; *curr != '"'; curr++); - curr++; + case PUNCT: + tok->type = *(curr-1); + break; + + case DIGITS: + tok->type = INT; + for (; Chars[(int)*curr] == DIGITS; curr++); + break; + + case ALPHA_: + tok->type = IDENT; + for (; Chars[(int)*curr] >= DIGITS; curr++); break; case 0: /* error handling */ @@ -368,8 +362,11 @@ TEST_SUITE(Lexer) { "type", TYPE }, { "until", UNTIL }, { "var", VAR }, - { "while", WHILE }, - { "", END_FILE }, + { "while", WHILE }, + { "foo", IDENT }, + { "123", INT }, + { "", STRING }, + { "", END_FILE }, }; @@ -380,7 +377,7 @@ TEST_SUITE(Lexer) for (size_t i = 0; i < sizeof(Tokens)/sizeof(Tokens[0]); i++) { lex(&ctx); - //printf("(%d, '%s') != (%d, '%s')\n", + //printf("(%d, '%s') != (%d, '%s')\n", // ctx.tok.type, ctx.tok.text, Tokens[i].type, Tokens[i].text); CHECK(ctx.tok.type == Tokens[i].type); CHECK(ctx.tok.text != NULL); diff --git a/cerise/tests/tokens.txt b/cerise/tests/tokens.txt index f4301f3..4a967ba 100644 --- a/cerise/tests/tokens.txt +++ b/cerise/tests/tokens.txt @@ -56,3 +56,6 @@ type until var while +foo +123 +""