int type;
} KeywordDef;
-static const char FirstChar[256] = {
+enum { /* Character classes stored in Chars[] and switched on by the tokenizer; 0 (an unlisted byte) is the error class */
+ WSPACE = 1, /* space, tab, CR, LF */
+ COMMENT = 2, /* '#', starts a comment that is skipped up to the next '\n' */
+ STRINGS = 3, /* '"', starts a STRING token that runs to the closing '"' */
+ DBL_OP = 4, /* '=' and '.', which become EQ / DOTDOT when doubled */
+ MULTI_OP = 5, /* '<', '>' and '!', which may be followed by '=' */
+ PUNCT = 6, /* single-character tokens; the token type is the character itself */
+ DIGITS = 7, /* '0'-'9'; must stay directly below ALPHA_ because the identifier scan tests >= DIGITS */
+ ALPHA_ = 8, /* letters and '_'; must remain the highest value for the same reason */
+};
+
+static const char Chars[256] = { /* Character class (WSPACE..ALPHA_) for each byte value; bytes not listed stay 0 and hit the tokenizer's error case */
/* Whitespace */
[' '] = 1, ['\t'] = 1, ['\r'] = 1, ['\n'] = 1,
+ /* NOTE(review): lookups use Chars[(int)*curr] with curr a char*; if char is signed, bytes >= 0x80 give a negative index - confirm input is ASCII */
/* comment start */
['#'] = 2,
- /* number or op */
-// ['+'] = 3, ['-'] = 3,
- /* number digits */
- ['0'] = 4, ['1'] = 4, ['2'] = 4, ['3'] = 4, ['4'] = 4,
- ['5'] = 4, ['6'] = 4, ['7'] = 4, ['8'] = 4, ['9'] = 4,
- /* alpha characters */
- ['A'] = 5, ['B'] = 5, ['C'] = 5, ['D'] = 5, ['E'] = 5,
- ['F'] = 5, ['G'] = 5, ['H'] = 5, ['I'] = 5, ['J'] = 5,
- ['K'] = 5, ['L'] = 5, ['M'] = 5, ['N'] = 5, ['O'] = 5,
- ['P'] = 5, ['Q'] = 5, ['R'] = 5, ['S'] = 5, ['T'] = 5,
- ['U'] = 5, ['V'] = 5, ['W'] = 5, ['X'] = 5, ['Y'] = 5,
- ['Z'] = 5, ['a'] = 5, ['b'] = 5, ['c'] = 5, ['d'] = 5,
- ['e'] = 5, ['f'] = 5, ['g'] = 5, ['h'] = 5, ['i'] = 5,
- ['j'] = 5, ['k'] = 5, ['l'] = 5, ['m'] = 5, ['n'] = 5,
- ['o'] = 5, ['p'] = 5, ['q'] = 5, ['r'] = 5, ['s'] = 5,
- ['t'] = 5, ['u'] = 5, ['v'] = 5, ['w'] = 5, ['x'] = 5,
- ['y'] = 5, ['z'] = 5,
- /* punctuation */
- ['('] = 6, [')'] = 6, ['['] = 6, [']'] = 6, ['{'] = 6, ['}'] = 6,
- ['.'] = 6, [','] = 6, [':'] = 6, ['='] = 6, [';'] = 6, ['^'] = 6,
- ['+'] = 6, ['-'] = 6, ['*'] = 6, ['/'] = 6, ['<'] = 6, ['>'] = 6,
- ['|'] = 6,
- ['!'] = 6,
+
/* strings */
- ['"'] = 7
-};
+ ['"'] = 3, /* STRINGS */
-static const char HasSecondChar[256] = {
- ['<'] = 1, ['>'] = 1, ['!'] = 1
-};
+ /* DBL_OP: '=' and '.' become EQ / DOTDOT when doubled, otherwise the single character is the token */
+ ['='] = 4, ['.'] = 4,
-char SPACE[256] = {
- [' '] = 1, ['\t'] = 1, ['\r'] = 1, ['\n'] = 1,
-};
+ /* MULTI_OP: may be followed by '=' to form a two-character operator */
+ ['<'] = 5, ['>'] = 5, ['!'] = 5,
-char DIGIT[256] = {
- ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
- ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
-};
+ /* PUNCT: punctuation and single-char ops; the token type is the character itself */
+ ['('] = 6, [')'] = 6, ['['] = 6, [']'] = 6, ['{'] = 6,
+ ['}'] = 6, [','] = 6, [':'] = 6, [';'] = 6,
+ ['^'] = 6, ['+'] = 6, ['-'] = 6, ['*'] = 6, ['/'] = 6,
+ ['|'] = 6,
-char ALNUM_[256] = {
- ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
- ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
- ['A'] = 1, ['B'] = 1, ['C'] = 1, ['D'] = 1, ['E'] = 1,
- ['F'] = 1, ['G'] = 1, ['H'] = 1, ['I'] = 1, ['J'] = 1,
- ['K'] = 1, ['L'] = 1, ['M'] = 1, ['N'] = 1, ['O'] = 1,
- ['P'] = 1, ['Q'] = 1, ['R'] = 1, ['S'] = 1, ['T'] = 1,
- ['U'] = 1, ['V'] = 1, ['W'] = 1, ['X'] = 1, ['Y'] = 1,
- ['Z'] = 1, ['a'] = 1, ['b'] = 1, ['c'] = 1, ['d'] = 1,
- ['e'] = 1, ['f'] = 1, ['g'] = 1, ['h'] = 1, ['i'] = 1,
- ['j'] = 1, ['k'] = 1, ['l'] = 1, ['m'] = 1, ['n'] = 1,
- ['o'] = 1, ['p'] = 1, ['q'] = 1, ['r'] = 1, ['s'] = 1,
- ['t'] = 1, ['u'] = 1, ['v'] = 1, ['w'] = 1, ['x'] = 1,
- ['y'] = 1, ['z'] = 1, ['_'] = 1,
+ /* DIGITS: decimal digits; start an INT token and also continue an identifier */
+ ['0'] = 7, ['1'] = 7, ['2'] = 7, ['3'] = 7, ['4'] = 7,
+ ['5'] = 7, ['6'] = 7, ['7'] = 7, ['8'] = 7, ['9'] = 7,
+
+ /* ALPHA_: letters and underscore; start an IDENT token */
+ ['A'] = 8, ['B'] = 8, ['C'] = 8, ['D'] = 8, ['E'] = 8,
+ ['F'] = 8, ['G'] = 8, ['H'] = 8, ['I'] = 8, ['J'] = 8,
+ ['K'] = 8, ['L'] = 8, ['M'] = 8, ['N'] = 8, ['O'] = 8,
+ ['P'] = 8, ['Q'] = 8, ['R'] = 8, ['S'] = 8, ['T'] = 8,
+ ['U'] = 8, ['V'] = 8, ['W'] = 8, ['X'] = 8, ['Y'] = 8,
+ ['Z'] = 8, ['a'] = 8, ['b'] = 8, ['c'] = 8, ['d'] = 8,
+ ['e'] = 8, ['f'] = 8, ['g'] = 8, ['h'] = 8, ['i'] = 8,
+ ['j'] = 8, ['k'] = 8, ['l'] = 8, ['m'] = 8, ['n'] = 8,
+ ['o'] = 8, ['p'] = 8, ['q'] = 8, ['r'] = 8, ['s'] = 8,
+ ['t'] = 8, ['u'] = 8, ['v'] = 8, ['w'] = 8, ['x'] = 8,
+ ['y'] = 8, ['z'] = 8, ['_'] = 8,
};
#define NUM_KEYWORDS (sizeof(Keywords) / sizeof(Keywords[0]))
{ "type", TYPE },
{ "until", UNTIL },
{ "var", VAR },
- { "while", WHILE },
+ { "while", WHILE },
};
static int keywcmp(const void* a, const void* b) {
char *beg = ctx->file->fpos;
char *curr = ctx->file->fpos;
tok->offset = (beg - ctx->file->fbeg);
- switch (FirstChar[(int)*curr++]) {
- case 1: /* skip whitespace */
- for (; SPACE[(int)*curr]; curr++);
+ switch (Chars[(int)*curr++]) {
+ case WSPACE: /* skip whitespace */
+ for (; Chars[(int)*curr] == WSPACE; curr++);
break;
- case 2: /* skip comments */
+ case COMMENT: /* skip comments */
for (; *curr != '\n'; curr++);
break;
- case 3: /* +/- as ops or number signs */
- tok->type = *(curr-1);
- if (!DIGIT[(int)*curr]) break;
- /* parse it as an int */
- tok->type = INT;
- for (; DIGIT[(int)*curr]; curr++);
- break;
-
- case 4:
- tok->type = INT;
- for (; DIGIT[(int)*curr]; curr++);
+ case STRINGS:
+ tok->type = STRING;
+ for (; *curr != '"'; curr++);
+ curr++;
break;
- case 5:
- tok->type = IDENT;
- for (; ALNUM_[(int)*curr]; curr++);
+ case DBL_OP:
+ if (*(curr-1) == *(curr))
+ {
+ curr++;
+ switch (*(curr-1))
+ {
+ case '=': tok->type = EQ; break;
+ case '.': tok->type = DOTDOT; break;
+ default: goto error; break;
+ }
+ }
+ else
+ {
+ tok->type = *(curr-1);
+ }
break;
- case 6: /* single/double char tokens */
- if (HasSecondChar[(int)*(curr-1)] && *(curr) == '=')
+ case MULTI_OP:
+ if ('=' == *(curr))
{
curr++;
switch (*(curr-2))
default: goto error; break;
}
}
- else if (*(curr-1) == '.' || *(curr-1) == '=')
- {
- tok->type = *(curr-1);
- if (*(curr) == tok->type)
- {
- curr++;
- tok->type = (tok->type == '.' ? DOTDOT : EQ);
- }
- }
else
{
tok->type = *(curr-1);
}
break;
- case 7: /* string parsing */
- tok->type = STRING;
- for (; *curr != '"'; curr++);
- curr++;
+ case PUNCT:
+ tok->type = *(curr-1);
+ break;
+
+ case DIGITS:
+ tok->type = INT;
+ for (; Chars[(int)*curr] == DIGITS; curr++);
+ break;
+
+ case ALPHA_:
+ tok->type = IDENT;
+ for (; Chars[(int)*curr] >= DIGITS; curr++);
break;
case 0: /* error handling */
{ "type", TYPE },
{ "until", UNTIL },
{ "var", VAR },
- { "while", WHILE },
- { "", END_FILE },
+ { "while", WHILE },
+ { "foo", IDENT },
+ { "123", INT },
+ { "", STRING },
+ { "", END_FILE },
};
for (size_t i = 0; i < sizeof(Tokens)/sizeof(Tokens[0]); i++)
{
lex(&ctx);
- //printf("(%d, '%s') != (%d, '%s')\n",
+ //printf("(%d, '%s') != (%d, '%s')\n",
// ctx.tok.type, ctx.tok.text, Tokens[i].type, Tokens[i].text);
CHECK(ctx.tok.type == Tokens[i].type);
CHECK(ctx.tok.text != NULL);