Converted handwritten lexer to flex based lexer

author Michael D. Lowis <mike@mdlowis.com>

Sun, 24 Jan 2016 22:06:53 +0000 (17:06 -0500)

committer Michael D. Lowis <mike@mdlowis.com>

Sun, 24 Jan 2016 22:06:53 +0000 (17:06 -0500)
author Michael D. Lowis <mike@mdlowis.com>
Sun, 24 Jan 2016 22:06:53 +0000 (17:06 -0500)
committer Michael D. Lowis <mike@mdlowis.com>
Sun, 24 Jan 2016 22:06:53 +0000 (17:06 -0500)
diff --git a/.gitignore b/.gitignore

index 0b742e65e9cb562f4c6ba376c33d5f4093263407..4bf735a9d0005d46f392d0e61e5e61d375b849be 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@ cscope.out
  .DS_Store
  .rsconscache
  sclpl
+source/lexer.c
diff --git a/Makefile b/Makefile

index 323108500356c45a0333b364cea20ffb569c8bfb..defd536fcfea0eaa96a70752efd96a6ad7fb814f 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -30,8 +30,8 @@ OBJS = source/main.o    \
         source/gc.o      \
         source/vec.o     \
         source/pprint.o  \
-       source/lexer.o   \
         source/parser.o  \
+       source/lexer.o   \
         source/ast.o     \
         source/anf.o     \
         source/codegen.o
@@ -59,17 +59,21 @@ ${BIN}: lib${BIN}.a
         @echo LD $@
         @${LD} ${LDFLAGS} -o $@ $^
  
-${TESTBIN}: ${TESTOBJS}
-       @echo LD $@
-       @${LD} ${LDFLAGS} -o $@ $^
-
-tests: $(TESTBIN)
-       @./$<
+#${TESTBIN}: ${TESTOBJS}
+#      @echo LD $@
+#      @${LD} ${LDFLAGS} -o $@ $^
+#
+#tests: $(TESTBIN)
+#      @./$<
  
  specs: $(BIN)
         @echo TEST $<
         @rspec --pattern 'spec/**{,/*/**}/*_spec.rb'
  
+.l.c:
+       @echo LEX $<
+       @${LEX} -o $@ $<
+
  .c.o:
         @echo CC $<
         @${CC} ${CFLAGS} -c -o $@ $<
@@ -77,7 +81,7 @@ specs: $(BIN)
  clean:
         @rm -f ${BIN} lib${BIN}.a
         @rm -f ${TESTBIN} ${TESTOBJS} ${TESTOBJS:.o=.gcda} ${TESTOBJS:.o=.gcno}
-       @rm -f ${OBJS} ${OBJS:.o=.gcda} ${OBJS:.o=.gcno}
+       @rm -f ${OBJS} ${OBJS:.o=.gcda} ${OBJS:.o=.gcno} source/lexer.c
  
  .PHONY: all options tests specs
  
diff --git a/source/lexer.c b/source/lexer.c

deleted file mode 100644 (file)

index 61ddef2..0000000
--- a/source/lexer.c
+++ /dev/null
@@ -1,435 +0,0 @@
-/**
-  @file lexer.c
-  @brief See header for details
-  $Revision$
-  $HeadURL$
-  */
-#include <sclpl.h>
-
-/* Private Declarations
- *****************************************************************************/
-// Token Scanning
-static char* scan(Parser* ctx, size_t* line, size_t* column);
-static void skip_ws(Parser* ctx);
-static char* read_string(Parser* ctx);
-static bool eof(Parser* ctx);
-static bool oneof(Parser* ctx, const char* set);
-static char current(Parser* ctx);
-static bool eol(Parser* ctx);
-static char* dup(Parser* ctx, size_t start_idx, size_t len);
-
-// Lexical Analysis
-static Tok* classify(const char* file, size_t line, size_t col, char* text);
-static bool char_oneof(const char* class, char c);
-static Tok* punctuation(char* text);
-static Tok* character(char* text);
-static Tok* integer(char* text, int base);
-static int getradix(char ch);
-static Tok* radixint(char* text);
-static bool is_float(char* text);
-static Tok* floating(char* text);
-static Tok* number(char* text);
-static Tok* boolean(char* text);
-
-// Token Constructors
-static Tok* Token(TokType type);
-static Tok* TextTok(TokType type, char* text);
-static Tok* CharTok(uint32_t val);
-static Tok* IntTok(intptr_t val);
-static Tok* FloatTok(double val);
-static Tok* BoolTok(bool val);
-static void token_free(void* obj);
-
-// Utility Functions
-static char* dupstring(const char* old);
-
-/* Public API
- *****************************************************************************/
-Tok* gettoken(Parser* ctx)
-{
-    Tok* tok = NULL;
-    size_t line, col;
-    char* text = scan(ctx, &line, &col);
-    if (text != NULL) {
-        tok = classify(NULL, line, col, text);
-        free(text);
-    }
-    return tok;
-}
-
-void fetchline(Parser* ctx)
-{
-    int c;
-    size_t capacity = 8;
-    size_t index    = 0;
-    /* Reallocate and clear the line buffer */
-    ctx->line = realloc(ctx->line, capacity);
-    ctx->line[0] = '\0';
-    ctx->index = 0;
-
-    /* If we have not yet reached the end of the file, read the next line */
-    if (!eof(ctx)) {
-        if (NULL != ctx->prompt)
-            printf("%s", ctx->prompt);
-        while(('\n' != (c = fgetc(ctx->input))) && (EOF != c)) {
-            if (index+2 == capacity) {
-                capacity <<= 1u;
-                ctx->line = realloc(ctx->line, capacity);
-            }
-            ctx->line[index++] = c;
-        }
-        ctx->line[index++] = (c == EOF) ? '\0' : c;
-        ctx->line[index++] = '\0';
-        ctx->index = 0;
-        /* Increment line count */
-        ctx->lineno++;
-    }
-}
-
-/* Token Scanning
- *****************************************************************************/
-static char* scan(Parser* ctx, size_t* line, size_t* column)
-{
-    char* tok = NULL;
-    skip_ws(ctx);
-    *line   = ctx->lineno;
-    *column = ctx->index+1;
-    if (!eof(ctx)) {
-        if (oneof(ctx, "()[]{};,'")) {
-            tok = dup(ctx, ctx->index, 1);
-            ctx->index++;
-        } else if (current(ctx) == '"') {
-            tok = read_string(ctx);
-        } else {
-            size_t start = ctx->index;
-            while(!oneof(ctx," \t\r\n()[]{};,'\"") && (current(ctx) != '\0')) {
-                ctx->index++;
-            }
-            tok = dup(ctx, start, ctx->index - start);
-        }
-    }
-    return tok;
-}
-
-static void skip_ws(Parser* ctx)
-{
-    /* If we haven't read a line yet, read one now */
-    if (NULL == ctx->line)
-        fetchline(ctx);
-    /* Fast forward past whitespace and read a newline if necessary  */
-    while(!eof(ctx)) {
-        if ('\0' == current(ctx)) {
-            fetchline(ctx);
-        } else if (oneof(ctx, " \t\r\n")) {
-            ctx->index++;
-        } else {
-            break;
-        }
-    }
-}
-
-static char* read_string(Parser* ctx)
-{
-    size_t capacity = 8;
-    size_t index = 0;
-    char*  tok = (char*)malloc(capacity);
-
-    /* Skip the first " */
-    tok[index++] = current(ctx);
-    tok[index] = '\0';
-    ctx->index++;
-
-    /* Read the contents of the string */
-    while ('"' != current(ctx)) {
-        /* Resize the buffer if necessary */
-        if ((index+2) >= capacity) {
-            capacity = capacity << 1;
-            tok = (char*)realloc(tok, capacity);
-        }
-
-        /* EOF results in an assertion (don't do) */
-        if (eof(ctx))
-            assert(false);
-
-        /* Read the char */
-        tok[index++] = current(ctx);
-        tok[index] = '\0';
-        ctx->index++;
-
-        /* Get the next line if necessary */
-        if ('\n' == tok[index-1])
-            fetchline(ctx);
-    }
-
-    /* Skip the last " */
-    tok[index++] = current(ctx);
-    tok[index] = '\0';
-    ctx->index++;
-
-    return tok;
-}
-
-static bool eof(Parser* ctx)
-{
-    return (eol(ctx) && feof(ctx->input));
-}
-
-static bool oneof(Parser* ctx, const char* set)
-{
-    bool ret = false;
-    size_t sz = strlen(set);
-    for (size_t idx = 0; idx < sz; idx++) {
-        if (current(ctx) == set[idx]) {
-            ret = true;
-            break;
-        }
-    }
-    return ret;
-}
-
-static char current(Parser* ctx)
-{
-    return ctx->line[ctx->index];
-}
-
-static bool eol(Parser* ctx)
-{
-    bool ret = true;
-    size_t index = ctx->index;
-    char ch;
-    while((NULL != ctx->line) && ('\0' != (ch = ctx->line[index]))) {
-        if((' '!=ch) && ('\t'!=ch) && ('\r'!=ch) && ('\n'!=ch)) {
-            ret = false;
-            break;
-        }
-        index++;
-    }
-    return ret;
-}
-
-static char* dup(Parser* ctx, size_t start_idx, size_t len)
-{
-    char* str = (char*)malloc(len+1);
-    memcpy(str, &(ctx->line[start_idx]), len);
-    str[len] = '\0';
-    return str;
-}
-
-
-/* Lexical Analysis
- *****************************************************************************/
-static Tok* classify(const char* file, size_t line, size_t col, char* text)
-{
-    Tok* tok = NULL;
-    (void)file;
-    if (0 == strcmp(text,"end")) {
-        tok = Token(T_END);
-    } else if (char_oneof("()[]{};,'", text[0])) {
-        tok = punctuation(text);
-    } else if ('"' == text[0]) {
-        text[strlen(text)-1] = '\0';
-        tok = TextTok(T_STRING, &text[1]);
-    } else if (text[0] == '\\') {
-        tok = character(text);
-    } else if ((text[0] == '0') && char_oneof("bodh",text[1])) {
-        tok = radixint(text);
-    } else if (char_oneof("+-0123456789",text[0])) {
-        tok = number(text);
-    } else if ((0 == strcmp(text,"true")) || (0 == strcmp(text,"false"))) {
-        tok = boolean(text);
-    } else {
-        tok = TextTok(T_ID, text);
-    }
-    /* If we found a valid token then fill in the location details */
-    if (NULL != tok) {
-        tok->line = line;
-        tok->col  = col;
-    }
-    return tok;
-}
-
-static bool char_oneof(const char* class, char c)
-{
-    bool ret = false;
-    size_t sz = strlen(class);
-    for (size_t idx = 0; idx < sz; idx++) {
-        if (c == class[idx]) {
-            ret = true;
-            break;
-        }
-    }
-    return ret;
-}
-
-static Tok* punctuation(char* text)
-{
-    Tok* tok = NULL;
-    switch (text[0]) {
-        case '(':  tok = Token(T_LPAR);   break;
-        case ')':  tok = Token(T_RPAR);   break;
-        case '{':  tok = Token(T_LBRACE); break;
-        case '}':  tok = Token(T_RBRACE); break;
-        case '[':  tok = Token(T_LBRACK); break;
-        case ']':  tok = Token(T_RBRACK); break;
-        case ';':  tok = Token(T_END);    break;
-        case ',':  tok = Token(T_COMMA);  break;
-        case '\'': tok = Token(T_SQUOTE); break;
-    }
-    return tok;
-}
-
-static Tok* character(char* text)
-{
-    Tok* tok = NULL;
-    static const char* lookuptable[5] = {
-        " \0space",
-        "\n\0newline",
-        "\r\0return",
-        "\t\0tab",
-        "\v\0vtab"
-    };
-    if (strlen(text) == 2) {
-        tok = CharTok((uint32_t)text[1]);
-    } else {
-        for(int i = 0; i < 5; i++) {
-            if (0 == strcmp(&text[1], &(lookuptable[i][2]))) {
-                tok = CharTok((uint32_t)lookuptable[i][0]);
-                break;
-            }
-        }
-        if (NULL == tok)
-            tok = TextTok(T_ID, text);
-    }
-    return tok;
-}
-
-static Tok* integer(char* text, int base)
-{
-    char* end;
-    long integer;
-    errno = 0;
-    integer = strtol(text, &end, base);
-    assert(errno == 0);
-    return (end[0] == '\0') ? IntTok(integer) : NULL;
-}
-
-static int getradix(char ch)
-{
-    int ret = -1;
-    switch(ch) {
-        case 'b': ret = 2;  break;
-        case 'o': ret = 8;  break;
-        case 'd': ret = 10; break;
-        case 'h': ret = 16; break;
-    }
-    return ret;
-}
-
-static Tok* radixint(char* text)
-{
-    Tok* ret = integer(&text[2], getradix(text[1]));
-    if (NULL == ret)
-        ret = TextTok(T_ID, text);
-    return ret;
-}
-
-static bool is_float(char* text)
-{
-    while (text[0] != '\0')
-        if (text[0] == '.')
-            return true;
-        else
-            text++;
-    return false;
-}
-
-static Tok* floating(char* text)
-{
-    char* end;
-    double dbl;
-    errno = 0;
-    dbl = strtod(text, &end);
-    assert(errno == 0);
-    return (end[0] == '\0') ? FloatTok(dbl) : NULL;
-}
-
-static Tok* number(char* text)
-{
-    Tok* tok = NULL;
-    if (is_float(text))
-        tok = floating(text);
-    else
-        tok = integer(text, 10);
-    return (NULL == tok) ? TextTok(T_ID, text) : tok;
-}
-
-static Tok* boolean(char* text)
-{
-    return BoolTok(0 == strcmp(text,"true"));
-}
-
-/* Token Constructors
- *****************************************************************************/
-static Tok* Token(TokType type)
-{
-    Tok* tok = (Tok*)gc_alloc(sizeof(Tok), &token_free);
-    tok->type = type;
-    return tok;
-}
-
-static Tok* TextTok(TokType type, char* text)
-{
-    Tok* tok = Token(type);
-    tok->value.text = (char*)gc_addref(dupstring(text));
-    return tok;
-}
-
-static Tok* CharTok(uint32_t val)
-{
-    Tok* tok = Token(T_CHAR);
-    tok->value.character = val;
-    return tok;
-}
-
-static Tok* IntTok(intptr_t val)
-{
-    Tok* tok = Token(T_INT);
-    tok->value.integer = val;
-    return tok;
-}
-
-static Tok* FloatTok(double val)
-{
-    Tok* tok = Token(T_FLOAT);
-    tok->value.floating = val;
-    return tok;
-}
-
-static Tok* BoolTok(bool val)
-{
-    Tok* tok = Token(T_BOOL);
-    tok->value.boolean = val;
-    return tok;
-}
-
-static void token_free(void* obj)
-{
-    Tok* tok = (Tok*)obj;
-    if ((tok->type != T_BOOL) &&
-        (tok->type != T_CHAR) &&
-        (tok->type != T_INT) &&
-        (tok->type != T_FLOAT) &&
-        (NULL != tok->value.text))
-        gc_delref(tok->value.text);
-}
-
-/* Utility Functions
- *****************************************************************************/
-static char* dupstring(const char* old) {
-    size_t length = strlen(old);
-    char* str = (char*)gc_alloc(length+1, NULL);
-    memcpy(str, old, length);
-    str[length] = '\0';
-    return str;
-}
-
diff --git a/source/lexer.l b/source/lexer.l

new file mode 100644 (file)

index 0000000..ae0659d
--- /dev/null
+++ b/source/lexer.l
@@ -0,0 +1,144 @@
+%{
+#include <sclpl.h>
+
+static union {
+    char* text;
+    uint32_t character;
+    intptr_t integer;
+    double floating;
+    bool boolean;
+} Value;
+
+static char* dupstring(const char* old) {
+    size_t length = strlen(old);
+    char* str = (char*)gc_alloc(length+1, NULL);
+    memcpy(str, old, length);
+    str[length] = '\0';
+    return str;
+}
+
+%}
+
+DIGIT   [0-9]
+ALPHA   [a-zA-Z]
+ALPHA_  [a-zA-Z_]
+ALNUM_  [a-zA-Z0-9_]
+SPACE   [ \t\r\n]
+NOSPACE [^ \t\r\n]
+
+%option noyywrap
+
+%%
+
+<<EOF>> { return T_END_FILE; }
+
+"end" { return T_END;    }
+"("   { return T_LPAR;   }
+")"   { return T_RPAR;   }
+"["   { return T_LBRACK; }
+"]"   { return T_RBRACK; }
+"{"   { return T_LBRACE; }
+"}"   { return T_RBRACE; }
+";"   { return T_END;    }
+","   { return T_COMMA;  }
+"'"   { return T_SQUOTE; }
+
+\\.       { Value.character = yytext[1];    return T_CHAR; }
+\\space   { Value.character = ' ';          return T_CHAR; }
+\\newline { Value.character = '\n';         return T_CHAR; }
+\\return  { Value.character = '\r';         return T_CHAR; }
+\\tab     { Value.character = '\t';         return T_CHAR; }
+\\vtab    { Value.character = '\v';         return T_CHAR; }
+\\[a-z]+  { Value.text = dupstring(yytext); return T_ID;   }
+
+0b[01]+ {
+    Value.integer = strtol(&yytext[2], NULL, 2);
+    return T_INT;
+}
+
+0o[0-7]+ {
+    Value.integer = strtol(&yytext[2], NULL, 8);
+    return T_INT;
+}
+
+0d[0-9]+ {
+    Value.integer = strtol(&yytext[2], NULL, 10);
+    return T_INT;
+}
+
+0h[0-9a-fA-F]+ {
+    Value.integer = strtol(&yytext[2], NULL, 16);
+    return T_INT;
+}
+
+0[bodh][0-9a-fA-F]+ {
+    Value.text = dupstring(yytext);
+    return T_ID;
+}
+
+[+-]?[0-9]+ {
+    Value.integer = strtol(&yytext[0], NULL, 10);
+    return T_INT;
+}
+
+[+-]?[0-9]+\.[0-9]+(e[+-]?[0-9]+)? {
+    Value.floating = strtod(yytext, NULL);
+    return T_FLOAT;
+}
+
+\"([^"]|\\\")*\" {
+    size_t len = strlen(&yytext[1]);
+    Value.text = dupstring(&yytext[1]);
+    Value.text[len-1] = '\0';
+    return T_STRING;
+}
+
+true  {
+    Value.boolean = true;
+    return T_BOOL;
+}
+
+false {
+    Value.boolean = false;
+    return T_BOOL;
+}
+
+[a-zA-Z_][a-zA-Z0-9_]* {
+    Value.text = dupstring(yytext);
+    return T_ID;
+}
+
+[^ \r\t\n\[\]\{\}\(\)'\",;]+ {
+    Value.text = dupstring(yytext);
+    return T_ID;
+}
+
+%%
+
+static void token_free(void* obj)
+{
+    Tok* tok = (Tok*)obj;
+    if ((tok->type != T_BOOL) &&
+        (tok->type != T_CHAR) &&
+        (tok->type != T_INT) &&
+        (tok->type != T_FLOAT) &&
+        (NULL != tok->value.text))
+        gc_delref(tok->value.text);
+}
+
+Tok* gettoken(Parser* ctx)
+{
+    Tok* tok = NULL;
+    int type = yylex();
+    if (type != T_END_FILE) {
+        tok = (Tok*)gc_alloc(sizeof(Tok), &token_free);
+        tok->type = type;
+        memcpy(&(tok->value), &Value, sizeof(Value));
+    }
+    return tok;
+}
+
+void fetchline(Parser* ctx)
+{
+}
+
diff --git a/source/opt.h b/source/opt.h

index 126fcbeb45d87310d8b6661fe134e1d846986727..a859258b1d798c6e93ee9fee42db122a5dfc8c9f 100644 (file)
--- a/source/opt.h
+++ b/source/opt.h
@@ -35,7 +35,7 @@ extern char* ARGV0;
  /* This is a helper function used by the macros in this file to parse the next
   * option from the command line.
   */
-static inline char* getopt(int* p_argc, char*** p_argv) {
+static inline char* __getopt(int* p_argc, char*** p_argv) {
      if (!(*p_argv)[0][1] && !(*p_argv)[1]) {
          return (char*)0;
      } else if ((*p_argv)[0][1]) {
@@ -73,13 +73,13 @@ static inline char* getopt(int* p_argc, char*** p_argv) {
  /* Get an argument from the command line and return it as a string. If no
   * argument is available, this macro returns NULL */
  #define OPTARG() \
-    (optarg_ = getopt(&argc,&argv), brk_ = (optarg_!=0), optarg_)
+    (optarg_ = __getopt(&argc,&argv), brk_ = (optarg_!=0), optarg_)
  
  /* Get an argument from the command line and return it as a string. If no
   * argument is available, this macro executes the provided code. If that code
   * returns, then abort is called. */
  #define EOPTARG(code) \
-    (optarg_ = getopt(&argc,&argv), \
+    (optarg_ = __getopt(&argc,&argv), \
       (!optarg_ ? ((code), abort(), (char*)0) : (brk_ = 1, optarg_)))
  
  /* Helper macro to recognize number options */
diff --git a/spec/anf_spec.rb b/spec/anf_spec.rb

index 1a7cda2f234399b53e6b8934d728534928bea5e9..57e9cc3f582078dfa9f4ba1d091b6c6220aaa770 100644 (file)
--- a/spec/anf_spec.rb
+++ b/spec/anf_spec.rb
@@ -187,8 +187,8 @@ describe "sclpl a-normal form" do
        expect(anf('fn() if 1 2 else 3;;')).to eq([
          ["fn", [],
            ["if", "T_INT:1",
-            ["let", ["$:0", "T_INT:2"], "$:0"],
-            ["let", ["$:1", "T_INT:3"], "$:1"]]]
+            ["let", ["$:1", "T_INT:2"], "$:1"],
+            ["let", ["$:2", "T_INT:3"], "$:2"]]]
        ])
      end
  
diff --git a/spec/parser_spec.rb b/spec/parser_spec.rb

index d0d0065406d90da6ab6c41a72b1b2c68e13c693f..ffdcac409f5d8551ba53ab4d1ac74b13a7b34151 100644 (file)
--- a/spec/parser_spec.rb
+++ b/spec/parser_spec.rb
@@ -191,23 +191,23 @@ describe "sclpl grammar" do
        it "should normalize a literal with an if expression" do
          expect(ast('fn() if 1 2 else 3;;')).to eq([
            ["fn", [],
-            ["let", ["$:2", ["if", "T_INT:1",
-                              ["let", ["$:0", "T_INT:2"], "$:0"],
-                              ["let", ["$:1", "T_INT:3"], "$:1"]]],
-              "$:2"]]
+            ["let", ["$:0", ["if", "T_INT:1",
+                              ["let", ["$:1", "T_INT:2"], "$:1"],
+                              ["let", ["$:2", "T_INT:3"], "$:2"]]],
+              "$:0"]]
          ])
        end
  
        it "should normalize a literal with two sequential if expressions" do
          expect(ast('fn() if 1 2 else 3; if 1 2 else 3; ;')).to eq([
            ["fn", [],
-            ["let", ["$:2", ["if", "T_INT:1",
-                              ["let", ["$:0", "T_INT:2"], "$:0"],
-                              ["let", ["$:1", "T_INT:3"], "$:1"]]],
-              ["let", ["$:5", ["if", "T_INT:1",
-                                ["let", ["$:3", "T_INT:2"], "$:3"],
-                                ["let", ["$:4", "T_INT:3"], "$:4"]]],
-                "$:5"]]]
+            ["let", ["$:0", ["if", "T_INT:1",
+                              ["let", ["$:1", "T_INT:2"], "$:1"],
+                              ["let", ["$:2", "T_INT:3"], "$:2"]]],
+              ["let", ["$:3", ["if", "T_INT:1",
+                                ["let", ["$:4", "T_INT:2"], "$:4"],
+                                ["let", ["$:5", "T_INT:3"], "$:5"]]],
+                "$:3"]]]
          ])
        end
      end
author	Michael D. Lowis <mike@mdlowis.com>
	Sun, 24 Jan 2016 22:06:53 +0000 (17:06 -0500)
committer	Michael D. Lowis <mike@mdlowis.com>
	Sun, 24 Jan 2016 22:06:53 +0000 (17:06 -0500)
.gitignore		patch \| blob \| history
Makefile		patch \| blob \| history
source/lexer.c	[deleted file]	patch \| blob \| history
source/lexer.l	[new file with mode: 0644]	patch \| blob
source/opt.h		patch \| blob \| history
spec/anf_spec.rb		patch \| blob \| history
spec/parser_spec.rb		patch \| blob \| history