Implemented lexer based on the Oberon spec

author mike lowis <mike@mdlowis.com>

Fri, 16 Apr 2021 03:02:06 +0000 (23:02 -0400)

committer mike lowis <mike@mdlowis.com>

Fri, 16 Apr 2021 03:02:06 +0000 (23:02 -0400)
author mike lowis <mike@mdlowis.com>
Fri, 16 Apr 2021 03:02:06 +0000 (23:02 -0400)
committer mike lowis <mike@mdlowis.com>
Fri, 16 Apr 2021 03:02:06 +0000 (23:02 -0400)
diff --git a/cerise/build.sh b/cerise/build.sh

index dec3fbff4cb8f7df15a9372c0faaf087412a5491..10a89f4e976ca680fb6cc6916e8b21b8562d0e78 100755 (executable)
--- a/cerise/build.sh
+++ b/cerise/build.sh
@@ -1,5 +1,5 @@
  #!/bin/sh
  ctags -R &
-cc -g -D CERISE_TESTS -Wall -Wextra -Werror --std=c99 -o cerisec-test *.c tests/*.c \
+cc -g -D CERISE_TESTS -Wall -Wextra -Werror --std=c99 -o cerisec-test *.c \
    && ./cerisec-test \
    && cc -g -Wall -Wextra -Werror --std=c99 -o cerisec *.c
diff --git a/cerise/cerise.h b/cerise/cerise.h

index 2a0060bb94fa107c93501bb26a237aae356ecf4c..f4196bb67a1ecaa46ec9cd5e9bddd821f8c5cfba 100644 (file)
--- a/cerise/cerise.h
+++ b/cerise/cerise.h
@@ -15,13 +15,53 @@ void* emalloc(size_t size);
  /* Token Types
   *****************************************************************************/
  typedef enum {
-    T_NONE = 0,
-    T_STRING = 256, T_ID, T_INT, T_BOOL, T_CHAR, T_FLOAT,
-    T_REQUIRES, T_PROVIDES, T_LET, T_VAR, T_FUN, T_TYPE, T_STRUCT,
-    T_UNION, T_RETURN, T_IF, T_ELSE,
-    T_COUNT,
-    T_ERROR = -2,
-    T_END_FILE = -1
+    NONE = 0,
+    IDENT = 256,
+    INT,
+    STRING,
+    BOOL,
+    EQ,
+    NEQ,
+    LTEQ,
+    GTEQ,
+    DOTDOT,
+    AND,
+    ARRAY,
+    BEGIN,
+    BY,
+    CASE,
+    CONST,
+    DIV,
+    DO,
+    ELSE,
+    ELSIF,
+    END,
+    FALSE,
+    FOR,
+    IF,
+    IMPORT,
+    IS,
+    MOD,
+    MODULE,
+    NIL,
+    NOT,
+    OF,
+    OR,
+    POINTER,
+    PROCEDURE,
+    RECORD,
+    REPEAT,
+    RETURN,
+    THEN,
+    TO,
+    TRUE,
+    TYPE,
+    UNTIL,
+    VAR,
+    WHILE,
+    COUNT,
+    ERROR = -2,
+    END_FILE = -1
  } TokType;
  
  typedef struct {
@@ -37,177 +77,30 @@ typedef struct {
  
  /* Datatype Types
   *****************************************************************************/
-typedef enum {
-    VOID, INT, UINT, FLOAT, ARRAY, REF, PTR, FUNC
-} Kind;
-
-typedef struct Type {
-    Kind kind;
-    union {
-        struct Type* type;
-        size_t bits;
-        struct {
-            struct Type* type;
-            size_t count;
-        } array;
-    } value;
-} Type;
-
-Type* VoidType(void);
-Type* IntType(size_t nbits);
-Type* UIntType(size_t nbits);
-Type* FloatType(size_t nbits);
-Type* ArrayOf(Type* type, size_t count);
-Type* RefTo(Type* type);
-Type* PtrTo(Type* type);
-bool types_equal(Type* type1, Type* type2);
-
-/* Symbol Table
- *****************************************************************************/
-typedef enum {
-    SF_TYPEDEF  = (1 << 0),
-    SF_CONSTANT = (1 << 1),
-    SF_ARGUMENT = (1 << 2),
-} SymFlags;
-
-typedef struct Sym {
-    struct Sym* next;
-    bool is_typedef;
-    int flags;
-    char* name;
-    Type* type;
-} Sym;
-
-typedef struct {
-    Sym* syms;
-} SymTable;
-
-void sym_add(SymTable* syms, int flags, char* name, Type* type);
-Sym* sym_get(SymTable* syms, char* name);
-
-/* AST Types
- *****************************************************************************/
-typedef enum {
-    AST_VAR, AST_FUNC, AST_EXPLIST, AST_IF, AST_APPLY,
-    AST_STRING, AST_SYMBOL, AST_CHAR, AST_INT,
-    AST_FLOAT, AST_BOOL, AST_IDENT, AST_OPER
-} ASTType;
-
-typedef struct AST {
-    ASTType nodetype;
-    Type* datatype;
-    union {
-        struct AST* nodes[3];
-        struct {
-            int oper;
-            struct AST* left;
-            struct AST* right;
-        } op;
-        /* Definition Node */
-        struct {
-            char* name;
-            int flags;
-            struct AST* value;
-        } var;
-        /* Expression Block Node */
-        struct {
-            size_t nexprs;
-            struct AST** exprs;
-        } explist;
-        /* String, Symbol, Identifier */
-        char* text;
-        /* Integer */
-        intptr_t integer;
-        /* Float */
-        double floating;
-    } value;
-} AST;
-
-/* String */
-AST* String(char* val);
-char* string_value(AST* val);
-
-/* Character */
-AST* Char(int val);
-uint32_t char_value(AST* val);
-
-/* Integer */
-AST* Integer(int val);
-intptr_t integer_value(AST* val);
-
-/* Float */
-AST* Float(double val);
-double float_value(AST* val);
-
-/* Bool */
-AST* Bool(bool val);
-bool bool_value(AST* val);
-
-/* Ident */
-AST* Ident(char* val);
-char* ident_value(AST* val);
-
-/* Definition */
-AST* Var(char* name, AST* value, AST* type, int flags);
-char* var_name(AST* var);
-AST* var_value(AST* var);
-bool var_flagset(AST* var, int mask);
-
-AST* Func(AST* args, AST* body, AST* type);
-AST* func_args(AST* func);
-AST* func_body(AST* func);
-
-AST* ExpList(void);
-AST** explist_get(AST* explist, size_t* nexprs);
-void explist_append(AST* explist, AST* expr);
-void explist_prepend(AST* explist, AST* expr);
-
-AST* If(AST* cond, AST* b1, AST* b2);
-AST* if_cond(AST* ifexp);
-AST* if_then(AST* ifexp);
-AST* if_else(AST* ifexp);
-
-AST* Apply(AST* func, AST* args);
-AST* apply_func(AST* apply);
-AST* apply_args(AST* apply);
-
-AST* OpCall(int oper, AST* left, AST* right);
-
-
-/* Package Definition
- *****************************************************************************/
-typedef struct Require {
-    struct Require* next;
-    char* path;
-    char* alias;
-} Require;
-
-typedef struct Provide {
-    struct Provide* next;
-    char* name;
-} Provide;
-
-typedef struct Definition {
-    struct Provide* next;
-    AST* ast;
-} Definition;
-
-typedef struct {
-    char* name;
-    SymTable* syms;
-    Require* requires;
-    Provide* provides;
-    Definition* definitions;
-} Package;
-
-void pkg_add_require(Package* p, char* req);
-void pkg_add_provide(Package* p, char* exp);
-void pkg_add_definition(Package* p, AST* ast);
-
-/* Pretty Printing
- *****************************************************************************/
-void pprint_token(FILE* file, Tok* token, bool print_loc);
-void pprint_tree(FILE* file, AST* tree, int depth);
+//typedef enum {
+//    VOID, INT, UINT, FLOAT, ARRAY, REF, PTR, FUNC
+//} Kind;
+//
+//typedef struct Type {
+//    Kind kind;
+//    union {
+//        struct Type* type;
+//        size_t bits;
+//        struct {
+//            struct Type* type;
+//            size_t count;
+//        } array;
+//    } value;
+//} Type;
+//
+//Type* VoidType(void);
+//Type* IntType(size_t nbits);
+//Type* UIntType(size_t nbits);
+//Type* FloatType(size_t nbits);
+//Type* ArrayOf(Type* type, size_t count);
+//Type* RefTo(Type* type);
+//Type* PtrTo(Type* type);
+//bool types_equal(Type* type1, Type* type2);
  
  /* Lexer and Parser Types
   *****************************************************************************/
@@ -222,8 +115,6 @@ typedef struct {
      LexFile* done;
      LexFile* file;
      Tok tok;
-    SymTable syms;
-    Package pkg;
  } Parser;
  
  void lexfile(Parser* ctx, char* path);
@@ -231,7 +122,6 @@ void lex(Parser* ctx);
  void lexprintpos(Parser* p, FILE* file, Tok* tok);
  void gettoken(Parser* ctx);
  void toplevel(Parser* p);
-void codegen_init(Parser* p);
  
  /* Option Parsing
   *****************************************************************************/
diff --git a/cerise/lex.c b/cerise/lex.c

index 0adffdf765ee2ec90d466414189b0cd39a0e4b60..291d4154cbdc1c59a2826f4ff17adb9978b8afc5 100644 (file)
--- a/cerise/lex.c
+++ b/cerise/lex.c
@@ -14,7 +14,7 @@ static const char FirstChar[256] = {
      /* comment start */
      ['#'] = 2,
      /* number or op */
-    ['+'] = 3, ['-'] = 3,
+//    ['+'] = 3, ['-'] = 3,
      /* number digits */
      ['0'] = 4, ['1'] = 4, ['2'] = 4, ['3'] = 4, ['4'] = 4,
      ['5'] = 4, ['6'] = 4, ['7'] = 4, ['8'] = 4, ['9'] = 4,
@@ -32,12 +32,18 @@ static const char FirstChar[256] = {
      ['y'] = 5, ['z'] = 5,
      /* punctuation */
      ['('] = 6, [')'] = 6, ['['] = 6, [']'] = 6, ['{'] = 6, ['}'] = 6,
-    ['.'] = 6, [','] = 6, [':'] = 6, ['&'] = 6, ['='] = 6, [';'] = 6,
-    ['*'] = 6, ['\''] = 6,
+    ['.'] = 6, [','] = 6, [':'] = 6, ['='] = 6, [';'] = 6, ['^'] = 6,
+    ['+'] = 6, ['-'] = 6, ['*'] = 6, ['/'] = 6, ['<'] = 6, ['>'] = 6, 
+    ['|'] = 6, 
+    ['!'] = 6, 
      /* strings */
      ['"'] = 7
  };
  
+static const char HasSecondChar[256] = {
+    ['<'] = 1, ['>'] = 1, ['!'] = 1
+};
+
  char SPACE[256] = {
      [' '] = 1, ['\t'] = 1, ['\r'] = 1, ['\n'] = 1,
  };
@@ -65,19 +71,40 @@ char ALNUM_[256] = {
  
  #define NUM_KEYWORDS (sizeof(Keywords) / sizeof(Keywords[0]))
  KeywordDef Keywords[] = {
-    { "else",    T_ELSE     },
-    { "false",   T_BOOL     },
-    { "fun",     T_FUN      },
-    { "if",      T_IF       },
-    { "let",     T_LET      },
-    { "provide", T_PROVIDES },
-    { "require", T_REQUIRES },
-    { "return",  T_RETURN   },
-    { "struct",  T_STRUCT   },
-    { "true",    T_BOOL     },
-    { "type",    T_TYPE     },
-    { "union",   T_UNION    },
-    { "var",     T_VAR      },
+    { "and",       AND       },
+    { "array",     ARRAY     },
+    { "begin",     BEGIN     },
+    { "by",        BY        },
+    { "case",      CASE      },
+    { "const",     CONST     },
+    { "div",       DIV       },
+    { "do",        DO        },
+    { "else",      ELSE      },
+    { "elsif",     ELSIF     },
+    { "end",       END       },
+    { "false",     FALSE     },
+    { "for",       FOR       },
+    { "if",        IF        },
+    { "import",    IMPORT    },
+    { "is",        IS        },
+    { "mod",       MOD       },
+    { "module",    MODULE    },
+    { "nil",       NIL       },
+    { "not",       NOT       },
+    { "of",        OF        },
+    { "or",        OR        },
+    { "pointer",   POINTER   },
+    { "procedure", PROCEDURE },
+    { "record",    RECORD    },
+    { "repeat",    REPEAT    },
+    { "return",    RETURN    },
+    { "then",      THEN      },
+    { "to",        TO        },
+    { "true",      TRUE      },
+    { "type",      TYPE      },
+    { "until",     UNTIL     },
+    { "var",       VAR       },
+    { "while",     WHILE     }, 
  };
  
  static int keywcmp(const void* a, const void* b) {
@@ -99,7 +126,7 @@ static inline char* file_load(char* path) {
  
  static inline void convert_value(Tok* tok) {
      switch (tok->type) {
-        case T_STRING: {
+        case STRING: {
              size_t len = strlen(tok->text+1);
              char* strtext = malloc(len);
              strncpy(strtext, tok->text+1, len);
@@ -108,24 +135,24 @@ static inline void convert_value(Tok* tok) {
              break;
          }
  
-        case T_INT: {
+        case INT: {
              tok->value.integer = strtol(tok->text, NULL, 0);
              break;
          }
  
-        case T_ID: {
+        case IDENT: {
              KeywordDef key = { .keyword = tok->text };
              KeywordDef* match = bsearch(
                  &key, Keywords, NUM_KEYWORDS, sizeof(KeywordDef), keywcmp);
              if (match) {
                  tok->type = match->type;
-                if (tok->type != T_ID)
+                if (tok->type != IDENT)
                      convert_value(tok); /* recurse to ensure correct conversion */
              }
              break;
          }
  
-        case T_BOOL: {
+        case BOOL: {
              tok->value.integer = (tok->text[0] == 't');
              break;
          }
@@ -137,7 +164,8 @@ static inline void convert_value(Tok* tok) {
  
  static inline void readtok(Parser* ctx) {
      Tok* tok = &(ctx->tok);
-    char *beg = ctx->file->fpos, *curr = ctx->file->fpos;
+    char *beg = ctx->file->fpos;
+    char *curr = ctx->file->fpos;
      tok->offset = (beg - ctx->file->fbeg);
      switch (FirstChar[(int)*curr++]) {
          case 1: /* skip whitespace */
@@ -152,32 +180,56 @@ static inline void readtok(Parser* ctx) {
              tok->type = *(curr-1);
              if (!DIGIT[(int)*curr]) break;
              /* parse it as an int */
-            tok->type = T_INT;
+            tok->type = INT;
              for (; DIGIT[(int)*curr]; curr++);
              break;
  
          case 4:
-            tok->type = T_INT;
+            tok->type = INT;
              for (; DIGIT[(int)*curr]; curr++);
              break;
  
          case 5:
-            tok->type = T_ID;
+            tok->type = IDENT;
              for (; ALNUM_[(int)*curr]; curr++);
              break;
  
-        case 6: /* single char tokens */
-            tok->type = *(curr-1);
+        case 6: /* single/double char tokens */
+            if (HasSecondChar[(int)*(curr-1)] && *(curr) == '=')
+            {
+                curr++;
+                switch (*(curr-2))
+                {
+                    case '!': tok->type = NEQ;  break;
+                    case '<': tok->type = LTEQ; break;
+                    case '>': tok->type = GTEQ; break;
+                    default:  goto error;       break;
+                }
+            }
+            else if (*(curr-1) == '.' || *(curr-1) == '=')
+            {
+                tok->type = *(curr-1);
+                if (*(curr) == tok->type)
+                {
+                    curr++;
+                    tok->type = (tok->type == '.' ? DOTDOT : EQ);
+                }
+            }
+            else
+            {
+                tok->type = *(curr-1);
+            }
              break;
  
          case 7: /* string parsing */
-            tok->type = T_STRING;
+            tok->type = STRING;
              for (; *curr != '"'; curr++);
              curr++;
              break;
  
          case 0: /* error handling */
          default:
+        error:
              fprintf(stderr, "Failed to parse token '%c'\n", *(curr-1));
              exit(1);
      }
@@ -203,11 +255,12 @@ void lexfile(Parser* ctx, char* path) {
  
  void lex(Parser* ctx) {
      ctx->tok.file = ctx->file->path;
-    ctx->tok.type = T_NONE;
-    while (ctx->tok.type == T_NONE) {
+    ctx->tok.type = NONE;
+    while (ctx->tok.type == NONE) {
          if (!ctx->file) {
              /* no more files left to process */
-            ctx->tok.type = T_END_FILE;
+            ctx->tok.type = END_FILE;
+            ctx->tok.text = "";
              return;
          } else if (!(ctx->file->fpos) || !*(ctx->file->fpos)) {
              /* grab the next file to process */
@@ -248,3 +301,93 @@ void lexprintpos(Parser* p, FILE* file, Tok* tok) {
      }
      fprintf(file, "%s:%zu:%zu:", tok->file, line, col);
  }
+
+#ifdef CERISE_TESTS
+#include "atf.h"
+
+TEST_SUITE(Lexer)
+{
+    struct {
+        char* text;
+        int type;
+    } Tokens[] = {
+        { "+",         '+'       },
+        { "-",         '-'       },
+        { "/",         '/'       },
+        { "*",         '*'       },
+        { "^",         '^'       },
+        { "=",         '='       },
+        { "==",        EQ        },
+        { "!=",        NEQ       },
+        { "<",         '<'       },
+        { ">",         '>'       },
+        { "<=",        LTEQ      },
+        { ">=",        GTEQ      },
+        { ".",         '.'       },
+        { ",",         ','       },
+        { ";",         ';'       },
+        { "..",        DOTDOT    },
+        { "|",         '|'       },
+        { ":",         ':'       },
+        { "(",         '('       },
+        { ")",         ')'       },
+        { "[",         '['       },
+        { "]",         ']'       },
+        { "{",         '{'       },
+        { "}",         '}'       },
+        { "and",       AND       },
+        { "array",     ARRAY     },
+        { "begin",     BEGIN     },
+        { "by",        BY        },
+        { "case",      CASE      },
+        { "const",     CONST     },
+        { "div",       DIV       },
+        { "do",        DO        },
+        { "else",      ELSE      },
+        { "elsif",     ELSIF     },
+        { "end",       END       },
+        { "false",     FALSE     },
+        { "for",       FOR       },
+        { "if",        IF        },
+        { "import",    IMPORT    },
+        { "is",        IS        },
+        { "mod",       MOD       },
+        { "module",    MODULE    },
+        { "nil",       NIL       },
+        { "not",       NOT       },
+        { "of",        OF        },
+        { "or",        OR        },
+        { "pointer",   POINTER   },
+        { "procedure", PROCEDURE },
+        { "record",    RECORD    },
+        { "repeat",    REPEAT    },
+        { "return",    RETURN    },
+        { "then",      THEN      },
+        { "to",        TO        },
+        { "true",      TRUE      },
+        { "type",      TYPE      },
+        { "until",     UNTIL     },
+        { "var",       VAR       },
+        { "while",     WHILE     }, 
+        { "",          END_FILE  }, 
+    };
+
+
+    TEST(Lexer recognizes all required tokens)
+    {
+        Parser ctx = {0};
+        lexfile(&ctx, "tests/tokens.txt");
+        for (size_t i = 0; i < sizeof(Tokens)/sizeof(Tokens[0]); i++)
+        {
+            lex(&ctx);
+            //printf("(%d, '%s') != (%d, '%s')\n", 
+            //    ctx.tok.type, ctx.tok.text, Tokens[i].type, Tokens[i].text);
+            CHECK(ctx.tok.type == Tokens[i].type);
+            CHECK(ctx.tok.text != NULL);
+            CHECK(!strcmp(ctx.tok.text, Tokens[i].text));
+        }
+    }
+}
+
+#endif
+
diff --git a/cerise/main.c b/cerise/main.c

index 8e85037cba81c260ff41d75c4b40abe9f9747980..34766a5ef787eb37f7b6e0a0948a0da0ef6114d0 100644 (file)
--- a/cerise/main.c
+++ b/cerise/main.c
@@ -1,23 +1,28 @@
  #include "cerise.h"
  
+#ifndef CERISE_TESTS
+
  char* ARGV0;
  char* Artifact = "bin";
  
  /* Driver Modes
   *****************************************************************************/
-static int emit_binary(Parser* ctx, int argc, char **argv) {
+static int emit_binary(Parser* ctx, int argc, char **argv) 
+{
      (void)ctx, (void)argc, (void)argv;
      return 0;
  }
  
-static int emit_library(Parser* ctx, int argc, char **argv) {
+static int emit_library(Parser* ctx, int argc, char **argv) 
+{
      (void)ctx, (void)argc, (void)argv;
      return 0;
  }
  
  /* Main Routine and Usage
   *****************************************************************************/
-void usage(void) {
+void usage(void) 
+{
      fprintf(stderr, "%s\n",
          "Usage: sclpl [options...] [-A artifact] [file...]\n"
          "\n-A<artifact> Emit the given type of artifact"
@@ -26,7 +31,8 @@ void usage(void) {
      exit(1);
  }
  
-int main(int argc, char **argv) {
+int main(int argc, char **argv) 
+{
      /* Option parsing */
      OPTBEGIN {
          case 'A': Artifact = EOPTARG(usage()); break;
@@ -37,13 +43,31 @@ int main(int argc, char **argv) {
      for (; argc; argc--,argv++)
          lexfile(&ctx, *argv);
      /* Execute the main compiler process */
-    if (0 == strcmp("bin", Artifact)) {
+    if (0 == strcmp("bin", Artifact)) 
+    {
          return emit_binary(&ctx, argc, argv);
-    } else if (0 == strcmp("lib", Artifact)) {
+    } 
+    else if (0 == strcmp("lib", Artifact)) 
+    {
          return emit_library(&ctx, argc, argv);
-    } else {
+    } 
+    else 
+    {
          fprintf(stderr, "Unknown artifact type: '%s'\n\n", Artifact);
          usage();
      }
      return 1;
  }
+
+#else
+
+#define INCLUDE_DEFS
+#include "atf.h"
+int main(int argc, char **argv) 
+{
+    atf_init(argc, argv);
+    RUN_EXTERN_TEST_SUITE(Lexer);
+    return atf_print_results();
+}
+
+#endif
diff --git a/cerise/tests/tokens.txt b/cerise/tests/tokens.txt

index b9068f3680bdf22450c5493cb9fb6a72703af64a..f4301f363c863f0b6438b59aa47fd2df3e368c87 100644 (file)
--- a/cerise/tests/tokens.txt
+++ b/cerise/tests/tokens.txt
@@ -4,7 +4,7 @@
  *
  ^
  =
-:=
+==
  !=
  <
  >
author	mike lowis <mike@mdlowis.com>
	Fri, 16 Apr 2021 03:02:06 +0000 (23:02 -0400)
committer	mike lowis <mike@mdlowis.com>
	Fri, 16 Apr 2021 03:02:06 +0000 (23:02 -0400)
cerise/build.sh		patch | blob | history
cerise/cerise.h		patch | blob | history
cerise/lex.c		patch | blob | history
cerise/main.c		patch | blob | history
cerise/tests/tokens.txt		patch | blob | history