From: mike lowis Date: Fri, 16 Apr 2021 03:02:06 +0000 (-0400) Subject: implemented lexer based on oberon spec X-Git-Url: https://git.mdlowis.com/?a=commitdiff_plain;h=271612c474e95c8993f8cfd7bbce7e8dfe9601c9;p=proto%2Fobnc.git implemented lexer based on oberon spec --- diff --git a/cerise/build.sh b/cerise/build.sh index dec3fbf..10a89f4 100755 --- a/cerise/build.sh +++ b/cerise/build.sh @@ -1,5 +1,5 @@ #!/bin/sh ctags -R & -cc -g -D CERISE_TESTS -Wall -Wextra -Werror --std=c99 -o cerisec-test *.c tests/*.c \ +cc -g -D CERISE_TESTS -Wall -Wextra -Werror --std=c99 -o cerisec-test *.c \ && ./cerisec-test \ && cc -g -Wall -Wextra -Werror --std=c99 -o cerisec *.c diff --git a/cerise/cerise.h b/cerise/cerise.h index 2a0060b..f4196bb 100644 --- a/cerise/cerise.h +++ b/cerise/cerise.h @@ -15,13 +15,53 @@ void* emalloc(size_t size); /* Token Types *****************************************************************************/ typedef enum { - T_NONE = 0, - T_STRING = 256, T_ID, T_INT, T_BOOL, T_CHAR, T_FLOAT, - T_REQUIRES, T_PROVIDES, T_LET, T_VAR, T_FUN, T_TYPE, T_STRUCT, - T_UNION, T_RETURN, T_IF, T_ELSE, - T_COUNT, - T_ERROR = -2, - T_END_FILE = -1 + NONE = 0, + IDENT = 256, + INT, + STRING, + BOOL, + EQ, + NEQ, + LTEQ, + GTEQ, + DOTDOT, + AND, + ARRAY, + BEGIN, + BY, + CASE, + CONST, + DIV, + DO, + ELSE, + ELSIF, + END, + FALSE, + FOR, + IF, + IMPORT, + IS, + MOD, + MODULE, + NIL, + NOT, + OF, + OR, + POINTER, + PROCEDURE, + RECORD, + REPEAT, + RETURN, + THEN, + TO, + TRUE, + TYPE, + UNTIL, + VAR, + WHILE, + COUNT, + ERROR = -2, + END_FILE = -1 } TokType; typedef struct { @@ -37,177 +77,30 @@ typedef struct { /* Datatype Types *****************************************************************************/ -typedef enum { - VOID, INT, UINT, FLOAT, ARRAY, REF, PTR, FUNC -} Kind; - -typedef struct Type { - Kind kind; - union { - struct Type* type; - size_t bits; - struct { - struct Type* type; - size_t count; - } array; - } value; -} Type; - -Type* VoidType(void); -Type* IntType(size_t nbits); -Type* UIntType(size_t nbits); -Type* FloatType(size_t nbits); -Type* ArrayOf(Type* type, size_t count); -Type* RefTo(Type* type); -Type* PtrTo(Type* type); -bool types_equal(Type* type1, Type* type2); - -/* Symbol Table - *****************************************************************************/ -typedef enum { - SF_TYPEDEF = (1 << 0), - SF_CONSTANT = (1 << 1), - SF_ARGUMENT = (1 << 2), -} SymFlags; - -typedef struct Sym { - struct Sym* next; - bool is_typedef; - int flags; - char* name; - Type* type; -} Sym; - -typedef struct { - Sym* syms; -} SymTable; - -void sym_add(SymTable* syms, int flags, char* name, Type* type); -Sym* sym_get(SymTable* syms, char* name); - -/* AST Types - *****************************************************************************/ -typedef enum { - AST_VAR, AST_FUNC, AST_EXPLIST, AST_IF, AST_APPLY, - AST_STRING, AST_SYMBOL, AST_CHAR, AST_INT, - AST_FLOAT, AST_BOOL, AST_IDENT, AST_OPER -} ASTType; - -typedef struct AST { - ASTType nodetype; - Type* datatype; - union { - struct AST* nodes[3]; - struct { - int oper; - struct AST* left; - struct AST* right; - } op; - /* Definition Node */ - struct { - char* name; - int flags; - struct AST* value; - } var; - /* Expression Block Node */ - struct { - size_t nexprs; - struct AST** exprs; - } explist; - /* String, Symbol, Identifier */ - char* text; - /* Integer */ - intptr_t integer; - /* Float */ - double floating; - } value; -} AST; - -/* String */ -AST* String(char* val); -char* string_value(AST* val); - -/* Character */ -AST* Char(int val); -uint32_t char_value(AST* val); - -/* Integer */ -AST* Integer(int val); -intptr_t integer_value(AST* val); - -/* Float */ -AST* Float(double val); -double float_value(AST* val); - -/* Bool */ -AST* Bool(bool val); -bool bool_value(AST* val); - -/* Ident */ -AST* Ident(char* val); -char* ident_value(AST* val); - -/* Definition */ -AST* Var(char* name, AST* value, AST* type, int flags); -char* var_name(AST* var); -AST* var_value(AST* var); -bool var_flagset(AST* var, int mask); - -AST* Func(AST* args, AST* body, AST* type); -AST* func_args(AST* func); -AST* func_body(AST* func); - -AST* ExpList(void); -AST** explist_get(AST* explist, size_t* nexprs); -void explist_append(AST* explist, AST* expr); -void explist_prepend(AST* explist, AST* expr); - -AST* If(AST* cond, AST* b1, AST* b2); -AST* if_cond(AST* ifexp); -AST* if_then(AST* ifexp); -AST* if_else(AST* ifexp); - -AST* Apply(AST* func, AST* args); -AST* apply_func(AST* apply); -AST* apply_args(AST* apply); - -AST* OpCall(int oper, AST* left, AST* right); - - -/* Package Definition - *****************************************************************************/ -typedef struct Require { - struct Require* next; - char* path; - char* alias; -} Require; - -typedef struct Provide { - struct Provide* next; - char* name; -} Provide; - -typedef struct Definition { - struct Provide* next; - AST* ast; -} Definition; - -typedef struct { - char* name; - SymTable* syms; - Require* requires; - Provide* provides; - Definition* definitions; -} Package; - -void pkg_add_require(Package* p, char* req); -void pkg_add_provide(Package* p, char* exp); -void pkg_add_definition(Package* p, AST* ast); - -/* Pretty Printing - *****************************************************************************/ -void pprint_token(FILE* file, Tok* token, bool print_loc); -void pprint_tree(FILE* file, AST* tree, int depth); +//typedef enum { +// VOID, INT, UINT, FLOAT, ARRAY, REF, PTR, FUNC +//} Kind; +// +//typedef struct Type { +// Kind kind; +// union { +// struct Type* type; +// size_t bits; +// struct { +// struct Type* type; +// size_t count; +// } array; +// } value; +//} Type; +// +//Type* VoidType(void); +//Type* IntType(size_t nbits); +//Type* UIntType(size_t nbits); +//Type* FloatType(size_t nbits); +//Type* ArrayOf(Type* type, size_t count); +//Type* RefTo(Type* type); +//Type* PtrTo(Type* type); +//bool types_equal(Type* type1, Type* type2); /* Lexer and Parser Types *****************************************************************************/ @@ -222,8 +115,6 @@ typedef struct { LexFile* done; LexFile* file; Tok tok; - SymTable syms; - Package pkg; } Parser; void lexfile(Parser* ctx, char* path); @@ -231,7 +122,6 @@ void lex(Parser* ctx); void lexprintpos(Parser* p, FILE* file, Tok* tok); void gettoken(Parser* ctx); void toplevel(Parser* p); -void codegen_init(Parser* p); /* Option Parsing *****************************************************************************/ diff --git a/cerise/lex.c b/cerise/lex.c index 0adffdf..291d415 100644 --- a/cerise/lex.c +++ b/cerise/lex.c @@ -14,7 +14,7 @@ static const char FirstChar[256] = { /* comment start */ ['#'] = 2, /* number or op */ - ['+'] = 3, ['-'] = 3, +// ['+'] = 3, ['-'] = 3, /* number digits */ ['0'] = 4, ['1'] = 4, ['2'] = 4, ['3'] = 4, ['4'] = 4, ['5'] = 4, ['6'] = 4, ['7'] = 4, ['8'] = 4, ['9'] = 4, @@ -32,12 +32,18 @@ static const char FirstChar[256] = { ['y'] = 5, ['z'] = 5, /* punctuation */ ['('] = 6, [')'] = 6, ['['] = 6, [']'] = 6, ['{'] = 6, ['}'] = 6, - ['.'] = 6, [','] = 6, [':'] = 6, ['&'] = 6, ['='] = 6, [';'] = 6, - ['*'] = 6, ['\''] = 6, + ['.'] = 6, [','] = 6, [':'] = 6, ['='] = 6, [';'] = 6, ['^'] = 6, + ['+'] = 6, ['-'] = 6, ['*'] = 6, ['/'] = 6, ['<'] = 6, ['>'] = 6, + ['|'] = 6, + ['!'] = 6, /* strings */ ['"'] = 7 }; +static const char HasSecondChar[256] = { + ['<'] = 1, ['>'] = 1, ['!'] = 1 +}; + char SPACE[256] = { [' '] = 1, ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, }; @@ -65,19 +71,40 @@ char ALNUM_[256] = { #define NUM_KEYWORDS (sizeof(Keywords) / sizeof(Keywords[0])) KeywordDef Keywords[] = { - { "else", T_ELSE }, - { "false", T_BOOL }, - { "fun", T_FUN }, - { "if", T_IF }, - { "let", T_LET }, - { "provide", T_PROVIDES }, - { "require", T_REQUIRES }, - { "return", T_RETURN }, - { "struct", T_STRUCT }, - { "true", T_BOOL }, - { "type", T_TYPE }, - { "union", T_UNION }, - { "var", T_VAR }, + { "and", AND }, + { "array", ARRAY }, + { "begin", BEGIN }, + { "by", BY }, + { "case", CASE }, + { "const", CONST }, + { "div", DIV }, + { "do", DO }, + { "else", ELSE }, + { "elsif", ELSIF }, + { "end", END }, + { "false", FALSE }, + { "for", FOR }, + { "if", IF }, + { "import", IMPORT }, + { "is", IS }, + { "mod", MOD }, + { "module", MODULE }, + { "nil", NIL }, + { "not", NOT }, + { "of", OF }, + { "or", OR }, + { "pointer", POINTER }, + { "procedure", PROCEDURE }, + { "record", RECORD }, + { "repeat", REPEAT }, + { "return", RETURN }, + { "then", THEN }, + { "to", TO }, + { "true", TRUE }, + { "type", TYPE }, + { "until", UNTIL }, + { "var", VAR }, + { "while", WHILE }, }; static int keywcmp(const void* a, const void* b) { @@ -99,7 +126,7 @@ static inline char* file_load(char* path) { static inline void convert_value(Tok* tok) { switch (tok->type) { - case T_STRING: { + case STRING: { size_t len = strlen(tok->text+1); char* strtext = malloc(len); strncpy(strtext, tok->text+1, len); @@ -108,24 +135,24 @@ static inline void convert_value(Tok* tok) { break; } - case T_INT: { + case INT: { tok->value.integer = strtol(tok->text, NULL, 0); break; } - case T_ID: { + case IDENT: { KeywordDef key = { .keyword = tok->text }; KeywordDef* match = bsearch( &key, Keywords, NUM_KEYWORDS, sizeof(KeywordDef), keywcmp); if (match) { tok->type = match->type; - if (tok->type != T_ID) + if (tok->type != IDENT) convert_value(tok); /* recurse to ensure correct conversion */ } break; } - case T_BOOL: { + case BOOL: { tok->value.integer = (tok->text[0] == 't'); break; } @@ -137,7 +164,8 @@ static inline void convert_value(Tok* tok) { static inline void readtok(Parser* ctx) { Tok* tok = &(ctx->tok); - char *beg = ctx->file->fpos, *curr = ctx->file->fpos; + char *beg = ctx->file->fpos; + char *curr = ctx->file->fpos; tok->offset = (beg - ctx->file->fbeg); switch (FirstChar[(int)*curr++]) { case 1: /* skip whitespace */ @@ -152,32 +180,56 @@ static inline void readtok(Parser* ctx) { tok->type = *(curr-1); if (!DIGIT[(int)*curr]) break; /* parse it as an int */ - tok->type = T_INT; + tok->type = INT; for (; DIGIT[(int)*curr]; curr++); break; case 4: - tok->type = T_INT; + tok->type = INT; for (; DIGIT[(int)*curr]; curr++); break; case 5: - tok->type = T_ID; + tok->type = IDENT; for (; ALNUM_[(int)*curr]; curr++); break; - case 6: /* single char tokens */ - tok->type = *(curr-1); + case 6: /* single/double char tokens */ + if (HasSecondChar[(int)*(curr-1)] && *(curr) == '=') + { + curr++; + switch (*(curr-2)) + { + case '!': tok->type = NEQ; break; + case '<': tok->type = LTEQ; break; + case '>': tok->type = GTEQ; break; + default: goto error; break; + } + } + else if (*(curr-1) == '.' || *(curr-1) == '=') + { + tok->type = *(curr-1); + if (*(curr) == tok->type) + { + curr++; + tok->type = (tok->type == '.' ? DOTDOT : EQ); + } + } + else + { + tok->type = *(curr-1); + } break; case 7: /* string parsing */ - tok->type = T_STRING; + tok->type = STRING; for (; *curr != '"'; curr++); curr++; break; case 0: /* error handling */ default: + error: fprintf(stderr, "Failed to parse token '%c'\n", *(curr-1)); exit(1); } @@ -203,11 +255,12 @@ void lexfile(Parser* ctx, char* path) { void lex(Parser* ctx) { ctx->tok.file = ctx->file->path; - ctx->tok.type = T_NONE; - while (ctx->tok.type == T_NONE) { + ctx->tok.type = NONE; + while (ctx->tok.type == NONE) { if (!ctx->file) { /* no more files left to process */ - ctx->tok.type = T_END_FILE; + ctx->tok.type = END_FILE; + ctx->tok.text = ""; return; } else if (!(ctx->file->fpos) || !*(ctx->file->fpos)) { /* grab the next file to process */ @@ -248,3 +301,93 @@ void lexprintpos(Parser* p, FILE* file, Tok* tok) { } fprintf(file, "%s:%zu:%zu:", tok->file, line, col); } + +#ifdef CERISE_TESTS +#include "atf.h" + +TEST_SUITE(Lexer) +{ + struct { + char* text; + int type; + } Tokens[] = { + { "+", '+' }, + { "-", '-' }, + { "/", '/' }, + { "*", '*' }, + { "^", '^' }, + { "=", '=' }, + { "==", EQ }, + { "!=", NEQ }, + { "<", '<' }, + { ">", '>' }, + { "<=", LTEQ }, + { ">=", GTEQ }, + { ".", '.' }, + { ",", ',' }, + { ";", ';' }, + { "..", DOTDOT }, + { "|", '|' }, + { ":", ':' }, + { "(", '(' }, + { ")", ')' }, + { "[", '[' }, + { "]", ']' }, + { "{", '{' }, + { "}", '}' }, + { "and", AND }, + { "array", ARRAY }, + { "begin", BEGIN }, + { "by", BY }, + { "case", CASE }, + { "const", CONST }, + { "div", DIV }, + { "do", DO }, + { "else", ELSE }, + { "elsif", ELSIF }, + { "end", END }, + { "false", FALSE }, + { "for", FOR }, + { "if", IF }, + { "import", IMPORT }, + { "is", IS }, + { "mod", MOD }, + { "module", MODULE }, + { "nil", NIL }, + { "not", NOT }, + { "of", OF }, + { "or", OR }, + { "pointer", POINTER }, + { "procedure", PROCEDURE }, + { "record", RECORD }, + { "repeat", REPEAT }, + { "return", RETURN }, + { "then", THEN }, + { "to", TO }, + { "true", TRUE }, + { "type", TYPE }, + { "until", UNTIL }, + { "var", VAR }, + { "while", WHILE }, + { "", END_FILE }, + }; + + + TEST(Lexer recognizes all required tokens) + { + Parser ctx = {0}; + lexfile(&ctx, "tests/tokens.txt"); + for (size_t i = 0; i < sizeof(Tokens)/sizeof(Tokens[0]); i++) + { + lex(&ctx); + //printf("(%d, '%s') != (%d, '%s')\n", + // ctx.tok.type, ctx.tok.text, Tokens[i].type, Tokens[i].text); + CHECK(ctx.tok.type == Tokens[i].type); + CHECK(ctx.tok.text != NULL); + CHECK(!strcmp(ctx.tok.text, Tokens[i].text)); + } + } +} + +#endif + diff --git a/cerise/main.c b/cerise/main.c index 8e85037..34766a5 100644 --- a/cerise/main.c +++ b/cerise/main.c @@ -1,23 +1,28 @@ #include "cerise.h" +#ifndef CERISE_TESTS + char* ARGV0; char* Artifact = "bin"; /* Driver Modes *****************************************************************************/ -static int emit_binary(Parser* ctx, int argc, char **argv) { +static int emit_binary(Parser* ctx, int argc, char **argv) +{ (void)ctx, (void)argc, (void)argv; return 0; } -static int emit_library(Parser* ctx, int argc, char **argv) { +static int emit_library(Parser* ctx, int argc, char **argv) +{ (void)ctx, (void)argc, (void)argv; return 0; } /* Main Routine and Usage *****************************************************************************/ -void usage(void) { +void usage(void) +{ fprintf(stderr, "%s\n", "Usage: sclpl [options...] [-A artifact] [file...]\n" "\n-A Emit the given type of artifact" @@ -26,7 +31,8 @@ void usage(void) { exit(1); } -int main(int argc, char **argv) { +int main(int argc, char **argv) +{ /* Option parsing */ OPTBEGIN { case 'A': Artifact = EOPTARG(usage()); break; @@ -37,13 +43,31 @@ int main(int argc, char **argv) { for (; argc; argc--,argv++) lexfile(&ctx, *argv); /* Execute the main compiler process */ - if (0 == strcmp("bin", Artifact)) { + if (0 == strcmp("bin", Artifact)) + { return emit_binary(&ctx, argc, argv); - } else if (0 == strcmp("lib", Artifact)) { + } + else if (0 == strcmp("lib", Artifact)) + { return emit_library(&ctx, argc, argv); - } else { + } + else + { fprintf(stderr, "Unknown artifact type: '%s'\n\n", Artifact); usage(); } return 1; } + +#else + +#define INCLUDE_DEFS +#include "atf.h" +int main(int argc, char **argv) +{ + atf_init(argc, argv); + RUN_EXTERN_TEST_SUITE(Lexer); + return atf_print_results(); +} + +#endif diff --git a/cerise/tests/tokens.txt b/cerise/tests/tokens.txt index b9068f3..f4301f3 100644 --- a/cerise/tests/tokens.txt +++ b/cerise/tests/tokens.txt @@ -4,7 +4,7 @@ * ^ = -:= +== != < >