From: Michael D. Lowis Date: Sun, 24 Jan 2016 22:06:53 +0000 (-0500) Subject: Converted handwritten lexer to flex based lexer X-Git-Url: https://git.mdlowis.com/?a=commitdiff_plain;h=fc9085d917c0a0dacdc92b13559463c30a0ba5ad;p=proto%2Fsclpl.git Converted handwritten lexer to flex based lexer --- diff --git a/.gitignore b/.gitignore index 0b742e6..4bf735a 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ cscope.out .DS_Store .rsconscache sclpl +source/lexer.c diff --git a/Makefile b/Makefile index 3231085..defd536 100644 --- a/Makefile +++ b/Makefile @@ -30,8 +30,8 @@ OBJS = source/main.o \ source/gc.o \ source/vec.o \ source/pprint.o \ - source/lexer.o \ source/parser.o \ + source/lexer.o \ source/ast.o \ source/anf.o \ source/codegen.o @@ -59,17 +59,21 @@ ${BIN}: lib${BIN}.a @echo LD $@ @${LD} ${LDFLAGS} -o $@ $^ -${TESTBIN}: ${TESTOBJS} - @echo LD $@ - @${LD} ${LDFLAGS} -o $@ $^ - -tests: $(TESTBIN) - @./$< +#${TESTBIN}: ${TESTOBJS} +# @echo LD $@ +# @${LD} ${LDFLAGS} -o $@ $^ +# +#tests: $(TESTBIN) +# @./$< specs: $(BIN) @echo TEST $< @rspec --pattern 'spec/**{,/*/**}/*_spec.rb' +.l.c: + @echo LEX $< + @${LEX} -o $@ $< + .c.o: @echo CC $< @${CC} ${CFLAGS} -c -o $@ $< @@ -77,7 +81,7 @@ specs: $(BIN) clean: @rm -f ${BIN} lib${BIN}.a @rm -f ${TESTBIN} ${TESTOBJS} ${TESTOBJS:.o=.gcda} ${TESTOBJS:.o=.gcno} - @rm -f ${OBJS} ${OBJS:.o=.gcda} ${OBJS:.o=.gcno} + @rm -f ${OBJS} ${OBJS:.o=.gcda} ${OBJS:.o=.gcno} source/lexer.c .PHONY: all options tests specs diff --git a/source/lexer.c b/source/lexer.c deleted file mode 100644 index 61ddef2..0000000 --- a/source/lexer.c +++ /dev/null @@ -1,435 +0,0 @@ -/** - @file lexer.c - @brief See header for details - $Revision$ - $HeadURL$ - */ -#include - -/* Private Declarations - *****************************************************************************/ -// Token Scanning -static char* scan(Parser* ctx, size_t* line, size_t* column); -static void skip_ws(Parser* ctx); -static char* read_string(Parser* ctx); -static bool 
eof(Parser* ctx); -static bool oneof(Parser* ctx, const char* set); -static char current(Parser* ctx); -static bool eol(Parser* ctx); -static char* dup(Parser* ctx, size_t start_idx, size_t len); - -// Lexical Analysis -static Tok* classify(const char* file, size_t line, size_t col, char* text); -static bool char_oneof(const char* class, char c); -static Tok* punctuation(char* text); -static Tok* character(char* text); -static Tok* integer(char* text, int base); -static int getradix(char ch); -static Tok* radixint(char* text); -static bool is_float(char* text); -static Tok* floating(char* text); -static Tok* number(char* text); -static Tok* boolean(char* text); - -// Token Constructors -static Tok* Token(TokType type); -static Tok* TextTok(TokType type, char* text); -static Tok* CharTok(uint32_t val); -static Tok* IntTok(intptr_t val); -static Tok* FloatTok(double val); -static Tok* BoolTok(bool val); -static void token_free(void* obj); - -// Utility Functions -static char* dupstring(const char* old); - -/* Public API - *****************************************************************************/ -Tok* gettoken(Parser* ctx) -{ - Tok* tok = NULL; - size_t line, col; - char* text = scan(ctx, &line, &col); - if (text != NULL) { - tok = classify(NULL, line, col, text); - free(text); - } - return tok; -} - -void fetchline(Parser* ctx) -{ - int c; - size_t capacity = 8; - size_t index = 0; - /* Reallocate and clear the line buffer */ - ctx->line = realloc(ctx->line, capacity); - ctx->line[0] = '\0'; - ctx->index = 0; - - /* If we have not yet reached the end of the file, read the next line */ - if (!eof(ctx)) { - if (NULL != ctx->prompt) - printf("%s", ctx->prompt); - while(('\n' != (c = fgetc(ctx->input))) && (EOF != c)) { - if (index+2 == capacity) { - capacity <<= 1u; - ctx->line = realloc(ctx->line, capacity); - } - ctx->line[index++] = c; - } - ctx->line[index++] = (c == EOF) ? 
'\0' : c; - ctx->line[index++] = '\0'; - ctx->index = 0; - /* Increment line count */ - ctx->lineno++; - } -} - -/* Token Scanning - *****************************************************************************/ -static char* scan(Parser* ctx, size_t* line, size_t* column) -{ - char* tok = NULL; - skip_ws(ctx); - *line = ctx->lineno; - *column = ctx->index+1; - if (!eof(ctx)) { - if (oneof(ctx, "()[]{};,'")) { - tok = dup(ctx, ctx->index, 1); - ctx->index++; - } else if (current(ctx) == '"') { - tok = read_string(ctx); - } else { - size_t start = ctx->index; - while(!oneof(ctx," \t\r\n()[]{};,'\"") && (current(ctx) != '\0')) { - ctx->index++; - } - tok = dup(ctx, start, ctx->index - start); - } - } - return tok; -} - -static void skip_ws(Parser* ctx) -{ - /* If we haven't read a line yet, read one now */ - if (NULL == ctx->line) - fetchline(ctx); - /* Fast forward past whitespace and read a newline if necessary */ - while(!eof(ctx)) { - if ('\0' == current(ctx)) { - fetchline(ctx); - } else if (oneof(ctx, " \t\r\n")) { - ctx->index++; - } else { - break; - } - } -} - -static char* read_string(Parser* ctx) -{ - size_t capacity = 8; - size_t index = 0; - char* tok = (char*)malloc(capacity); - - /* Skip the first " */ - tok[index++] = current(ctx); - tok[index] = '\0'; - ctx->index++; - - /* Read the contents of the string */ - while ('"' != current(ctx)) { - /* Resize the buffer if necessary */ - if ((index+2) >= capacity) { - capacity = capacity << 1; - tok = (char*)realloc(tok, capacity); - } - - /* EOF results in an assertion (don't do) */ - if (eof(ctx)) - assert(false); - - /* Read the char */ - tok[index++] = current(ctx); - tok[index] = '\0'; - ctx->index++; - - /* Get the next line if necessary */ - if ('\n' == tok[index-1]) - fetchline(ctx); - } - - /* Skip the last " */ - tok[index++] = current(ctx); - tok[index] = '\0'; - ctx->index++; - - return tok; -} - -static bool eof(Parser* ctx) -{ - return (eol(ctx) && feof(ctx->input)); -} - -static bool 
oneof(Parser* ctx, const char* set) -{ - bool ret = false; - size_t sz = strlen(set); - for (size_t idx = 0; idx < sz; idx++) { - if (current(ctx) == set[idx]) { - ret = true; - break; - } - } - return ret; -} - -static char current(Parser* ctx) -{ - return ctx->line[ctx->index]; -} - -static bool eol(Parser* ctx) -{ - bool ret = true; - size_t index = ctx->index; - char ch; - while((NULL != ctx->line) && ('\0' != (ch = ctx->line[index]))) { - if((' '!=ch) && ('\t'!=ch) && ('\r'!=ch) && ('\n'!=ch)) { - ret = false; - break; - } - index++; - } - return ret; -} - -static char* dup(Parser* ctx, size_t start_idx, size_t len) -{ - char* str = (char*)malloc(len+1); - memcpy(str, &(ctx->line[start_idx]), len); - str[len] = '\0'; - return str; -} - - -/* Lexical Analysis - *****************************************************************************/ -static Tok* classify(const char* file, size_t line, size_t col, char* text) -{ - Tok* tok = NULL; - (void)file; - if (0 == strcmp(text,"end")) { - tok = Token(T_END); - } else if (char_oneof("()[]{};,'", text[0])) { - tok = punctuation(text); - } else if ('"' == text[0]) { - text[strlen(text)-1] = '\0'; - tok = TextTok(T_STRING, &text[1]); - } else if (text[0] == '\\') { - tok = character(text); - } else if ((text[0] == '0') && char_oneof("bodh",text[1])) { - tok = radixint(text); - } else if (char_oneof("+-0123456789",text[0])) { - tok = number(text); - } else if ((0 == strcmp(text,"true")) || (0 == strcmp(text,"false"))) { - tok = boolean(text); - } else { - tok = TextTok(T_ID, text); - } - /* If we found a valid token then fill in the location details */ - if (NULL != tok) { - tok->line = line; - tok->col = col; - } - return tok; -} - -static bool char_oneof(const char* class, char c) -{ - bool ret = false; - size_t sz = strlen(class); - for (size_t idx = 0; idx < sz; idx++) { - if (c == class[idx]) { - ret = true; - break; - } - } - return ret; -} - -static Tok* punctuation(char* text) -{ - Tok* tok = NULL; - switch 
(text[0]) { - case '(': tok = Token(T_LPAR); break; - case ')': tok = Token(T_RPAR); break; - case '{': tok = Token(T_LBRACE); break; - case '}': tok = Token(T_RBRACE); break; - case '[': tok = Token(T_LBRACK); break; - case ']': tok = Token(T_RBRACK); break; - case ';': tok = Token(T_END); break; - case ',': tok = Token(T_COMMA); break; - case '\'': tok = Token(T_SQUOTE); break; - } - return tok; -} - -static Tok* character(char* text) -{ - Tok* tok = NULL; - static const char* lookuptable[5] = { - " \0space", - "\n\0newline", - "\r\0return", - "\t\0tab", - "\v\0vtab" - }; - if (strlen(text) == 2) { - tok = CharTok((uint32_t)text[1]); - } else { - for(int i = 0; i < 5; i++) { - if (0 == strcmp(&text[1], &(lookuptable[i][2]))) { - tok = CharTok((uint32_t)lookuptable[i][0]); - break; - } - } - if (NULL == tok) - tok = TextTok(T_ID, text); - } - return tok; -} - -static Tok* integer(char* text, int base) -{ - char* end; - long integer; - errno = 0; - integer = strtol(text, &end, base); - assert(errno == 0); - return (end[0] == '\0') ? IntTok(integer) : NULL; -} - -static int getradix(char ch) -{ - int ret = -1; - switch(ch) { - case 'b': ret = 2; break; - case 'o': ret = 8; break; - case 'd': ret = 10; break; - case 'h': ret = 16; break; - } - return ret; -} - -static Tok* radixint(char* text) -{ - Tok* ret = integer(&text[2], getradix(text[1])); - if (NULL == ret) - ret = TextTok(T_ID, text); - return ret; -} - -static bool is_float(char* text) -{ - while (text[0] != '\0') - if (text[0] == '.') - return true; - else - text++; - return false; -} - -static Tok* floating(char* text) -{ - char* end; - double dbl; - errno = 0; - dbl = strtod(text, &end); - assert(errno == 0); - return (end[0] == '\0') ? FloatTok(dbl) : NULL; -} - -static Tok* number(char* text) -{ - Tok* tok = NULL; - if (is_float(text)) - tok = floating(text); - else - tok = integer(text, 10); - return (NULL == tok) ? 
TextTok(T_ID, text) : tok; -} - -static Tok* boolean(char* text) -{ - return BoolTok(0 == strcmp(text,"true")); -} - -/* Token Constructors - *****************************************************************************/ -static Tok* Token(TokType type) -{ - Tok* tok = (Tok*)gc_alloc(sizeof(Tok), &token_free); - tok->type = type; - return tok; -} - -static Tok* TextTok(TokType type, char* text) -{ - Tok* tok = Token(type); - tok->value.text = (char*)gc_addref(dupstring(text)); - return tok; -} - -static Tok* CharTok(uint32_t val) -{ - Tok* tok = Token(T_CHAR); - tok->value.character = val; - return tok; -} - -static Tok* IntTok(intptr_t val) -{ - Tok* tok = Token(T_INT); - tok->value.integer = val; - return tok; -} - -static Tok* FloatTok(double val) -{ - Tok* tok = Token(T_FLOAT); - tok->value.floating = val; - return tok; -} - -static Tok* BoolTok(bool val) -{ - Tok* tok = Token(T_BOOL); - tok->value.boolean = val; - return tok; -} - -static void token_free(void* obj) -{ - Tok* tok = (Tok*)obj; - if ((tok->type != T_BOOL) && - (tok->type != T_CHAR) && - (tok->type != T_INT) && - (tok->type != T_FLOAT) && - (NULL != tok->value.text)) - gc_delref(tok->value.text); -} - -/* Utility Functions - *****************************************************************************/ -static char* dupstring(const char* old) { - size_t length = strlen(old); - char* str = (char*)gc_alloc(length+1, NULL); - memcpy(str, old, length); - str[length] = '\0'; - return str; -} - diff --git a/source/lexer.l b/source/lexer.l new file mode 100644 index 0000000..ae0659d --- /dev/null +++ b/source/lexer.l @@ -0,0 +1,144 @@ +%{ +#include + +static union { + char* text; + uint32_t character; + intptr_t integer; + double floating; + bool boolean; +} Value; + +static char* dupstring(const char* old) { + size_t length = strlen(old); + char* str = (char*)gc_alloc(length+1, NULL); + memcpy(str, old, length); + str[length] = '\0'; + return str; +} + +%} + +DIGIT [0-9] +ALPHA [a-zA_Z] +ALPHA_ 
[a-zA_Z_] +ALNUM_ [a-zA-Z0-9_] +SPACE [ \t\r\n] +NOSPACE [^ \t\r\n] + +%option noyywrap + +%% + +<<EOF>> { return T_END_FILE; } + +"end" { return T_END; } +"(" { return T_LPAR; } +")" { return T_RPAR; } +"[" { return T_LBRACK; } +"]" { return T_RBRACK; } +"{" { return T_LBRACE; } +"}" { return T_RBRACE; } +";" { return T_END; } +"," { return T_COMMA; } +"'" { return T_SQUOTE; } + +\\. { Value.character = yytext[1]; return T_CHAR; } +\\space { Value.character = ' '; return T_CHAR; } +\\newline { Value.character = '\n'; return T_CHAR; } +\\return { Value.character = '\r'; return T_CHAR; } +\\tab { Value.character = '\t'; return T_CHAR; } +\\vtab { Value.character = '\v'; return T_CHAR; } +\\[a-z]+ { Value.text = dupstring(yytext); return T_ID; } + +0b[01]+ { + Value.integer = strtol(&yytext[2], NULL, 2); + return T_INT; +} + +0o[0-7]+ { + Value.integer = strtol(&yytext[2], NULL, 8); + return T_INT; +} + +0d[0-9]+ { + Value.integer = strtol(&yytext[2], NULL, 10); + return T_INT; +} + +0h[0-9a-fA-F]+ { + Value.integer = strtol(&yytext[2], NULL, 16); + return T_INT; +} + +0[b0dh][0-9a-fA-F]+ { + Value.text = dupstring(yytext); + return T_ID; +} + +[+-]?[0-9]+ { + Value.integer = strtol(&yytext[0], NULL, 10); + return T_INT; +} + +[+-]?[0-9]+\.[0-9]+(e[+-]?[0-9]+)? 
{ + Value.floating = strtod(yytext, NULL); + return T_FLOAT; +} + +\"([^"]|\\\")*\" { + size_t len = strlen(&yytext[1]); + Value.text = dupstring(&yytext[1]); + Value.text[len-1] = '\0'; + return T_STRING; +} + +true { + Value.boolean = true; + return T_BOOL; +} + +false { + Value.boolean = false; + return T_BOOL; +} + +[a-zA-Z_][a-zA-Z0-9_]* { + Value.text = dupstring(yytext); + return T_ID; +} + +[^ \r\t\n\[\]\{\}\(\)'\",;]+ { + Value.text = dupstring(yytext); + return T_ID; +} + +%% + +static void token_free(void* obj) +{ + Tok* tok = (Tok*)obj; + if ((tok->type != T_BOOL) && + (tok->type != T_CHAR) && + (tok->type != T_INT) && + (tok->type != T_FLOAT) && + (NULL != tok->value.text)) + gc_delref(tok->value.text); +} + +Tok* gettoken(Parser* ctx) +{ + Tok* tok = NULL; + int type = yylex(); + if (type != T_END_FILE) { + tok = (Tok*)gc_alloc(sizeof(Tok), &token_free); + tok->type = type; + memcpy(&(tok->value), &Value, sizeof(Value)); + } + return tok; +} + +void fetchline(Parser* ctx) +{ +} + diff --git a/source/opt.h b/source/opt.h index 126fcbe..a859258 100644 --- a/source/opt.h +++ b/source/opt.h @@ -35,7 +35,7 @@ extern char* ARGV0; /* This is a helper function used by the macros in this file to parse the next * option from the command line. */ -static inline char* getopt(int* p_argc, char*** p_argv) { +static inline char* __getopt(int* p_argc, char*** p_argv) { if (!(*p_argv)[0][1] && !(*p_argv)[1]) { return (char*)0; } else if ((*p_argv)[0][1]) { @@ -73,13 +73,13 @@ static inline char* getopt(int* p_argc, char*** p_argv) { /* Get an argument from the command line and return it as a string. If no * argument is available, this macro returns NULL */ #define OPTARG() \ - (optarg_ = getopt(&argc,&argv), brk_ = (optarg_!=0), optarg_) + (optarg_ = __getopt(&argc,&argv), brk_ = (optarg_!=0), optarg_) /* Get an argument from the command line and return it as a string. If no * argument is available, this macro executes the provided code. 
If that code * returns, then abort is called. */ #define EOPTARG(code) \ - (optarg_ = getopt(&argc,&argv), \ + (optarg_ = __getopt(&argc,&argv), \ (!optarg_ ? ((code), abort(), (char*)0) : (brk_ = 1, optarg_))) /* Helper macro to recognize number options */ diff --git a/spec/anf_spec.rb b/spec/anf_spec.rb index 1a7cda2..57e9cc3 100644 --- a/spec/anf_spec.rb +++ b/spec/anf_spec.rb @@ -187,8 +187,8 @@ describe "sclpl a-normal form" do expect(anf('fn() if 1 2 else 3;;')).to eq([ ["fn", [], ["if", "T_INT:1", - ["let", ["$:0", "T_INT:2"], "$:0"], - ["let", ["$:1", "T_INT:3"], "$:1"]]] + ["let", ["$:1", "T_INT:2"], "$:1"], + ["let", ["$:2", "T_INT:3"], "$:2"]]] ]) end diff --git a/spec/parser_spec.rb b/spec/parser_spec.rb index d0d0065..ffdcac4 100644 --- a/spec/parser_spec.rb +++ b/spec/parser_spec.rb @@ -191,23 +191,23 @@ describe "sclpl grammar" do it "should normalize a literal with an if expression" do expect(ast('fn() if 1 2 else 3;;')).to eq([ ["fn", [], - ["let", ["$:2", ["if", "T_INT:1", - ["let", ["$:0", "T_INT:2"], "$:0"], - ["let", ["$:1", "T_INT:3"], "$:1"]]], - "$:2"]] + ["let", ["$:0", ["if", "T_INT:1", + ["let", ["$:1", "T_INT:2"], "$:1"], + ["let", ["$:2", "T_INT:3"], "$:2"]]], + "$:0"]] ]) end it "should normalize a literal with two sequential if expressions" do expect(ast('fn() if 1 2 else 3; if 1 2 else 3; ;')).to eq([ ["fn", [], - ["let", ["$:2", ["if", "T_INT:1", - ["let", ["$:0", "T_INT:2"], "$:0"], - ["let", ["$:1", "T_INT:3"], "$:1"]]], - ["let", ["$:5", ["if", "T_INT:1", - ["let", ["$:3", "T_INT:2"], "$:3"], - ["let", ["$:4", "T_INT:3"], "$:4"]]], - "$:5"]]] + ["let", ["$:0", ["if", "T_INT:1", + ["let", ["$:1", "T_INT:2"], "$:1"], + ["let", ["$:2", "T_INT:3"], "$:2"]]], + ["let", ["$:3", ["if", "T_INT:1", + ["let", ["$:4", "T_INT:2"], "$:4"], + ["let", ["$:5", "T_INT:3"], "$:5"]]], + "$:3"]]] ]) end end