From: Michael D. Lowis Date: Tue, 18 Dec 2018 18:59:12 +0000 (-0500) Subject: minor refactoring and renaming of regex compilation process X-Git-Url: https://git.mdlowis.com/?a=commitdiff_plain;h=6dcad9ddc640658fa84a71a35f1b7daa4c38f146;p=proto%2Flibregexp.git minor refactoring and renaming of regex compilation process --- diff --git a/.gitignore b/.gitignore index 6e92f57..25b12e2 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ tags +*.o +*.a diff --git a/regcomp.c b/regcomp.c index 4adc7ef..cce3a12 100644 --- a/regcomp.c +++ b/regcomp.c @@ -21,14 +21,14 @@ static Reprog *reprog; static char* SrcExpr; /* pointer to next character in source expression */ -#define NSTACK 20 -static Node andstack[NSTACK]; /* Stack of operands */ -static Node* andp; /* Pointer to the top of the operand stack */ -static int atorstack[NSTACK]; /* Stack of operators */ -static int* atorp; /* Pointer to the top of the operator stack */ +#define NSTACK 20 +static Node OperandStack[NSTACK]; /* Stack of operands */ +static Node* OperandPtr; /* Pointer to the top of the operand stack */ +static int OperatorStack[NSTACK]; /* Stack of operators */ +static int* OperatorPtr; /* Pointer to the top of the operator stack */ static int cursubid; /* id of current subexpression */ -static int subidstack[NSTACK]; /* parallel to atorstack */ +static int subidstack[NSTACK]; /* parallel to OperatorStack */ static int* subidp; static bool lastwasand; /* Last token was operand */ static int nparens; @@ -41,17 +41,29 @@ static int yyrune; /* last lex'd rune */ static Reclass* yyclassp; /* last lex'd class */ /* predeclared crap */ -static void operator(int); -static void pushand(Reinst*, Reinst*); -static void pushator(int); -static void evaluntil(int); -static int bldcclass(void); +static void PushOperand(Reinst*, Reinst*); +static void PushOperator(int); +static Node* PopOperand(int op); +static int PopOperator(void); +static int BuildClass(void); + +static void operator(int); +static void evaluntil(int); static jmp_buf regkaboom; /******************************************************************************/ -static int nextc(int* p_rune) { +static Reinst* newinst(int t) { + freep->type = t; + freep->l.left = 0; + freep->r.right = 0; + return freep++; +} + +/******************************************************************************/ + +static int NextRune(int* p_rune) { if (lexdone) { *p_rune = 0; return 1; @@ -65,8 +77,8 @@ static int nextc(int* p_rune) { return 0; } -static int lex(int literal, int dot_type) { - int quoted = nextc(&yyrune); +static int GetToken(int literal, int dot_type) { + int quoted = NextRune(&yyrune); if (literal || quoted) { if (yyrune == 0) return END; @@ -84,7 +96,7 @@ static int lex(int literal, int dot_type) { case ')': return RPAREN; case '^': return BOL; case '$': return EOL; - case '[': return bldcclass(); + case '[': return BuildClass(); } return RUNE; } @@ -116,13 +128,135 @@ static void cant(char *s) { /******************************************************************************/ -static Reinst* newinst(int t) { - freep->type = t; - freep->l.left = 0; - freep->r.right = 0; - return freep++; +static void PushOperand(Reinst *f, Reinst *l) { + if(OperandPtr >= &OperandStack[NSTACK]) + cant("operand stack overflow"); + OperandPtr->first = f; + OperandPtr->last = l; + OperandPtr++; +} + +static void PushOperator(int t) { + if(OperatorPtr >= &OperatorStack[NSTACK]) + cant("operator stack overflow"); + *OperatorPtr++ = t; + *subidp++ = cursubid; +} + +static Node* PopOperand(int op) { + Reinst *inst; + + if(OperandPtr <= &OperandStack[0]){ + regerr2("missing operand for ", op); + inst = newinst(NOP); + PushOperand(inst,inst); + } + return --OperandPtr; +} + +static int PopOperator(void) { + if(OperatorPtr <= &OperatorStack[0]) + cant("operator stack underflow"); + --subidp; + return *--OperatorPtr; +} + +/******************************************************************************/ + +static int BuildClass(void) { + int type; + int r[NCCRUNE]; + int *p, *ep, *np; + int rune; + int quoted; + + /* we have already seen the '[' */ + if (nclass >= nelem(reprog->class)) + rcerror("too many character classes; increase Reprog.class size"); + type = CCLASS; + yyclassp = &(classp[nclass++]); + + /* look ahead for negation */ + /* SPECIAL CASE!!! negated classes don't match \n */ + ep = r; + quoted = NextRune(&rune); + if(!quoted && rune == '^'){ + type = NCCLASS; + quoted = NextRune(&rune); + *ep++ = '\n'; + *ep++ = '\n'; + } + + /* parse class into a set of spans */ + while(ep < &r[NCCRUNE-1]){ + if(rune == 0){ + rcerror("malformed '[]'"); + return 0; + } + if(!quoted && rune == ']') + break; + if(!quoted && rune == '-'){ + if(ep == r){ + rcerror("malformed '[]'"); + return 0; + } + quoted = NextRune(&rune); + if((!quoted && rune == ']') || rune == 0){ + rcerror("malformed '[]'"); + return 0; + } + *(ep-1) = rune; + } else { + *ep++ = rune; + *ep++ = rune; + } + quoted = NextRune(&rune); + } + if(ep >= &r[NCCRUNE-1]) { + rcerror("char class too large; increase Reclass.spans size"); + return 0; + } + + /* sort on span start */ + for(p = r; p < ep; p += 2){ + for(np = p; np < ep; np += 2) + if(*np < *p){ + rune = np[0]; + np[0] = p[0]; + p[0] = rune; + rune = np[1]; + np[1] = p[1]; + p[1] = rune; + } + } + + /* merge spans */ + np = yyclassp->spans; + p = r; + if(r == ep) + yyclassp->end = np; + else { + np[0] = *p++; + np[1] = *p++; + for(; p < ep; p += 2) + /* overlapping or adjacent ranges? */ + if(p[0] <= np[1] + 1){ + if(p[1] >= np[1]) + np[1] = p[1]; /* coalesce */ + } else { + np += 2; + np[0] = p[0]; + np[1] = p[1]; + } + yyclassp->end = np+2; + } + + return type; } + +/******************************************************************************/ + static void operand(int t) { Reinst *i; @@ -135,7 +269,7 @@ static void operand(int t) { if(t == RUNE) i->r.r = yyrune; - pushand(i, i); + PushOperand(i, i); lastwasand = true; } @@ -151,103 +285,70 @@ static void operator(int t) { }else evaluntil(t); if(t != RPAREN) - pushator(t); + PushOperator(t); lastwasand = false; if(t==STAR || t==QUEST || t==PLUS || t==RPAREN) lastwasand = true; /* these look like operands */ } -static void pushand(Reinst *f, Reinst *l) { - if(andp >= &andstack[NSTACK]) - cant("operand stack overflow"); - andp->first = f; - andp->last = l; - andp++; -} - -static void pushator(int t) { - if(atorp >= &atorstack[NSTACK]) - cant("operator stack overflow"); - *atorp++ = t; - *subidp++ = cursubid; -} - -static Node* popand(int op) { - Reinst *inst; - - if(andp <= &andstack[0]){ - regerr2("missing operand for ", op); - inst = newinst(NOP); - pushand(inst,inst); - } - return --andp; -} - -static int popator(void) { - if(atorp <= &atorstack[0]) - cant("operator stack underflow"); - --subidp; - return *--atorp; -} - static void evaluntil(int pri) { Node *op1, *op2; Reinst *inst1, *inst2; - while(pri==RPAREN || atorp[-1]>=pri){ - switch(popator()){ + while(pri==RPAREN || OperatorPtr[-1]>=pri){ + switch(PopOperator()){ default: rcerror("unknown operator in evaluntil"); break; case LPAREN: /* must have been RPAREN */ - op1 = popand('('); + op1 = PopOperand('('); inst2 = newinst(RPAREN); inst2->r.subid = *subidp; op1->last->l.next = inst2; inst1 = newinst(LPAREN); inst1->r.subid = *subidp; inst1->l.next = op1->first; - pushand(inst1, inst2); + PushOperand(inst1, inst2); return; case OR: - op2 = popand('|'); - op1 = popand('|'); + op2 = PopOperand('|'); + op1 = PopOperand('|'); inst2 = newinst(NOP); op2->last->l.next = inst2; op1->last->l.next = inst2; inst1 = newinst(OR); inst1->r.right = op1->first; inst1->l.left = op2->first; - pushand(inst1, inst2); + PushOperand(inst1, inst2); break; case CAT: - op2 = popand(0); - op1 = popand(0); + op2 = PopOperand(0); + op1 = PopOperand(0); op1->last->l.next = op2->first; - pushand(op1->first, op2->last); + PushOperand(op1->first, op2->last); break; case STAR: - op2 = popand('*'); + op2 = PopOperand('*'); inst1 = newinst(OR); op2->last->l.next = inst1; inst1->r.right = op2->first; - pushand(inst1, inst1); + PushOperand(inst1, inst1); break; case PLUS: - op2 = popand('+'); + op2 = PopOperand('+'); inst1 = newinst(OR); op2->last->l.next = inst1; inst1->r.right = op2->first; - pushand(op2->first, inst1); + PushOperand(op2->first, inst1); break; case QUEST: - op2 = popand('?'); + op2 = PopOperand('?'); inst1 = newinst(OR); inst2 = newinst(NOP); inst1->l.left = inst2; inst1->r.right = op2->first; op2->last->l.next = inst2; - pushand(inst1, inst2); + PushOperand(inst1, inst2); break; } } @@ -302,101 +403,6 @@ static Reprog* optimize(Reprog *pp) { return npp; } -static Reclass* newclass(void) { - if(nclass >= nelem(reprog->class)) - rcerror("too many character classes; increase Reprog.class size"); - return &(classp[nclass++]); -} - -static int bldcclass(void) { - int type; - int r[NCCRUNE]; - int *p, *ep, *np; - int rune; - int quoted; - - /* we have already seen the '[' */ - type = CCLASS; - yyclassp = newclass(); - - /* look ahead for negation */ - /* SPECIAL CASE!!! negated classes don't match \n */ - ep = r; - quoted = nextc(&rune); - if(!quoted && rune == '^'){ - type = NCCLASS; - quoted = nextc(&rune); - *ep++ = '\n'; - *ep++ = '\n'; - } - - /* parse class into a set of spans */ - while(ep < &r[NCCRUNE-1]){ - if(rune == 0){ - rcerror("malformed '[]'"); - return 0; - } - if(!quoted && rune == ']') - break; - if(!quoted && rune == '-'){ - if(ep == r){ - rcerror("malformed '[]'"); - return 0; - } - quoted = nextc(&rune); - if((!quoted && rune == ']') || rune == 0){ - rcerror("malformed '[]'"); - return 0; - } - *(ep-1) = rune; - } else { - *ep++ = rune; - *ep++ = rune; - } - quoted = nextc(&rune); - } - if(ep >= &r[NCCRUNE-1]) { - rcerror("char class too large; increase Reclass.spans size"); - return 0; - } - - /* sort on span start */ - for(p = r; p < ep; p += 2){ - for(np = p; np < ep; np += 2) - if(*np < *p){ - rune = np[0]; - np[0] = p[0]; - p[0] = rune; - rune = np[1]; - np[1] = p[1]; - p[1] = rune; - } - } - - /* merge spans */ - np = yyclassp->spans; - p = r; - if(r == ep) - yyclassp->end = np; - else { - np[0] = *p++; - np[1] = *p++; - for(; p < ep; p += 2) - /* overlapping or adjacent ranges? */ - if(p[0] <= np[1] + 1){ - if(p[1] >= np[1]) - np[1] = p[1]; /* coalesce */ - } else { - np += 2; - np[0] = p[0]; - np[1] = p[1]; - } - yyclassp->end = np+2; - } - - return type; -} - static Reprog* regcomp1(char *s, int literal, int dot_type) { int token; @@ -419,16 +425,16 @@ static Reprog* regcomp1(char *s, int literal, int dot_type) { SrcExpr = s; nclass = 0; nparens = 0; - atorp = atorstack; - andp = andstack; + OperatorPtr = OperatorStack; + OperandPtr = OperandStack; subidp = subidstack; lastwasand = false; cursubid = 0; /* Start with a low priority operator to prime parser */ - pushator(START-1); - while((token = lex(literal, dot_type)) != END){ - if((token&0300) == OPERATOR) + PushOperator(START-1); + while((token = GetToken(literal, dot_type)) != END){ + if((token & 0300) == OPERATOR) operator(token); else operand(token); @@ -442,8 +448,8 @@ static Reprog* regcomp1(char *s, int literal, int dot_type) { evaluntil(START); if(nparens) rcerror("unmatched left paren"); - --andp; /* points to first and only operand */ - pp->startinst = andp->first; + --OperandPtr; /* points to first and only operand */ + pp->startinst = OperandPtr->first; pp = optimize(pp); return pp; }