]> git.mdlowis.com Git - proto/libregexp.git/commitdiff
import from plan9port
authorDavid du Colombier <0intro@gmail.com>
Sat, 8 Jun 2013 19:00:00 +0000 (21:00 +0200)
committerDavid du Colombier <0intro@gmail.com>
Sat, 8 Jun 2013 19:00:00 +0000 (21:00 +0200)
13 files changed:
NOTICE [new file with mode: 0644]
README [new file with mode: 0644]
regaux.c [new file with mode: 0644]
regcomp.c [new file with mode: 0644]
regcomp.h [new file with mode: 0644]
regerror.c [new file with mode: 0644]
regexec.c [new file with mode: 0644]
regexp9.h [new file with mode: 0644]
regsub.c [new file with mode: 0644]
rregexec.c [new file with mode: 0644]
rregsub.c [new file with mode: 0644]
test.c [new file with mode: 0644]
test2.c [new file with mode: 0644]

diff --git a/NOTICE b/NOTICE
new file mode 100644 (file)
index 0000000..02856cf
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,25 @@
+/*
+ * The authors of this software is Rob Pike.
+ *             Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+*/
+
+This is a Unix port of the Plan 9 regular expression library.
+
+Please send comments about the packaging
+to Russ Cox <rsc@swtch.com>.
+
+
+----
+
+This software is also made available under the Lucent Public License
+version 1.02; see http://plan9.bell-labs.com/plan9dist/license.html
+
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..1f62511
--- /dev/null
+++ b/README
@@ -0,0 +1,7 @@
+This is a Unix port of the Plan 9 regular expression library,
+originally done for the Inferno operating system.
+
+Russ Cox repackaged this to build as a standalone
+Unix library.  Send comments about packaging to
+Russ Cox <rsc@post.harvard.edu>
+
diff --git a/regaux.c b/regaux.c
new file mode 100644 (file)
index 0000000..b854b5a
--- /dev/null
+++ b/regaux.c
@@ -0,0 +1,112 @@
+#include "lib9.h"
+#include "regexp9.h"
+#include "regcomp.h"
+
+
+/*
+ *  save a new match in mp
+ */
+extern void
+_renewmatch(Resub *mp, int ms, Resublist *sp)
+{
+       int i;
+
+       if(mp==0 || ms<=0)
+               return;
+       if(mp[0].s.sp==0 || sp->m[0].s.sp<mp[0].s.sp ||
+          (sp->m[0].s.sp==mp[0].s.sp && sp->m[0].e.ep>mp[0].e.ep)){
+               for(i=0; i<ms && i<NSUBEXP; i++)
+                       mp[i] = sp->m[i];
+               for(; i<ms; i++)
+                       mp[i].s.sp = mp[i].e.ep = 0;
+       }
+}
+
+/*
+ * Note optimization in _renewthread:
+ *     *lp must be pending when _renewthread called; if *l has been looked
+ *             at already, the optimization is a bug.
+ */
+extern Relist*
+_renewthread(Relist *lp,       /* _relist to add to */
+       Reinst *ip,             /* instruction to add */
+       int ms,
+       Resublist *sep)         /* pointers to subexpressions */
+{
+       Relist *p;
+
+       for(p=lp; p->inst; p++){
+               if(p->inst == ip){
+                       if(sep->m[0].s.sp < p->se.m[0].s.sp){
+                               if(ms > 1)
+                                       p->se = *sep;
+                               else
+                                       p->se.m[0] = sep->m[0];
+                       }
+                       return 0;
+               }
+       }
+       p->inst = ip;
+       if(ms > 1)
+               p->se = *sep;
+       else
+               p->se.m[0] = sep->m[0];
+       (++p)->inst = 0;
+       return p;
+}
+
+/*
+ * same as renewthread, but called with
+ * initial empty start pointer.
+ */
+extern Relist*
+_renewemptythread(Relist *lp,  /* _relist to add to */
+       Reinst *ip,             /* instruction to add */
+       int ms,
+       char *sp)               /* pointers to subexpressions */
+{
+       Relist *p;
+
+       for(p=lp; p->inst; p++){
+               if(p->inst == ip){
+                       if(sp < p->se.m[0].s.sp) {
+                               if(ms > 1)
+                                       memset(&p->se, 0, sizeof(p->se));
+                               p->se.m[0].s.sp = sp;
+                       }
+                       return 0;
+               }
+       }
+       p->inst = ip;
+       if(ms > 1)
+               memset(&p->se, 0, sizeof(p->se));
+       p->se.m[0].s.sp = sp;
+       (++p)->inst = 0;
+       return p;
+}
+
+extern Relist*
+_rrenewemptythread(Relist *lp, /* _relist to add to */
+       Reinst *ip,             /* instruction to add */
+       int ms,
+       Rune *rsp)              /* pointers to subexpressions */
+{
+       Relist *p;
+
+       for(p=lp; p->inst; p++){
+               if(p->inst == ip){
+                       if(rsp < p->se.m[0].s.rsp) {
+                               if(ms > 1)
+                                       memset(&p->se, 0, sizeof(p->se));
+                               p->se.m[0].s.rsp = rsp;
+                       }
+                       return 0;
+               }
+       }
+       p->inst = ip;
+       if(ms > 1)
+               memset(&p->se, 0, sizeof(p->se));
+       p->se.m[0].s.rsp = rsp;
+       (++p)->inst = 0;
+       return p;
+}
diff --git a/regcomp.c b/regcomp.c
new file mode 100644 (file)
index 0000000..0967875
--- /dev/null
+++ b/regcomp.c
@@ -0,0 +1,566 @@
+#include "lib9.h"
+#include "regexp9.h"
+#include "regcomp.h"
+
+#define        TRUE    1
+#define        FALSE   0
+
+/*
+ * Parser Information
+ */
+typedef
+struct Node
+{
+       Reinst* first;
+       Reinst* last;
+}Node;
+
+/* max character classes per program is nelem(reprog->class) */
+static Reprog  *reprog;
+
+/* max rune ranges per character class is nelem(classp->spans)/2 */
+#define NCCRUNE        nelem(classp->spans)
+
+#define        NSTACK  20
+static Node    andstack[NSTACK];
+static Node    *andp;
+static int     atorstack[NSTACK];
+static int*    atorp;
+static int     cursubid;               /* id of current subexpression */
+static int     subidstack[NSTACK];     /* parallel to atorstack */
+static int*    subidp;
+static int     lastwasand;     /* Last token was operand */
+static int     nbra;
+static char*   exprp;          /* pointer to next character in source expression */
+static int     lexdone;
+static int     nclass;
+static Reclass*classp;
+static Reinst* freep;
+static int     errors;
+static Rune    yyrune;         /* last lex'd rune */
+static Reclass*yyclassp;       /* last lex'd class */
+
+/* predeclared crap */
+static void    operator(int);
+static void    pushand(Reinst*, Reinst*);
+static void    pushator(int);
+static void    evaluntil(int);
+static int     bldcclass(void);
+
+static jmp_buf regkaboom;
+
+static void
+rcerror(char *s)
+{
+       errors++;
+       regerror(s);
+       longjmp(regkaboom, 1);
+}
+
+static Reinst*
+newinst(int t)
+{
+       freep->type = t;
+       freep->u2.left = 0;
+       freep->u1.right = 0;
+       return freep++;
+}
+
+static void
+operand(int t)
+{
+       Reinst *i;
+
+       if(lastwasand)
+               operator(CAT);  /* catenate is implicit */
+       i = newinst(t);
+
+       if(t == CCLASS || t == NCCLASS)
+               i->u1.cp = yyclassp;
+       if(t == RUNE)
+               i->u1.r = yyrune;
+
+       pushand(i, i);
+       lastwasand = TRUE;
+}
+
+static void
+operator(int t)
+{
+       if(t==RBRA && --nbra<0)
+               rcerror("unmatched right paren");
+       if(t==LBRA){
+               if(++cursubid >= NSUBEXP)
+                       rcerror ("too many subexpressions");
+               nbra++;
+               if(lastwasand)
+                       operator(CAT);
+       } else
+               evaluntil(t);
+       if(t != RBRA)
+               pushator(t);
+       lastwasand = FALSE;
+       if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
+               lastwasand = TRUE;      /* these look like operands */
+}
+
+static void
+regerr2(char *s, int c)
+{
+       char buf[100];
+       char *cp = buf;
+       while(*s)
+               *cp++ = *s++;
+       *cp++ = c;
+       *cp = '\0'; 
+       rcerror(buf);
+}
+
+static void
+cant(char *s)
+{
+       char buf[100];
+       strcpy(buf, "can't happen: ");
+       strcat(buf, s);
+       rcerror(buf);
+}
+
+static void
+pushand(Reinst *f, Reinst *l)
+{
+       if(andp >= &andstack[NSTACK])
+               cant("operand stack overflow");
+       andp->first = f;
+       andp->last = l;
+       andp++;
+}
+
+static void
+pushator(int t)
+{
+       if(atorp >= &atorstack[NSTACK])
+               cant("operator stack overflow");
+       *atorp++ = t;
+       *subidp++ = cursubid;
+}
+
+static Node*
+popand(int op)
+{
+       Reinst *inst;
+
+       if(andp <= &andstack[0]){
+               regerr2("missing operand for ", op);
+               inst = newinst(NOP);
+               pushand(inst,inst);
+       }
+       return --andp;
+}
+
+static int
+popator(void)
+{
+       if(atorp <= &atorstack[0])
+               cant("operator stack underflow");
+       --subidp;
+       return *--atorp;
+}
+
+static void
+evaluntil(int pri)
+{
+       Node *op1, *op2;
+       Reinst *inst1, *inst2;
+
+       while(pri==RBRA || atorp[-1]>=pri){
+               switch(popator()){
+               default:
+                       rcerror("unknown operator in evaluntil");
+                       break;
+               case LBRA:              /* must have been RBRA */
+                       op1 = popand('(');
+                       inst2 = newinst(RBRA);
+                       inst2->u1.subid = *subidp;
+                       op1->last->u2.next = inst2;
+                       inst1 = newinst(LBRA);
+                       inst1->u1.subid = *subidp;
+                       inst1->u2.next = op1->first;
+                       pushand(inst1, inst2);
+                       return;
+               case OR:
+                       op2 = popand('|');
+                       op1 = popand('|');
+                       inst2 = newinst(NOP);
+                       op2->last->u2.next = inst2;
+                       op1->last->u2.next = inst2;
+                       inst1 = newinst(OR);
+                       inst1->u1.right = op1->first;
+                       inst1->u2.left = op2->first;
+                       pushand(inst1, inst2);
+                       break;
+               case CAT:
+                       op2 = popand(0);
+                       op1 = popand(0);
+                       op1->last->u2.next = op2->first;
+                       pushand(op1->first, op2->last);
+                       break;
+               case STAR:
+                       op2 = popand('*');
+                       inst1 = newinst(OR);
+                       op2->last->u2.next = inst1;
+                       inst1->u1.right = op2->first;
+                       pushand(inst1, inst1);
+                       break;
+               case PLUS:
+                       op2 = popand('+');
+                       inst1 = newinst(OR);
+                       op2->last->u2.next = inst1;
+                       inst1->u1.right = op2->first;
+                       pushand(op2->first, inst1);
+                       break;
+               case QUEST:
+                       op2 = popand('?');
+                       inst1 = newinst(OR);
+                       inst2 = newinst(NOP);
+                       inst1->u2.left = inst2;
+                       inst1->u1.right = op2->first;
+                       op2->last->u2.next = inst2;
+                       pushand(inst1, inst2);
+                       break;
+               }
+       }
+}
+
+static Reprog*
+optimize(Reprog *pp)
+{
+       Reinst *inst, *target;
+       int size;
+       Reprog *npp;
+       Reclass *cl;
+       int diff;
+
+       /*
+        *  get rid of NOOP chains
+        */
+       for(inst=pp->firstinst; inst->type!=END; inst++){
+               target = inst->u2.next;
+               while(target->type == NOP)
+                       target = target->u2.next;
+               inst->u2.next = target;
+       }
+
+       /*
+        *  The original allocation is for an area larger than
+        *  necessary.  Reallocate to the actual space used
+        *  and then relocate the code.
+        */
+       size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
+       npp = realloc(pp, size);
+       if(npp==0 || npp==pp)
+               return pp;
+       diff = (char *)npp - (char *)pp;
+       freep = (Reinst *)((char *)freep + diff);
+       for(inst=npp->firstinst; inst<freep; inst++){
+               switch(inst->type){
+               case OR:
+               case STAR:
+               case PLUS:
+               case QUEST:
+                       inst->u1.right = (void*)((char*)inst->u1.right + diff);
+                       break;
+               case CCLASS:
+               case NCCLASS:
+                       inst->u1.right = (void*)((char*)inst->u1.right + diff);
+                       cl = inst->u1.cp;
+                       cl->end = (void*)((char*)cl->end + diff);
+                       break;
+               }
+               inst->u2.left = (void*)((char*)inst->u2.left + diff);
+       }
+       npp->startinst = (void*)((char*)npp->startinst + diff);
+       return npp;
+}
+
+#ifdef DEBUG
+static void
+dumpstack(void){
+       Node *stk;
+       int *ip;
+
+       print("operators\n");
+       for(ip=atorstack; ip<atorp; ip++)
+               print("0%o\n", *ip);
+       print("operands\n");
+       for(stk=andstack; stk<andp; stk++)
+               print("0%o\t0%o\n", stk->first->type, stk->last->type);
+}
+
+static void
+dump(Reprog *pp)
+{
+       Reinst *l;
+       Rune *p;
+
+       l = pp->firstinst;
+       do{
+               print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
+                       l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
+               if(l->type == RUNE)
+                       print("\t%C\n", l->u1.r);
+               else if(l->type == CCLASS || l->type == NCCLASS){
+                       print("\t[");
+                       if(l->type == NCCLASS)
+                               print("^");
+                       for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)
+                               if(p[0] == p[1])
+                                       print("%C", p[0]);
+                               else
+                                       print("%C-%C", p[0], p[1]);
+                       print("]\n");
+               } else
+                       print("\n");
+       }while(l++->type);
+}
+#endif
+
+static Reclass*
+newclass(void)
+{
+       if(nclass >= nelem(reprog->class))
+               rcerror("too many character classes; increase Reprog.class size");
+       return &(classp[nclass++]);
+}
+
+static int
+nextc(Rune *rp)
+{
+       if(lexdone){
+               *rp = 0;
+               return 1;
+       }
+       exprp += chartorune(rp, exprp);
+       if(*rp == '\\'){
+               exprp += chartorune(rp, exprp);
+               return 1;
+       }
+       if(*rp == 0)
+               lexdone = 1;
+       return 0;
+}
+
+static int
+lex(int literal, int dot_type)
+{
+       int quoted;
+
+       quoted = nextc(&yyrune);
+       if(literal || quoted){
+               if(yyrune == 0)
+                       return END;
+               return RUNE;
+       }
+
+       switch(yyrune){
+       case 0:
+               return END;
+       case '*':
+               return STAR;
+       case '?':
+               return QUEST;
+       case '+':
+               return PLUS;
+       case '|':
+               return OR;
+       case '.':
+               return dot_type;
+       case '(':
+               return LBRA;
+       case ')':
+               return RBRA;
+       case '^':
+               return BOL;
+       case '$':
+               return EOL;
+       case '[':
+               return bldcclass();
+       }
+       return RUNE;
+}
+
+static int
+bldcclass(void)
+{
+       int type;
+       Rune r[NCCRUNE];
+       Rune *p, *ep, *np;
+       Rune rune;
+       int quoted;
+
+       /* we have already seen the '[' */
+       type = CCLASS;
+       yyclassp = newclass();
+
+       /* look ahead for negation */
+       /* SPECIAL CASE!!! negated classes don't match \n */
+       ep = r;
+       quoted = nextc(&rune);
+       if(!quoted && rune == '^'){
+               type = NCCLASS;
+               quoted = nextc(&rune);
+               *ep++ = '\n';
+               *ep++ = '\n';
+       }
+
+       /* parse class into a set of spans */
+       while(ep < &r[NCCRUNE-1]){
+               if(rune == 0){
+                       rcerror("malformed '[]'");
+                       return 0;
+               }
+               if(!quoted && rune == ']')
+                       break;
+               if(!quoted && rune == '-'){
+                       if(ep == r){
+                               rcerror("malformed '[]'");
+                               return 0;
+                       }
+                       quoted = nextc(&rune);
+                       if((!quoted && rune == ']') || rune == 0){
+                               rcerror("malformed '[]'");
+                               return 0;
+                       }
+                       *(ep-1) = rune;
+               } else {
+                       *ep++ = rune;
+                       *ep++ = rune;
+               }
+               quoted = nextc(&rune);
+       }
+       if(ep >= &r[NCCRUNE-1]) {
+               rcerror("char class too large; increase Reclass.spans size");
+               return 0;
+       }
+
+       /* sort on span start */
+       for(p = r; p < ep; p += 2){
+               for(np = p; np < ep; np += 2)
+                       if(*np < *p){
+                               rune = np[0];
+                               np[0] = p[0];
+                               p[0] = rune;
+                               rune = np[1];
+                               np[1] = p[1];
+                               p[1] = rune;
+                       }
+       }
+
+       /* merge spans */
+       np = yyclassp->spans;
+       p = r;
+       if(r == ep)
+               yyclassp->end = np;
+       else {
+               np[0] = *p++;
+               np[1] = *p++;
+               for(; p < ep; p += 2)
+                       /* overlapping or adjacent ranges? */
+                       if(p[0] <= np[1] + 1){
+                               if(p[1] >= np[1])
+                                       np[1] = p[1];   /* coalesce */
+                       } else {
+                               np += 2;
+                               np[0] = p[0];
+                               np[1] = p[1];
+                       }
+               yyclassp->end = np+2;
+       }
+
+       return type;
+}
+
+static Reprog*
+regcomp1(char *s, int literal, int dot_type)
+{
+       int token;
+       Reprog *volatile pp;
+
+       /* get memory for the program */
+       pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
+       if(pp == 0){
+               regerror("out of memory");
+               return 0;
+       }
+       freep = pp->firstinst;
+       classp = pp->class;
+       errors = 0;
+
+       if(setjmp(regkaboom))
+               goto out;
+
+       /* go compile the sucker */
+       lexdone = 0;
+       exprp = s;
+       nclass = 0;
+       nbra = 0;
+       atorp = atorstack;
+       andp = andstack;
+       subidp = subidstack;
+       lastwasand = FALSE;
+       cursubid = 0;
+
+       /* Start with a low priority operator to prime parser */
+       pushator(START-1);
+       while((token = lex(literal, dot_type)) != END){
+               if((token&0300) == OPERATOR)
+                       operator(token);
+               else
+                       operand(token);
+       }
+
+       /* Close with a low priority operator */
+       evaluntil(START);
+
+       /* Force END */
+       operand(END);
+       evaluntil(START);
+#ifdef DEBUG
+       dumpstack();
+#endif
+       if(nbra)
+               rcerror("unmatched left paren");
+       --andp; /* points to first and only operand */
+       pp->startinst = andp->first;
+#ifdef DEBUG
+       dump(pp);
+#endif
+       pp = optimize(pp);
+#ifdef DEBUG
+       print("start: %d\n", andp->first-pp->firstinst);
+       dump(pp);
+#endif
+out:
+       if(errors){
+               free(pp);
+               pp = 0;
+       }
+       return pp;
+}
+
+extern Reprog*
+regcomp(char *s)
+{
+       return regcomp1(s, 0, ANY);
+}
+
+extern Reprog*
+regcomplit(char *s)
+{
+       return regcomp1(s, 1, ANY);
+}
+
+extern Reprog*
+regcompnl(char *s)
+{
+       return regcomp1(s, 0, ANYNL);
+}
diff --git a/regcomp.h b/regcomp.h
new file mode 100644 (file)
index 0000000..4b9a483
--- /dev/null
+++ b/regcomp.h
@@ -0,0 +1,67 @@
+/*
+ *  substitution list
+ */
+#define uchar __reuchar
+typedef unsigned char uchar;
+#define nelem(x) (sizeof(x)/sizeof((x)[0]))
+
+#define NSUBEXP 32
+typedef struct Resublist       Resublist;
+struct Resublist
+{
+       Resub   m[NSUBEXP];
+};
+
+/*
+ * Actions and Tokens (Reinst types)
+ *
+ *     02xx are operators, value == precedence
+ *     03xx are tokens, i.e. operands for operators
+ */
+#define RUNE           0177
+#define        OPERATOR        0200    /* Bitmask of all operators */
+#define        START           0200    /* Start, used for marker on stack */
+#define        RBRA            0201    /* Right bracket, ) */
+#define        LBRA            0202    /* Left bracket, ( */
+#define        OR              0203    /* Alternation, | */
+#define        CAT             0204    /* Concatentation, implicit operator */
+#define        STAR            0205    /* Closure, * */
+#define        PLUS            0206    /* a+ == aa* */
+#define        QUEST           0207    /* a? == a|nothing, i.e. 0 or 1 a's */
+#define        ANY             0300    /* Any character except newline, . */
+#define        ANYNL           0301    /* Any character including newline, . */
+#define        NOP             0302    /* No operation, internal use only */
+#define        BOL             0303    /* Beginning of line, ^ */
+#define        EOL             0304    /* End of line, $ */
+#define        CCLASS          0305    /* Character class, [] */
+#define        NCCLASS         0306    /* Negated character class, [] */
+#define        END             0377    /* Terminate: match found */
+
+/*
+ *  regexec execution lists
+ */
+#define LISTSIZE       10
+#define BIGLISTSIZE    (25*LISTSIZE)
+typedef struct Relist  Relist;
+struct Relist
+{
+       Reinst*         inst;           /* Reinstruction of the thread */
+       Resublist       se;             /* matched subexpressions in this thread */
+};
+typedef struct Reljunk Reljunk;
+struct Reljunk
+{
+       Relist* relist[2];
+       Relist* reliste[2];
+       int     starttype;
+       Rune    startchar;
+       char*   starts;
+       char*   eol;
+       Rune*   rstarts;
+       Rune*   reol;
+};
+
+extern Relist* _renewthread(Relist*, Reinst*, int, Resublist*);
+extern void    _renewmatch(Resub*, int, Resublist*);
+extern Relist* _renewemptythread(Relist*, Reinst*, int, char*);
+extern Relist* _rrenewemptythread(Relist*, Reinst*, int, Rune*);
diff --git a/regerror.c b/regerror.c
new file mode 100644 (file)
index 0000000..99ff0c7
--- /dev/null
@@ -0,0 +1,14 @@
+#include "lib9.h"
+#include "regexp9.h"
+
+void
+regerror(char *s)
+{
+       char buf[132];
+
+       strcpy(buf, "regerror: ");
+       strcat(buf, s);
+       strcat(buf, "\n");
+       write(2, buf, strlen(buf));
+       exits("regerr");
+}
diff --git a/regexec.c b/regexec.c
new file mode 100644 (file)
index 0000000..a00fbcb
--- /dev/null
+++ b/regexec.c
@@ -0,0 +1,231 @@
+#include "lib9.h"
+#include "regexp9.h"
+#include "regcomp.h"
+
+
+/*
+ *  return     0 if no match
+ *             >0 if a match
+ *             <0 if we ran out of _relist space
+ */
+static int
+regexec1(Reprog *progp,        /* program to run */
+       char *bol,      /* string to run machine on */
+       Resub *mp,      /* subexpression elements */
+       int ms,         /* number of elements at mp */
+       Reljunk *j
+)
+{
+       int flag=0;
+       Reinst *inst;
+       Relist *tlp;
+       char *s;
+       int i, checkstart;
+       Rune r, *rp, *ep;
+       int n;
+       Relist* tl;             /* This list, next list */
+       Relist* nl;
+       Relist* tle;            /* ends of this and next list */
+       Relist* nle;
+       int match;
+       char *p;
+
+       match = 0;
+       checkstart = j->starttype;
+       if(mp)
+               for(i=0; i<ms; i++) {
+                       mp[i].s.sp = 0;
+                       mp[i].e.ep = 0;
+               }
+       j->relist[0][0].inst = 0;
+       j->relist[1][0].inst = 0;
+
+       /* Execute machine once for each character, including terminal NUL */
+       s = j->starts;
+       do{
+               /* fast check for first char */
+               if(checkstart) {
+                       switch(j->starttype) {
+                       case RUNE:
+                               p = utfrune(s, j->startchar);
+                               if(p == 0 || s == j->eol)
+                                       return match;
+                               s = p;
+                               break;
+                       case BOL:
+                               if(s == bol)
+                                       break;
+                               p = utfrune(s, '\n');
+                               if(p == 0 || s == j->eol)
+                                       return match;
+                               s = p+1;
+                               break;
+                       }
+               }
+               r = *(uchar*)s;
+               if(r < Runeself)
+                       n = 1;
+               else
+                       n = chartorune(&r, s);
+
+               /* switch run lists */
+               tl = j->relist[flag];
+               tle = j->reliste[flag];
+               nl = j->relist[flag^=1];
+               nle = j->reliste[flag];
+               nl->inst = 0;
+
+               /* Add first instruction to current list */
+               if(match == 0)
+                       _renewemptythread(tl, progp->startinst, ms, s);
+
+               /* Execute machine until current list is empty */
+               for(tlp=tl; tlp->inst; tlp++){  /* assignment = */
+                       for(inst = tlp->inst; ; inst = inst->u2.next){
+                               switch(inst->type){
+                               case RUNE:      /* regular character */
+                                       if(inst->u1.r == r){
+                                               if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle)
+                                                       return -1;
+                                       }
+                                       break;
+                               case LBRA:
+                                       tlp->se.m[inst->u1.subid].s.sp = s;
+                                       continue;
+                               case RBRA:
+                                       tlp->se.m[inst->u1.subid].e.ep = s;
+                                       continue;
+                               case ANY:
+                                       if(r != '\n')
+                                               if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle)
+                                                       return -1;
+                                       break;
+                               case ANYNL:
+                                       if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle)
+                                                       return -1;
+                                       break;
+                               case BOL:
+                                       if(s == bol || *(s-1) == '\n')
+                                               continue;
+                                       break;
+                               case EOL:
+                                       if(s == j->eol || r == 0 || r == '\n')
+                                               continue;
+                                       break;
+                               case CCLASS:
+                                       ep = inst->u1.cp->end;
+                                       for(rp = inst->u1.cp->spans; rp < ep; rp += 2)
+                                               if(r >= rp[0] && r <= rp[1]){
+                                                       if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle)
+                                                               return -1;
+                                                       break;
+                                               }
+                                       break;
+                               case NCCLASS:
+                                       ep = inst->u1.cp->end;
+                                       for(rp = inst->u1.cp->spans; rp < ep; rp += 2)
+                                               if(r >= rp[0] && r <= rp[1])
+                                                       break;
+                                       if(rp == ep)
+                                               if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle)
+                                                       return -1;
+                                       break;
+                               case OR:
+                                       /* evaluate right choice later */
+                                       if(_renewthread(tlp, inst->u1.right, ms, &tlp->se) == tle)
+                                               return -1;
+                                       /* efficiency: advance and re-evaluate */
+                                       continue;
+                               case END:       /* Match! */
+                                       match = 1;
+                                       tlp->se.m[0].e.ep = s;
+                                       if(mp != 0)
+                                               _renewmatch(mp, ms, &tlp->se);
+                                       break;
+                               }
+                               break;
+                       }
+               }
+               if(s == j->eol)
+                       break;
+               checkstart = j->starttype && nl->inst==0;
+               s += n;
+       }while(r);
+       return match;
+}
+
+static int
+regexec2(Reprog *progp,        /* program to run */
+       char *bol,      /* string to run machine on */
+       Resub *mp,      /* subexpression elements */
+       int ms,         /* number of elements at mp */
+       Reljunk *j
+)
+{
+       int rv;
+       Relist *relist0, *relist1;
+
+       /* mark space */
+       relist0 = malloc(BIGLISTSIZE*sizeof(Relist));
+       if(relist0 == nil)
+               return -1;
+       relist1 = malloc(BIGLISTSIZE*sizeof(Relist));
+       if(relist1 == nil){
+               free(relist1);
+               return -1;
+       }
+       j->relist[0] = relist0;
+       j->relist[1] = relist1;
+       j->reliste[0] = relist0 + BIGLISTSIZE - 2;
+       j->reliste[1] = relist1 + BIGLISTSIZE - 2;
+
+       rv = regexec1(progp, bol, mp, ms, j);
+       free(relist0);
+       free(relist1);
+       return rv;
+}
+
+extern int
+regexec(Reprog *progp, /* program to run */
+       char *bol,      /* string to run machine on */
+       Resub *mp,      /* subexpression elements */
+       int ms)         /* number of elements at mp */
+{
+       Reljunk j;
+       Relist relist0[LISTSIZE], relist1[LISTSIZE];
+       int rv;
+
+       /*
+        *  use user-specified starting/ending location if specified
+        */
+       j.starts = bol;
+       j.eol = 0;
+       if(mp && ms>0){
+               if(mp->s.sp)
+                       j.starts = mp->s.sp;
+               if(mp->e.ep)
+                       j.eol = mp->e.ep;
+       }
+       j.starttype = 0;
+       j.startchar = 0;
+       if(progp->startinst->type == RUNE && progp->startinst->u1.r < Runeself) {
+               j.starttype = RUNE;
+               j.startchar = progp->startinst->u1.r;
+       }
+       if(progp->startinst->type == BOL)
+               j.starttype = BOL;
+
+       /* mark space */
+       j.relist[0] = relist0;
+       j.relist[1] = relist1;
+       j.reliste[0] = relist0 + nelem(relist0) - 2;
+       j.reliste[1] = relist1 + nelem(relist1) - 2;
+
+       rv = regexec1(progp, bol, mp, ms, &j);
+       if(rv >= 0)
+               return rv;
+       rv = regexec2(progp, bol, mp, ms, &j);
+       if(rv >= 0)
+               return rv;
+       return -1;
+}
diff --git a/regexp9.h b/regexp9.h
new file mode 100644 (file)
index 0000000..20c0c3d
--- /dev/null
+++ b/regexp9.h
@@ -0,0 +1,96 @@
+#ifndef _REGEXP9_H_
+#define _REGEXP9_H_ 1
+#if defined(__cplusplus)
+extern "C" { 
+#endif
+
+#ifdef AUTOLIB
+AUTOLIB(regexp9)
+#endif
+
+#include <utf.h>
+
+typedef struct Resub           Resub;
+typedef struct Reclass         Reclass;
+typedef struct Reinst          Reinst;
+typedef struct Reprog          Reprog;
+
+/*
+ *     Sub expression matches
+ */
+struct Resub{
+       union
+       {
+               char *sp;
+               Rune *rsp;
+       }s;
+       union
+       {
+               char *ep;
+               Rune *rep;
+       }e;
+};
+
+/*
+ *     character class, each pair of rune's defines a range
+ */
+struct Reclass{
+       Rune    *end;
+       Rune    spans[64];
+};
+
+/*
+ *     Machine instructions
+ */
+struct Reinst{
+       int     type;
+       union   {
+               Reclass *cp;            /* class pointer */
+               Rune    r;              /* character */
+               int     subid;          /* sub-expression id for RBRA and LBRA */
+               Reinst  *right;         /* right child of OR */
+       }u1;
+       union { /* regexp relies on these two being in the same union */
+               Reinst *left;           /* left child of OR */
+               Reinst *next;           /* next instruction for CAT & LBRA */
+       }u2;
+};
+
+/*
+ *     Reprogram definition
+ */
+struct Reprog{
+       Reinst  *startinst;     /* start pc */
+       Reclass class[16];      /* .data */
+       Reinst  firstinst[5];   /* .text */
+};
+
+extern Reprog  *regcomp9(char*);
+extern Reprog  *regcomplit9(char*);
+extern Reprog  *regcompnl9(char*);
+extern void    regerror9(char*);
+extern int     regexec9(Reprog*, char*, Resub*, int);
+extern void    regsub9(char*, char*, int, Resub*, int);
+
+extern int     rregexec9(Reprog*, Rune*, Resub*, int);
+extern void    rregsub9(Rune*, Rune*, int, Resub*, int);
+
+/*
+ * Darwin simply cannot handle having routines that
+ * override other library routines.
+ */
+#ifndef NOPLAN9DEFINES
+#define regcomp regcomp9
+#define regcomplit regcomplit9
+#define regcompnl regcompnl9
+#define regerror regerror9
+#define regexec regexec9
+#define regsub regsub9
+#define rregexec rregexec9
+#define rregsub rregsub9
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif
diff --git a/regsub.c b/regsub.c
new file mode 100644 (file)
index 0000000..579d124
--- /dev/null
+++ b/regsub.c
@@ -0,0 +1,62 @@
+#include "lib9.h"
+#include "regexp9.h"
+
+/* substitute into one string using the matches from the last regexec() */
+extern void
+regsub(char *sp,       /* source string */
+       char *dp,       /* destination string */
+       int dlen,
+       Resub *mp,      /* subexpression elements */
+       int ms)         /* number of elements pointed to by mp */
+{
+       char *ssp, *ep;
+       int i;
+
+       ep = dp+dlen-1;
+       while(*sp != '\0'){
+               if(*sp == '\\'){
+                       switch(*++sp){
+                       case '0':
+                       case '1':
+                       case '2':
+                       case '3':
+                       case '4':
+                       case '5':
+                       case '6':
+                       case '7':
+                       case '8':
+                       case '9':
+                               i = *sp-'0';
+                               if(mp!=0 && mp[i].s.sp != 0 && ms>i)
+                                       for(ssp = mp[i].s.sp;
+                                            ssp < mp[i].e.ep;
+                                            ssp++)
+                                               if(dp < ep)
+                                                       *dp++ = *ssp;
+                               break;
+                       case '\\':
+                               if(dp < ep)
+                                       *dp++ = '\\';
+                               break;
+                       case '\0':
+                               sp--;
+                               break;
+                       default:
+                               if(dp < ep)
+                                       *dp++ = *sp;
+                               break;
+                       }
+               }else if(*sp == '&'){
+                       if(mp!=0 && mp[0].s.sp != 0 && ms>0)
+                               for(ssp = mp[0].s.sp;
+                                    ssp < mp[0].e.ep; ssp++)
+                                       if(dp < ep)
+                                               *dp++ = *ssp;
+               }else{
+                       if(dp < ep)
+                               *dp++ = *sp;
+               }
+               sp++;
+       }
+       *dp = '\0';
+}
diff --git a/rregexec.c b/rregexec.c
new file mode 100644 (file)
index 0000000..16d95e6
--- /dev/null
@@ -0,0 +1,212 @@
+#include "lib9.h"
+#include "regexp9.h"
+#include "regcomp.h"
+
+/*
+ *  return     0 if no match
+ *             >0 if a match
+ *             <0 if we ran out of _relist space
+ */
+static int
+rregexec1(Reprog *progp,       /* program to run */
+       Rune *bol,              /* string to run machine on */
+       Resub *mp,              /* subexpression elements */
+       int ms,                 /* number of elements at mp */
+       Reljunk *j)
+{
+       int flag=0;
+       Reinst *inst;
+       Relist *tlp;
+       Rune *s;
+       int i, checkstart;
+       Rune r, *rp, *ep;
+       Relist* tl;             /* This list, next list */
+       Relist* nl;
+       Relist* tle;            /* ends of this and next list */
+       Relist* nle;
+       int match;
+       Rune *p;
+
+       match = 0;
+       checkstart = j->startchar;
+       if(mp)
+               for(i=0; i<ms; i++) {
+                       mp[i].s.rsp = 0;
+                       mp[i].e.rep = 0;
+               }
+       j->relist[0][0].inst = 0;
+       j->relist[1][0].inst = 0;
+
+       /* Execute machine once for each character, including terminal NUL */
+       s = j->rstarts;
+       do{
+
+               /* fast check for first char */
+               if(checkstart) {
+                       switch(j->starttype) {
+                       case RUNE:
+                               p = runestrchr(s, j->startchar);
+                               if(p == 0 || p == j->reol)
+                                       return match;
+                               s = p;
+                               break;
+                       case BOL:
+                               if(s == bol)
+                                       break;
+                               p = runestrchr(s, '\n');
+                               if(p == 0 || s == j->reol)
+                                       return match;
+                               s = p+1;
+                               break;
+                       }
+               }
+
+               r = *s;
+
+               /* switch run lists */
+               tl = j->relist[flag];
+               tle = j->reliste[flag];
+               nl = j->relist[flag^=1];
+               nle = j->reliste[flag];
+               nl->inst = 0;
+
+               /* Add first instruction to current list */
+               _rrenewemptythread(tl, progp->startinst, ms, s);
+
+               /* Execute machine until current list is empty */
+               for(tlp=tl; tlp->inst; tlp++){
+                       for(inst=tlp->inst; ; inst = inst->u2.next){
+                               switch(inst->type){
+                               case RUNE:      /* regular character */
+                                       if(inst->u1.r == r)
+                                               if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle)
+                                                       return -1;
+                                       break;
+                               case LBRA:
+                                       tlp->se.m[inst->u1.subid].s.rsp = s;
+                                       continue;
+                               case RBRA:
+                                       tlp->se.m[inst->u1.subid].e.rep = s;
+                                       continue;
+                               case ANY:
+                                       if(r != '\n')
+                                               if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle)
+                                                       return -1;
+                                       break;
+                               case ANYNL:
+                                       if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle)
+                                                       return -1;
+                                       break;
+                               case BOL:
+                                       if(s == bol || *(s-1) == '\n')
+                                               continue;
+                                       break;
+                               case EOL:
+                                       if(s == j->reol || r == 0 || r == '\n')
+                                               continue;
+                                       break;
+                               case CCLASS:
+                                       ep = inst->u1.cp->end;
+                                       for(rp = inst->u1.cp->spans; rp < ep; rp += 2)
+                                               if(r >= rp[0] && r <= rp[1]){
+                                                       if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle)
+                                                               return -1;
+                                                       break;
+                                               }
+                                       break;
+                               case NCCLASS:
+                                       ep = inst->u1.cp->end;
+                                       for(rp = inst->u1.cp->spans; rp < ep; rp += 2)
+                                               if(r >= rp[0] && r <= rp[1])
+                                                       break;
+                                       if(rp == ep)
+                                               if(_renewthread(nl, inst->u2.next, ms, &tlp->se)==nle)
+                                                       return -1;
+                                       break;
+                               case OR:
+                                       /* evaluate right choice later */
+                                       if(_renewthread(tlp, inst->u1.right, ms, &tlp->se) == tle)
+                                               return -1;
+                                       /* efficiency: advance and re-evaluate */
+                                       continue;
+                               case END:       /* Match! */
+                                       match = 1;
+                                       tlp->se.m[0].e.rep = s;
+                                       if(mp != 0)
+                                               _renewmatch(mp, ms, &tlp->se);
+                                       break;
+                               }
+                               break;
+                       }
+               }
+               if(s == j->reol)
+                       break;
+               checkstart = j->startchar && nl->inst==0;
+               s++;
+       }while(r);
+       return match;
+}
+
+static int
+rregexec2(Reprog *progp,       /* program to run */
+       Rune *bol,      /* string to run machine on */
+       Resub *mp,      /* subexpression elements */
+       int ms,         /* number of elements at mp */
+       Reljunk *j
+)
+{
+       Relist relist0[5*LISTSIZE], relist1[5*LISTSIZE];
+
+       /* mark space */
+       j->relist[0] = relist0;
+       j->relist[1] = relist1;
+       j->reliste[0] = relist0 + nelem(relist0) - 2;
+       j->reliste[1] = relist1 + nelem(relist1) - 2;
+
+       return rregexec1(progp, bol, mp, ms, j);
+}
+
+extern int
+rregexec(Reprog *progp,        /* program to run */
+       Rune *bol,      /* string to run machine on */
+       Resub *mp,      /* subexpression elements */
+       int ms)         /* number of elements at mp */
+{
+       Reljunk j;
+       Relist relist0[LISTSIZE], relist1[LISTSIZE];
+       int rv;
+
+       /*
+        *  use user-specified starting/ending location if specified
+        */
+       j.rstarts = bol;
+       j.reol = 0;
+       if(mp && ms>0){
+               if(mp->s.sp)
+                       j.rstarts = mp->s.rsp;
+               if(mp->e.ep)
+                       j.reol = mp->e.rep;
+       }
+       j.starttype = 0;
+       j.startchar = 0;
+       if(progp->startinst->type == RUNE && progp->startinst->u1.r < Runeself) {
+               j.starttype = RUNE;
+               j.startchar = progp->startinst->u1.r;
+       }
+       if(progp->startinst->type == BOL)
+               j.starttype = BOL;
+
+       /* mark space */
+       j.relist[0] = relist0;
+       j.relist[1] = relist1;
+       j.reliste[0] = relist0 + nelem(relist0) - 2;
+       j.reliste[1] = relist1 + nelem(relist1) - 2;
+
+       rv = rregexec1(progp, bol, mp, ms, &j);
+       if(rv >= 0)
+               return rv;
+       rv = rregexec2(progp, bol, mp, ms, &j);
+       if(rv >= 0)
+               return rv;
+       return -1;
+}
diff --git a/rregsub.c b/rregsub.c
new file mode 100644 (file)
index 0000000..5a4a564
--- /dev/null
+++ b/rregsub.c
@@ -0,0 +1,63 @@
+#include "lib9.h"
+#include "regexp9.h"
+
+/* substitute into one string using the matches from the last regexec() */
+extern void
+rregsub(Rune *sp,      /* source string */
+       Rune *dp,       /* destination string */
+       int dlen,
+       Resub *mp,      /* subexpression elements */
+       int ms)         /* number of elements pointed to by mp */
+{
+       Rune *ssp, *ep;
+       int i;
+
+       ep = dp+(dlen/sizeof(Rune))-1;
+       while(*sp != '\0'){
+               if(*sp == '\\'){
+                       switch(*++sp){
+                       case '0':
+                       case '1':
+                       case '2':
+                       case '3':
+                       case '4':
+                       case '5':
+                       case '6':
+                       case '7':
+                       case '8':
+                       case '9':
+                               i = *sp-'0';
+                               if(mp[i].s.rsp != 0 && mp!=0 && ms>i)
+                                       for(ssp = mp[i].s.rsp;
+                                            ssp < mp[i].e.rep;
+                                            ssp++)
+                                               if(dp < ep)
+                                                       *dp++ = *ssp;
+                               break;
+                       case '\\':
+                               if(dp < ep)
+                                       *dp++ = '\\';
+                               break;
+                       case '\0':
+                               sp--;
+                               break;
+                       default:
+                               if(dp < ep)
+                                       *dp++ = *sp;
+                               break;
+                       }
+               }else if(*sp == '&'){                           
+                       if(mp[0].s.rsp != 0 && mp!=0 && ms>0)
+                       if(mp[0].s.rsp != 0)
+                               for(ssp = mp[0].s.rsp;
+                                    ssp < mp[0].e.rep; ssp++)
+                                       if(dp < ep)
+                                               *dp++ = *ssp;
+               }else{
+                       if(dp < ep)
+                               *dp++ = *sp;
+               }
+               sp++;
+       }
+       *dp = '\0';
+}
diff --git a/test.c b/test.c
new file mode 100644 (file)
index 0000000..f6bea53
--- /dev/null
+++ b/test.c
@@ -0,0 +1,45 @@
+#include "lib9.h"
+#include <regexp9.h>
+
+struct x
+{
+       char *re;
+       char *s;
+       Reprog *p;
+};
+
+struct x t[] = {
+       { "^[^!@]+$", "/bin/upas/aliasmail '&'", 0 },
+       { "^local!(.*)$", "/mail/box/\\1/mbox", 0 },
+       { "^plan9!(.*)$", "\\1", 0 },
+       { "^helix!(.*)$", "\\1", 0 },
+       { "^([^!]+)@([^!@]+)$", "\\2!\\1", 0 },
+       { "^(uk\\.[^!]*)(!.*)$", "/bin/upas/uk2uk '\\1' '\\2'", 0 },
+       { "^[^!]*\\.[^!]*!.*$", "inet!&", 0 },
+       { "^\xE2\x98\xBA$", "smiley", 0 },
+       { "^(coma|research|pipe|pyxis|inet|hunny|gauss)!(.*)$", "/mail/lib/qmail '\\s' 'net!\\1' '\\2'", 0 },
+       { "^.*$", "/mail/lib/qmail '\\s' 'net!research' '&'", 0 },
+       { 0, 0, 0 },
+};
+
+int
+main(int ac, char **av)
+{
+       Resub rs[10];
+       char dst[128];
+       struct x *tp;
+
+       for(tp = t; tp->re; tp++)
+               tp->p = regcomp(tp->re);
+
+       for(tp = t; tp->re; tp++){
+               print("%s VIA %s", av[1], tp->re);
+               memset(rs, 0, sizeof rs);
+               if(regexec(tp->p, av[1], rs, 10)){
+                       regsub(tp->s, dst, sizeof dst, rs, 10);
+                       print(" sub %s -> %s", tp->s, dst);
+               }
+               print("\n");
+       }
+       exit(0);
+}
diff --git a/test2.c b/test2.c
new file mode 100644 (file)
index 0000000..62d5213
--- /dev/null
+++ b/test2.c
@@ -0,0 +1,19 @@
+#include "lib9.h"
+#include <regexp9.h>
+
+int
+main(int ac, char **av)
+{
+       Resub rs[10];
+       Reprog *p;
+       char *s;
+
+       p = regcomp("[^a-z]");
+       s = "\n";
+       if(regexec(p, s, rs, 10))
+               print("%s %lux %lux %lux\n", s, s, rs[0].s.sp, rs[0].e.ep);
+       s = "0";
+       if(regexec(p, s, rs, 10))
+               print("%s %lux %lux %lux\n", s, s, rs[0].s.sp, rs[0].e.ep);
+       exit(0);
+}