regexp.c

Go to the documentation of this file.
00001 /*
00002  * regcomp and regexec -- regsub and regerror are elsewhere
00003  *
00004  *      Copyright (c) 1986 by University of Toronto.
00005  *      Written by Henry Spencer.  Not derived from licensed software.
00006  *
00007  *      Permission is granted to anyone to use this software for any
00008  *      purpose on any computer system, and to redistribute it freely,
00009  *      subject to the following restrictions:
00010  *
00011  *      1. The author is not responsible for the consequences of use of
00012  *              this software, no matter how awful, even if they arise
00013  *              from defects in it.
00014  *
00015  *      2. The origin of this software must not be misrepresented, either
00016  *              by explicit claim or by omission.
00017  *
00018  *      3. Altered versions must be plainly marked as such, and must not
00019  *              be misrepresented as being the original software.
00020  *
00021  * Beware that some of this code is subtly aware of the way operator
00022  * precedence is structured in regular expressions.  Serious changes in
00023  * regular-expression syntax might require a total rethink.
00024  *
00025  *      Modified by K.Hirabayashi to accept KANJI code, memory allocation
00026  *      and to add following functions.
00027  *              isthere(), mkpat(), match(), regsub(), sjtok(), ktosj(),
00028  *              Strchr(), Strncmp(), Strlen()
00029  */
00030 
00031 #include <stdio.h>
00032 #include <ctype.h>
00033 #include "regexp.h"
00034 
00035 #define regerror(a)     error("regular expression error: %s", a)
00036 
00037 int r_start, r_length;
00038 
00039 /*
00040  * The first byte of the regexp internal "program" is actually this magic
00041  * number; the start node begins in the second byte.
00042  */
00043 #define MAGIC   0234
00044 
00045 /*
00046  * The "internal use only" fields in regexp.h are present to pass info from
00047  * compile to execute that permits the execute phase to run lots faster on
00048  * simple cases.  They are:
00049  *
00050  * regstart     char that must begin a match; '\0' if none obvious
00051  * reganch      is the match anchored (at beginning-of-line only)?
00052  * regmust      string (pointer into program) that match must include, or NULL
00053  * regmlen      length of regmust string
00054  *
00055  * Regstart and reganch permit very fast decisions on suitable starting points
00056  * for a match, cutting down the work a lot.  Regmust permits fast rejection
00057  * of lines that cannot possibly match.  The regmust tests are costly enough
00058  * that regcomp() supplies a regmust only if the r.e. contains something
00059  * potentially expensive (at present, the only such thing detected is * or +
00060  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
00061  * supplied because the test in regexec() needs it and regcomp() is computing
00062  * it anyway.
00063  */
00064 
00065 /*
00066  * Structure for regexp "program".  This is essentially a linear encoding
00067  * of a nondeterministic finite-state machine (aka syntax charts or
00068  * "railroad normal form" in parsing technology).  Each node is an opcode
00069  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
00070  * all nodes except BRANCH implement concatenation; a "next" pointer with
00071  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
00072  * have one of the subtle syntax dependencies:  an individual BRANCH (as
00073  * opposed to a collection of them) is never concatenated with anything
00074  * because of operator precedence.)  The operand of some types of node is
00075  * a literal string; for others, it is a node leading into a sub-FSM.  In
00076  * particular, the operand of a BRANCH node is the first node of the branch.
00077  * (NB this is *not* a tree structure:  the tail of the branch connects
00078  * to the thing following the set of BRANCHes.)  The opcodes are:
00079  */
00080 
00081 /* definition   number  opnd?   meaning */
00082 #define END     0       /* no   End of program. */
00083 #define BOL     1       /* no   Match "" at beginning of line. */
00084 #define EOL     2       /* no   Match "" at end of line. */
00085 #define ANY     3       /* no   Match any one character. */
00086 #define ANYOF   4       /* str  Match any character in this string. */
00087 #define ANYBUT  5       /* str  Match any character not in this string. */
00088 #define BRANCH  6       /* node Match this alternative, or the next... */
00089 #define BACK    7       /* no   Match "", "next" ptr points backward. */
00090 #define EXACTLY 8       /* str  Match this string. */
00091 #define NOTHING 9       /* no   Match empty string. */
00092 #define STAR    10      /* node Match this (simple) thing 0 or more times. */
00093 #define PLUS    11      /* node Match this (simple) thing 1 or more times. */
00094 #define OPEN    20      /* no   Mark this point in input as start of #n. */
00095                 /*      OPEN+1 is number 1, etc. */
00096 #define CLOSE   30      /* no   Analogous to OPEN. */
00097 
00098 /*
00099  * Opcode notes:
00100  *
00101  * BRANCH       The set of branches constituting a single choice are hooked
00102  *              together with their "next" pointers, since precedence prevents
00103  *              anything being concatenated to any individual branch.  The
00104  *              "next" pointer of the last BRANCH in a choice points to the
00105  *              thing following the whole choice.  This is also where the
00106  *              final "next" pointer of each individual branch points; each
00107  *              branch starts with the operand node of a BRANCH node.
00108  *
00109  * BACK         Normal "next" pointers all implicitly point forward; BACK
00110  *              exists to make loop structures possible.
00111  *
00112  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
00113  *              BRANCH structures using BACK.  Simple cases (one character
00114  *              per match) are implemented with STAR and PLUS for speed
00115  *              and to minimize recursive plunges.
00116  *
00117  * OPEN,CLOSE   ...are numbered at compile time.
00118  */
00119 
00120 /*
00121  * A node is one char of opcode followed by two chars of "next" pointer.
00122  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
00123  * value is a positive offset from the opcode of the node containing it.
00124  * An operand, if any, simply follows the node.  (Note that much of the
00125  * code generation knows about this implicit relationship.)
00126  *
00127  * Using two bytes for the "next" pointer is vast overkill for most things,
00128  * but allows patterns to get big without disasters.
00129  */
00130 #define OP(p)   (*(p))
00131 #define NEXT(p) (((*((p)+1)&0377)<<8) + *((p)+2)&0377)
00132 #define OPERAND(p)      ((p) + 3)
00133 
00134 
00135 
00136 /*
00137  * Utility definitions.
00138  */
00139 #ifndef CHARBITS
00140 #define UCHARAT(p)      ((int)*(ushort *)(p))
00141 #else
00142 #define UCHARAT(p)      ((int)*(p)&CHARBITS)
00143 #endif
00144 
00145 #define FAIL(m) { regerror(m); return(NULL); }
00146 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?')
00147 #define META    "^$.[()|?+*\\"
00148 
00149 /*
00150  * Flags to be passed up and down.
00151  */
00152 #define HASWIDTH        01      /* Known never to match null string. */
00153 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
00154 #define SPSTART         04      /* Starts with * or +. */
00155 #define WORST           0       /* Worst case. */
00156 
00157 /*
00158  * Global work variables for regcomp().
00159  */
00160 static ushort *regparse;                /* Input-scan pointer. */
00161 static int regnpar;             /* () count. */
00162 static ushort regdummy;
00163 static ushort *regcode;         /* Code-emit pointer; &regdummy = don't. */
00164 static long regsize;            /* Code size. */
00165 
00166 /*
00167  * Forward declarations for regcomp()'s friends.
00168  */
00169 #ifndef STATIC
00170 #define STATIC  static
00171 #endif
00172 STATIC ushort *reg();
00173 STATIC ushort *regbranch();
00174 STATIC ushort *regpiece();
00175 STATIC ushort *regatom();
00176 STATIC ushort *regnode();
00177 STATIC ushort *regnext();
00178 STATIC void regc();
00179 STATIC void reginsert();
00180 STATIC void regtail();
00181 STATIC void regoptail();
00182 STATIC int Strcspn();
00183 
00184 /*
00185  - regcomp - compile a regular expression into internal code
00186  *
00187  * We can't allocate space until we know how big the compiled form will be,
00188  * but we can't compile it (and thus know how big it is) until we've got a
00189  * place to put the code.  So we cheat:  we compile it twice, once with code
00190  * generation turned off and size counting turned on, and once "for real".
00191  * This also means that we don't allocate space until we are sure that the
00192  * thing really will compile successfully, and we never have to move the
00193  * code and thus invalidate pointers into it.  (Note that it has to be in
00194  * one piece because free() must be able to free it all.)
00195  *
00196  * Beware that the optimization-preparation code in here knows about some
00197  * of the structure of the compiled regexp.
00198  */
00199 regexp *
00200 regcomp(exp)
00201 ushort *exp;
00202 {
00203   register regexp *r;
00204   register ushort *scan;
00205   register ushort *longest;
00206   register int len;
00207   int flags;
00208   extern char *emalloc();
00209 
00210   if (exp == NULL)
00211         FAIL("NULL argument");
00212 
00213   /* First pass: determine size, legality. */
00214   regparse = exp;
00215   regnpar = 1;
00216   regsize = 0L;
00217   regcode = &regdummy;
00218   regc((ushort) MAGIC);
00219   if (reg(0, &flags) == NULL)
00220         return(NULL);
00221 
00222   /* Small enough for pointer-storage convention? */
00223   if (regsize >= 32767L)                /* Probably could be 65535L. */
00224         FAIL("regexp too big");
00225 
00226   /* Allocate space. */
00227   r = (regexp *)emalloc(sizeof(regexp) + (unsigned)regsize * sizeof(ushort));
00228 
00229   /* Second pass: emit code. */
00230   regparse = exp;
00231   regnpar = 1;
00232   regcode = r->program;
00233   regc((ushort) MAGIC);
00234   if (reg(0, &flags) == NULL)
00235         return(NULL);
00236 
00237   /* Dig out information for optimizations. */
00238   r->regstart = '\0';   /* Worst-case defaults. */
00239   r->reganch = 0;
00240   r->regmust = NULL;
00241   r->regmlen = 0;
00242   scan = r->program+1;                  /* First BRANCH. */
00243   if (OP(regnext(scan)) == END) {               /* Only one top-level choice. */
00244         scan = OPERAND(scan);
00245 
00246         /* Starting-point info. */
00247         if (OP(scan) == EXACTLY)
00248                 r->regstart = *OPERAND(scan);
00249         else if (OP(scan) == BOL)
00250                 r->reganch++;
00251 
00252         /*
00253          * If there's something expensive in the r.e., find the
00254          * longest literal string that must appear and make it the
00255          * regmust.  Resolve ties in favor of later strings, since
00256          * the regstart check works with the beginning of the r.e.
00257          * and avoiding duplication strengthens checking.  Not a
00258          * strong reason, but sufficient in the absence of others.
00259          */
00260         if (flags&SPSTART) {
00261                 longest = NULL;
00262                 len = 0;
00263                 for (; scan != NULL; scan = regnext(scan))
00264                         if (OP(scan) == EXACTLY && Strlen(OPERAND(scan)) >= len) {
00265                                 longest = OPERAND(scan);
00266                                 len = Strlen(OPERAND(scan));
00267                         }
00268                 r->regmust = longest;
00269                 r->regmlen = len;
00270         }
00271   }
00272 
00273   return(r);
00274 }
00275 
00276 /*
00277  - reg - regular expression, i.e. main body or parenthesized thing
00278  *
00279  * Caller must absorb opening parenthesis.
00280  *
00281  * Combining parenthesis handling with the base level of regular expression
00282  * is a trifle forced, but the need to tie the tails of the branches to what
00283  * follows makes it hard to avoid.
00284  */
00285 static ushort *
00286 reg(paren, flagp)
00287 int paren;                      /* Parenthesized? */
00288 int *flagp;
00289 {
00290   register ushort *ret;
00291   register ushort *br;
00292   register ushort *ender;
00293   register int parno;
00294   int flags;
00295 
00296   *flagp = HASWIDTH;    /* Tentatively. */
00297 
00298   /* Make an OPEN node, if parenthesized. */
00299   if (paren) {
00300         if (regnpar >= NSUBEXP)
00301                 FAIL("too many ()");
00302         parno = regnpar;
00303         regnpar++;
00304         ret = regnode(OPEN+parno);
00305   } else
00306         ret = NULL;
00307 
00308   /* Pick up the branches, linking them together. */
00309   br = regbranch(&flags);
00310   if (br == NULL)
00311         return(NULL);
00312   if (ret != NULL)
00313         regtail(ret, br);       /* OPEN -> first. */
00314   else
00315         ret = br;
00316   if (!(flags&HASWIDTH))
00317         *flagp &= ~HASWIDTH;
00318   *flagp |= flags&SPSTART;
00319   while (*regparse == '|') {
00320         regparse++;
00321         br = regbranch(&flags);
00322         if (br == NULL)
00323                 return(NULL);
00324         regtail(ret, br);       /* BRANCH -> BRANCH. */
00325         if (!(flags&HASWIDTH))
00326                 *flagp &= ~HASWIDTH;
00327         *flagp |= flags&SPSTART;
00328   }
00329 
00330   /* Make a closing node, and hook it on the end. */
00331   ender = regnode((paren) ? CLOSE+parno : END); 
00332   regtail(ret, ender);
00333 
00334   /* Hook the tails of the branches to the closing node. */
00335   for (br = ret; br != NULL; br = regnext(br))
00336         regoptail(br, ender);
00337 
00338   /* Check for proper termination. */
00339   if (paren && *regparse++ != ')') {
00340         FAIL("unmatched ()");
00341   } else if (!paren && *regparse != '\0') {
00342         if (*regparse == ')') {
00343                 FAIL("unmatched ()");
00344         } else
00345                 FAIL("junk on end");    /* "Can't happen". */
00346         /* NOTREACHED */
00347   }
00348 
00349   return(ret);
00350 }
00351 
00352 /*
00353  - regbranch - one alternative of an | operator
00354  *
00355  * Implements the concatenation operator.
00356  */
00357 static ushort *
00358 regbranch(flagp)
00359 int *flagp;
00360 {
00361   register ushort *ret;
00362   register ushort *chain;
00363   register ushort *latest;
00364   int flags;
00365 
00366   *flagp = WORST;               /* Tentatively. */
00367 
00368   ret = regnode(BRANCH);
00369   chain = NULL;
00370   while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
00371         latest = regpiece(&flags);
00372         if (latest == NULL)
00373                 return(NULL);
00374         *flagp |= flags&HASWIDTH;
00375         if (chain == NULL)      /* First piece. */
00376                 *flagp |= flags&SPSTART;
00377         else
00378                 regtail(chain, latest);
00379         chain = latest;
00380   }
00381   if (chain == NULL)    /* Loop ran zero times. */
00382         (void) regnode(NOTHING);
00383 
00384   return(ret);
00385 }
00386 
00387 /*
00388  - regpiece - something followed by possible [*+?]
00389  *
00390  * Note that the branching code sequences used for ? and the general cases
00391  * of * and + are somewhat optimized:  they use the same NOTHING node as
00392  * both the endmarker for their branch list and the body of the last branch.
00393  * It might seem that this node could be dispensed with entirely, but the
00394  * endmarker role is not redundant.
00395  */
00396 static ushort *
00397 regpiece(flagp)
00398 int *flagp;
00399 {
00400   register ushort *ret;
00401   register ushort op;
00402   register ushort *next;
00403   int flags;
00404 
00405   ret = regatom(&flags);
00406   if (ret == NULL)
00407         return(NULL);
00408 
00409   op = *regparse;
00410   if (!ISMULT(op)) {
00411         *flagp = flags;
00412         return(ret);
00413   }
00414 
00415   if (!(flags&HASWIDTH) && op != '?')
00416         FAIL("*+ operand could be empty");
00417   *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
00418 
00419   if (op == '*' && (flags&SIMPLE))
00420         reginsert(STAR, ret);
00421   else if (op == '*') {
00422         /* Emit x* as (x&|), where & means "self". */
00423         reginsert(BRANCH, ret);                 /* Either x */
00424         regoptail(ret, regnode(BACK));          /* and loop */
00425         regoptail(ret, ret);                    /* back */
00426         regtail(ret, regnode(BRANCH));          /* or */
00427         regtail(ret, regnode(NOTHING));         /* null. */
00428   } else if (op == '+' && (flags&SIMPLE))
00429         reginsert(PLUS, ret);
00430   else if (op == '+') {
00431         /* Emit x+ as x(&|), where & means "self". */
00432         next = regnode(BRANCH);                 /* Either */
00433         regtail(ret, next);
00434         regtail(regnode(BACK), ret);            /* loop back */
00435         regtail(next, regnode(BRANCH));         /* or */
00436         regtail(ret, regnode(NOTHING));         /* null. */
00437   } else if (op == '?') {
00438         /* Emit x? as (x|) */
00439         reginsert(BRANCH, ret);                 /* Either x */
00440         regtail(ret, regnode(BRANCH));          /* or */
00441         next = regnode(NOTHING);                /* null. */
00442         regtail(ret, next);
00443         regoptail(ret, next);
00444   }
00445   regparse++;
00446   if (ISMULT(*regparse))
00447         FAIL("nested *?+");
00448 
00449   return(ret);
00450 }
00451 
00452 /*
00453  - regatom - the lowest level
00454  *
00455  * Optimization:  gobbles an entire sequence of ordinary characters so that
00456  * it can turn them into a single node, which is smaller to store and
00457  * faster to run.  Backslashed characters are exceptions, each becoming a
00458  * separate node; the code is simpler that way and it's not worth fixing.
00459  */
00460 static ushort *
00461 regatom(flagp)
00462 int *flagp;
00463 {
00464   register ushort *ret;
00465   int flags;
00466   ushort c, d;
00467 
00468   *flagp = WORST;               /* Tentatively. */
00469 
00470   switch ((int) *regparse++) {
00471   case '^':
00472         ret = regnode(BOL);
00473         break;
00474   case '$':
00475         ret = regnode(EOL);
00476         break;
00477   case '.':
00478         ret = regnode(ANY);
00479         *flagp |= HASWIDTH|SIMPLE;
00480         break;
00481   case '[': {
00482                 register int class;
00483                 register int classend;
00484                 register int c;
00485 
00486                 if (*regparse == '^') { /* Complement of range. */
00487                         ret = regnode(ANYBUT);
00488                         regparse++;
00489                 } else
00490                         ret = regnode(ANYOF);
00491                 if (*regparse == ']' || *regparse == '-')
00492                         regc(*regparse++);
00493                 while (*regparse != '\0' && *regparse != ']') {
00494                         if (*regparse == '-') {
00495                                 regparse++;
00496                                 if (*regparse == ']' || *regparse == '\0')
00497                                         regc((ushort) '-');
00498                                 else {
00499                                         class = UCHARAT(regparse-2)+1;
00500                                         classend = UCHARAT(regparse);
00501                                         if (class > classend+1)
00502                                                 FAIL("invalid [] range");
00503                                         regc((ushort) 0xffff);
00504                                         regc((ushort) class);
00505                                         regc((ushort) classend);
00506                                         regparse++;
00507                                 }
00508                         } else {
00509                                 if ((c = *regparse++) == '\\') {
00510                                         if ((c = *regparse++) == 'n')
00511                                                 c = '\n';
00512                                         else if (c == 't')
00513                                                 c = '\t';
00514                                 }
00515                                 regc(c);
00516                         }
00517                 }
00518                 regc((ushort) 0);
00519                 if (*regparse != ']')
00520                         FAIL("unmatched []");
00521                 regparse++;
00522                 *flagp |= HASWIDTH|SIMPLE;
00523         }
00524         break;
00525   case '(':
00526         ret = reg(1, &flags);
00527         if (ret == NULL)
00528                 return(NULL);
00529         *flagp |= flags&(HASWIDTH|SPSTART);
00530         break;
00531   case '\0':
00532   case '|':
00533   case ')':
00534         FAIL("internal urp");   /* Supposed to be caught earlier. */
00535         break;
00536   case '?':
00537   case '+':
00538   case '*':
00539         FAIL("?+* follows nothing");
00540         break;
00541   case '\\':
00542         if (*regparse == '\0')
00543                 FAIL("trailing \\");
00544         ret = regnode(EXACTLY);
00545 /*
00546         regc(*regparse++);
00547 */
00548         c = *regparse++;
00549         if (c == 'n')
00550                 c = '\n';
00551         else if (c == 't')
00552                 c = '\t';
00553         else if (c == 'f')
00554                 c = '\f';
00555         else if (c == '\r')
00556                 c = '\r';
00557         else if (c == '\b')
00558                 c = '\b';
00559         else if (IsDigid(c)) {
00560                 d = c - '0';
00561                 if (IsDigid(*regparse)) {
00562                         d = d * 8 + *regparse++ - '0';
00563                         if (IsDigid(*regparse))
00564                                 d = d * 8 + *regparse++ - '0';
00565                 }
00566                 c = d;
00567         }
00568         regc(c);
00569         regc((ushort) 0);
00570         *flagp |= HASWIDTH|SIMPLE;
00571         break;
00572   default: {
00573                 register int len;
00574                 register char ender;
00575 
00576                 regparse--;
00577                 len = Strcspn(regparse, META);
00578                 if (len <= 0)
00579                         FAIL("internal disaster");
00580                 ender = *(regparse+len);
00581                 if (len > 1 && ISMULT(ender))
00582                         len--;          /* Back off clear of ?+* operand. */
00583                 *flagp |= HASWIDTH;
00584                 if (len == 1)
00585                         *flagp |= SIMPLE;
00586                 ret = regnode(EXACTLY);
00587                 while (len > 0) {
00588                         regc(*regparse++);
00589                         len--;
00590                 }
00591                 regc((ushort) 0);
00592         }
00593         break;
00594   }
00595 
00596   return(ret);
00597 }
00598 
00599 IsDigid(c) ushort c;
00600 {
00601   return '0' <= c && c <= '9';
00602 }
00603 
00604 /*
00605  - regnode - emit a node
00606  */
00607 static ushort *                 /* Location. */
00608 regnode(op)
00609 ushort op;
00610 {
00611   register ushort *ret;
00612   register ushort *ptr;
00613 
00614   ret = regcode;
00615   if (ret == &regdummy) {
00616         regsize += 3;
00617         return(ret);
00618   }
00619 
00620   ptr = ret;
00621   *ptr++ = op;
00622   *ptr++ = '\0';                /* Null "next" pointer. */
00623   *ptr++ = '\0';
00624   regcode = ptr;
00625 
00626   return(ret);
00627 }
00628 
00629 /*
00630  - regc - emit (if appropriate) a byte of code
00631  */
00632 static void
00633 regc(b)
00634 ushort b;
00635 {
00636   if (regcode != &regdummy)
00637         *regcode++ = b;
00638   else
00639         regsize++;
00640 }
00641 
00642 /*
00643  - reginsert - insert an operator in front of already-emitted operand
00644  *
00645  * Means relocating the operand.
00646  */
00647 static void
00648 reginsert(op, opnd)
00649 ushort op;
00650 ushort *opnd;
00651 {
00652   register ushort *src;
00653   register ushort *dst;
00654   register ushort *place;
00655 
00656   if (regcode == &regdummy) {
00657         regsize += 3;
00658         return;
00659   }
00660 
00661   src = regcode;
00662   regcode += 3;
00663   dst = regcode;
00664   while (src > opnd)
00665         *--dst = *--src;
00666 
00667   place = opnd;         /* Op node, where operand used to be. */
00668   *place++ = op;
00669   *place++ = '\0';
00670   *place++ = '\0';
00671 }
00672 
00673 /*
00674  - regtail - set the next-pointer at the end of a node chain
00675  */
00676 static void
00677 regtail(p, val)
00678 ushort *p;
00679 ushort *val;
00680 {
00681   register ushort *scan;
00682   register ushort *temp;
00683   register int offset;
00684 
00685   if (p == &regdummy)
00686         return;
00687 
00688   /* Find last node. */
00689   scan = p;
00690   for (;;) {
00691         temp = regnext(scan);
00692         if (temp == NULL)
00693                 break;
00694         scan = temp;
00695   }
00696 
00697   if (OP(scan) == BACK)
00698         offset = scan - val;
00699   else
00700         offset = val - scan;
00701   *(scan+1) = (offset>>8)&0377;
00702   *(scan+2) = offset&0377;
00703 }
00704 
00705 /*
00706  - regoptail - regtail on operand of first argument; nop if operandless
00707  */
00708 static void
00709 regoptail(p, val)
00710 ushort *p;
00711 ushort *val;
00712 {
00713   /* "Operandless" and "op != BRANCH" are synonymous in practice. */
00714   if (p == NULL || p == &regdummy || OP(p) != BRANCH)
00715         return;
00716   regtail(OPERAND(p), val);
00717 }
00718 
00719 /*
00720  * regexec and friends
00721  */
00722 
00723 /*
00724  * Global work variables for regexec().
00725  */
00726 static ushort *reginput;                /* String-input pointer. */
00727 static ushort *regbol;          /* Beginning of input, for ^ check. */
00728 static ushort **regstartp;      /* Pointer to startp array. */
00729 static ushort **regendp;                /* Ditto for endp. */
00730 
00731 /*
00732  * Forwards.
00733  */
00734 STATIC int regtry();
00735 STATIC int regmatch();
00736 STATIC int regrepeat();
00737 
00738 #ifdef DEBUG
00739 int regnarrate = 0;
00740 void regdump();
00741 STATIC char *regprop();
00742 #endif
00743 
00744 /*
00745  - regexec - match a regexp against a string
00746  */
00747 int
00748 regexec(prog, string, bolflag)
00749 register regexp *prog;
00750 register ushort *string;
00751 int bolflag;
00752 {
00753   register ushort *s;
00754   extern ushort *Strchr();
00755 
00756   /* Be paranoid... */
00757   if (prog == NULL || string == NULL) {
00758         regerror("NULL parameter");
00759         return(0);
00760   }
00761 
00762   /* Check validity of program. */
00763   if (prog->program[0] != MAGIC) {
00764         regerror("corrupted program");
00765         return(0);
00766   }
00767 
00768   /* If there is a "must appear" string, look for it. */
00769   if (prog->regmust != NULL) {
00770         s = string;
00771         while ((s = Strchr(s, prog->regmust[0])) != NULL) {
00772                 if (Strncmp(s, prog->regmust, prog->regmlen) == 0)
00773                         break;  /* Found it. */
00774                 s++;
00775         }
00776         if (s == NULL)  /* Not present. */
00777                 return(0);
00778   }
00779 
00780   /* Mark beginning of line for ^ . */
00781   if(bolflag)
00782         regbol = string;
00783   else
00784         regbol = NULL;
00785 
00786   /* Simplest case:  anchored match need be tried only once. */
00787   if (prog->reganch)
00788         return(regtry(prog, string));
00789 
00790   /* Messy cases:  unanchored match. */
00791   s = string;
00792   if (prog->regstart != '\0') {
00793         /* We know what char it must start with. */
00794         while ((s = Strchr(s, prog->regstart)) != NULL) {
00795                 if (regtry(prog, s))
00796                         return(1);
00797                 s++;
00798         }
00799 }
00800   else
00801         /* We don't -- general case. */
00802         do {
00803                 if (regtry(prog, s))
00804                         return(1);
00805         } while (*s++ != '\0');
00806 
00807   /* Failure. */
00808   return(0);
00809 }
00810 
00811 /*
00812  - regtry - try match at specific point
00813  */
00814 static int                      /* 0 failure, 1 success */
00815 regtry(prog, string)
00816 regexp *prog;
00817 ushort *string;
00818 {
00819   register int i;
00820   register ushort **sp;
00821   register ushort **ep;
00822 
00823   reginput = string;
00824   regstartp = prog->startp;
00825   regendp = prog->endp;
00826 
00827   sp = prog->startp;
00828   ep = prog->endp;
00829   for (i = NSUBEXP; i > 0; i--) {
00830         *sp++ = NULL;
00831         *ep++ = NULL;
00832   }
00833   if (regmatch(prog->program + 1)) {
00834         prog->startp[0] = string;
00835         prog->endp[0] = reginput;
00836         return(1);
00837   } else
00838         return(0);
00839 }
00840 
00841 /*
00842  - regmatch - main matching routine
00843  *
00844  * Conceptually the strategy is simple:  check to see whether the current
00845  * node matches, call self recursively to see whether the rest matches,
00846  * and then act accordingly.  In practice we make some effort to avoid
00847  * recursion, in particular by going through "ordinary" nodes (that don't
00848  * need to know whether the rest of the match failed) by a loop instead of
00849  * by recursion.
00850  */
00851 static int                      /* 0 failure, 1 success */
00852 regmatch(prog)
00853 ushort *prog;
00854 {
00855   register ushort *scan;        /* Current node. */
00856   ushort *next;         /* Next node. */
00857   extern ushort *Strchr();
00858 
00859   scan = prog;
00860 #ifdef DEBUG
00861   if (scan != NULL && regnarrate)
00862         fprintf(stderr, "%s(\n", regprop(scan));
00863 #endif
00864   while (scan != NULL) {
00865 #ifdef DEBUG
00866         if (regnarrate)
00867                 fprintf(stderr, "%s...\n", regprop(scan));
00868 #endif
00869         next = regnext(scan);
00870 
00871         switch ((int) OP(scan)) {
00872         case BOL:
00873                 if (reginput != regbol)
00874                         return(0);
00875                 break;
00876         case EOL:
00877                 if (*reginput != '\0')
00878                         return(0);
00879                 break;
00880         case ANY:
00881                 if (*reginput == '\0')
00882                         return(0);
00883                 reginput++;
00884                 break;
00885         case EXACTLY: {
00886                         register int len;
00887                         register ushort *opnd;
00888 
00889                         opnd = OPERAND(scan);
00890                         /* Inline the first character, for speed. */
00891                         if (*opnd != *reginput)
00892                                 return(0);
00893                         len = Strlen(opnd);
00894                         if (len > 1 && Strncmp(opnd, reginput, len) != 0)
00895                                 return(0);
00896                         reginput += len;
00897                 }
00898                 break;
00899         case ANYOF:
00900                 if (*reginput == '\0' || isthere(OPERAND(scan), *reginput) == 0)
00901                         return(0);
00902                 reginput++;
00903                 break;
00904         case ANYBUT:
00905                 if (*reginput == '\0' || isthere(OPERAND(scan), *reginput) != 0)
00906                         return(0);
00907                 reginput++;
00908                 break;
00909         case NOTHING:
00910                 break;
00911         case BACK:
00912                 break;
00913         case OPEN+1:
00914         case OPEN+2:
00915         case OPEN+3:
00916         case OPEN+4:
00917         case OPEN+5:
00918         case OPEN+6:
00919         case OPEN+7:
00920         case OPEN+8:
00921         case OPEN+9: {
00922                         register int no;
00923                         register ushort *save;
00924 
00925                         no = OP(scan) - OPEN;
00926                         save = reginput;
00927 
00928                         if (regmatch(next)) {
00929                                 /*
00930                                  * Don't set startp if some later
00931                                  * invocation of the same parentheses
00932                                  * already has.
00933                                  */
00934                                 if (regstartp[no] == NULL)
00935                                         regstartp[no] = save;
00936                                 return(1);
00937                         } else
00938                                 return(0);
00939                 }
00940                 break;
00941         case CLOSE+1:
00942         case CLOSE+2:
00943         case CLOSE+3:
00944         case CLOSE+4:
00945         case CLOSE+5:
00946         case CLOSE+6:
00947         case CLOSE+7:
00948         case CLOSE+8:
00949         case CLOSE+9: {
00950                         register int no;
00951                         register ushort *save;
00952 
00953                         no = OP(scan) - CLOSE;
00954                         save = reginput;
00955 
00956                         if (regmatch(next)) {
00957                                 /*
00958                                  * Don't set endp if some later
00959                                  * invocation of the same parentheses
00960                                  * already has.
00961                                  */
00962                                 if (regendp[no] == NULL)
00963                                         regendp[no] = save;
00964                                 return(1);
00965                         } else
00966                                 return(0);
00967                 }
00968                 break;
00969         case BRANCH: {
00970                         register ushort *save;
00971 
00972                         if (OP(next) != BRANCH)         /* No choice. */
00973                                 next = OPERAND(scan);   /* Avoid recursion. */
00974                         else {
00975                                 do {
00976                                         save = reginput;
00977                                         if (regmatch(OPERAND(scan)))
00978                                                 return(1);
00979                                         reginput = save;
00980                                         scan = regnext(scan);
00981                                 } while (scan != NULL && OP(scan) == BRANCH);
00982                                 return(0);
00983                                 /* NOTREACHED */
00984                         }
00985                 }
00986                 break;
00987         case STAR:
00988         case PLUS: {
00989                         register ushort nextch;
00990                         register int no;
00991                         register ushort *save;
00992                         register int min;
00993 
00994                         /*
00995                          * Lookahead to avoid useless match attempts
00996                          * when we know what character comes next.
00997                          */
00998                         nextch = '\0';
00999                         if (OP(next) == EXACTLY)
01000                                 nextch = *OPERAND(next);
01001                         min = (OP(scan) == STAR) ? 0 : 1;
01002                         save = reginput;
01003                         no = regrepeat(OPERAND(scan));
01004                         while (no >= min) {
01005                                 /* If it could work, try it. */
01006                                 if (nextch == '\0' || *reginput == nextch)
01007                                         if (regmatch(next))
01008                                                 return(1);
01009                                 /* Couldn't or didn't -- back up. */
01010                                 no--;
01011                                 reginput = save + no;
01012                         }
01013                         return(0);
01014                 }
01015                 break;
01016         case END:
01017                 return(1);      /* Success! */
01018                 break;
01019         default:
01020                 regerror("memory corruption");
01021                 return(0);
01022                 break;
01023         }
01024 
01025         scan = next;
01026   }
01027 
01028   /*
01029    * We get here only if there's trouble -- normally "case END" is
01030    * the terminating point.
01031    */
01032   regerror("corrupted pointers");
01033   return(0);
01034 }
01035 
01036 /*
01037  - regrepeat - repeatedly match something simple, report how many
01038  */
01039 static int
01040 regrepeat(p)
01041 ushort *p;
01042 {
01043   register int count = 0;
01044   register ushort *scan;
01045   register ushort *opnd;
01046 
01047   scan = reginput;
01048   opnd = OPERAND(p);
01049   switch (OP(p)) {
01050   case ANY:
01051         count = Strlen(scan);
01052         scan += count;
01053         break;
01054   case EXACTLY:
01055         while (*opnd == *scan) {
01056                 count++;
01057                 scan++;
01058         }
01059         break;
01060   case ANYOF:
01061         while (*scan != '\0' && isthere(opnd, *scan) != 0) {
01062                 count++;
01063                 scan++;
01064         }
01065         break;
01066   case ANYBUT:
01067         while (*scan != '\0' && isthere(opnd, *scan) == 0) {
01068                 count++;
01069                 scan++;
01070         }
01071         break;
01072   default:              /* Oh dear.  Called inappropriately. */
01073         regerror("internal foulup");
01074         count = 0;      /* Best compromise. */
01075         break;
01076   }
01077   reginput = scan;
01078 
01079   return(count);
01080 }
01081 
01082 /*
01083  - regnext - dig the "next" pointer out of a node
01084  */
01085 static ushort *
01086 regnext(p)
01087 register ushort *p;
01088 {
01089   register int offset;
01090 
01091   if (p == &regdummy)
01092         return(NULL);
01093 
01094   offset = NEXT(p);
01095   if (offset == 0)
01096         return(NULL);
01097 
01098   if (OP(p) == BACK)
01099         return(p-offset);
01100   else
01101         return(p+offset);
01102 }
01103 
01104 #ifdef DEBUG
01105 
01106 STATIC char *regprop();
01107 
01108 /*
01109  - regdump - dump a regexp onto stdout in vaguely comprehensible form
01110  */
01111 void
01112 regdump(r)
01113 regexp *r;
01114 {
01115   register ushort *s;
01116   register ushort op = EXACTLY; /* Arbitrary non-END op. */
01117   register ushort *next;
01118 
01119 
01120   s = r->program + 1;
01121   while (op != END) {   /* While that wasn't END last time... */
01122         op = OP(s);
01123         printf("%2d%s", s-r->program, regprop(s));      /* Where, what. */
01124         next = regnext(s);
01125         if (next == NULL)               /* Next ptr. */
01126                 printf("(0)");
01127         else 
01128                 printf("(%d)", (s-r->program)+(next-s));
01129         s += 3;
01130         if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
01131                 /* Literal string, where present. */
01132                 while (*s != '\0') {
01133                         if (*s == 0xffff) {     /* range */
01134                                 kputchar(*++s); putchar('-'); ++s;
01135                         }
01136                         kputchar(*s++);
01137                 }
01138                 s++;
01139         }
01140         putchar('\n');
01141   }
01142 
01143   /* Header fields of interest. */
01144   if (r->regstart != '\0') {
01145         fputs("start `", stdout); kputchar(r->regstart); fputs("' ", stdout);
01146   }
01147   if (r->reganch)
01148         printf("anchored ");
01149   if (r->regmust != NULL) {
01150         fputs("must have \"", stdout); kputs(r->regmust); putchar('"');
01151   }
01152   printf("\n");
01153 }
01154 
01155 kputchar(c) ushort c;
01156 {
01157   if (c & 0xff00)
01158         putchar(c >> 8);
01159   putchar(c & 0xff);
01160 }
01161 
01162 kputs(s) ushort *s;
01163 {
01164   while (*s)
01165         kputchar(*s++);
01166 }
01167 
01168 /*
01169  - regprop - printable representation of opcode
01170  */
01171 static char *
01172 regprop(op)
01173 ushort *op;
01174 {
01175   register char *p;
01176   static char buf[50];
01177 
01178   (void) strcpy(buf, ":");
01179 
01180   switch ((int) OP(op)) {
01181   case BOL:
01182         p = "BOL";
01183         break;
01184   case EOL:
01185         p = "EOL";
01186         break;
01187   case ANY:
01188         p = "ANY";
01189         break;
01190   case ANYOF:
01191         p = "ANYOF";
01192         break;
01193   case ANYBUT:
01194         p = "ANYBUT";
01195         break;
01196   case BRANCH:
01197         p = "BRANCH";
01198         break;
01199   case EXACTLY:
01200         p = "EXACTLY";
01201         break;
01202   case NOTHING:
01203         p = "NOTHING";
01204         break;
01205   case BACK:
01206         p = "BACK";
01207         break;
01208   case END:
01209         p = "END";
01210         break;
01211   case OPEN+1:
01212   case OPEN+2:
01213   case OPEN+3:
01214   case OPEN+4:
01215   case OPEN+5:
01216   case OPEN+6:
01217   case OPEN+7:
01218   case OPEN+8:
01219   case OPEN+9:
01220         sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
01221         p = NULL;
01222         break;
01223   case CLOSE+1:
01224   case CLOSE+2:
01225   case CLOSE+3:
01226   case CLOSE+4:
01227   case CLOSE+5:
01228   case CLOSE+6:
01229   case CLOSE+7:
01230   case CLOSE+8:
01231   case CLOSE+9:
01232         sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
01233         p = NULL;
01234         break;
01235   case STAR:
01236         p = "STAR";
01237         break;
01238   case PLUS:
01239         p = "PLUS";
01240         break;
01241   default:
01242         regerror("corrupted opcode");
01243         break;
01244   }
01245   if (p != NULL)
01246         (void) strcat(buf, p);
01247   return(buf);
01248 }
01249 #endif
01250 
01251 /*
01252  * The following is provided for those people who do not have strcspn() in
01253  * their C libraries.  They should get off their butts and do something
01254  * about it; at least one public-domain implementation of those (highly
01255  * useful) string routines has been published on Usenet.
01256  */
01257 /*
01258  * strcspn - find length of initial segment of s1 consisting entirely
01259  * of characters not from s2
01260  */
01261 
01262 static int
01263 Strcspn(s1, s2)
01264 ushort *s1;
01265 unsigned char *s2;
01266 {
01267   register ushort *scan1;
01268   register unsigned char *scan2;
01269   register int count;
01270 
01271   count = 0;
01272   for (scan1 = s1; *scan1 != '\0'; scan1++) {
01273         for (scan2 = s2; *scan2 != '\0';)       /* ++ moved down. */
01274                 if (*scan1 == *scan2++)
01275                         return(count);
01276         count++;
01277   }
01278   return(count);
01279 }
01280 
01281 isthere(s, c) ushort *s, c;
01282 {
01283   register unsigned int c1, c2;
01284 
01285   for ( ; *s; s++) {
01286         if (*s == 0xffff) {     /* range */
01287                 c1 = *++s; c2 = *++s;
01288                 if (c1 <= c && c <= c2)
01289                         return 1;
01290         }
01291         else if (*s == c)
01292                 return 1;
01293   }
01294   return 0;
01295 }
01296 
01297 ushort *
01298 Strchr(s, c) ushort *s, c;
01299 {
01300   for ( ; *s; s++)
01301         if (*s == c)
01302                 return s;
01303   return NULL;
01304 }
01305 
01306 Strncmp(s, t, n) ushort *s, *t;
01307 {
01308   for ( ; --n > 0 && *s == *t; s++, t++)
01309         ;
01310   return *s - *t;
01311 }
01312 
01313 Strlen(s) ushort *s;
01314 {
01315   int i;
01316 
01317   for (i = 0; *s; i++, s++)
01318         ;
01319   return i;
01320 }
01321 
01322 ushort kbuf[BUFSIZ];
01323 
01324 char *
01325 mkpat(s) char *s;
01326 {
01327   sjtok(kbuf, s);
01328   return (char *) regcomp(kbuf);
01329 }
01330 
01331 match(p, s) regexp *p; char *s;
01332 {
01333   register int i;
01334 
01335   sjtok(kbuf, s);
01336   if (i = regexec(p, kbuf, 1)) {
01337         r_start = p->startp[0] - kbuf + 1;
01338         r_length = p->endp[0] - p->startp[0];
01339   }
01340   else
01341         r_start = r_length = 0;
01342   return i;
01343 }
01344 
01345 sjtok(s, t) ushort *s; unsigned char *t;
01346 {
01347   register c;
01348 
01349   for ( ; *t; t++) {
01350         if (isKanji(c = *t))
01351                 c = (c << 8) | (*++t & 0xff);
01352         *s++ = c;
01353   }
01354   *s = 0;
01355 }
01356 
01357 ktosj(s, t) unsigned char *s; ushort *t;
01358 {
01359   register c;
01360 
01361   while (*t) {
01362         if ((c = *t++) & 0xff00)
01363                 *s++ = c >> 8;
01364         *s++ = c & 0xff;
01365   }
01366   *s = '\0';
01367 }
01368 
01369 regsub(dst, exp, src, pat, pos) ushort *dst, *src, *pat; regexp *exp;
01370 {       /* dst <-- s/src/pat/pos        global substitution for pos == 0 */
01371   register int c, i;
01372   register ushort *loc1, *loc2, *s, *t, *u;
01373   register int n = 0;
01374 
01375   if (exp->program[0] != MAGIC) {
01376         regerror("damaged regexp fed to regsub");
01377         return 0;
01378   }
01379   while (*src) {
01380 next:
01381         if (regexec(exp, src, 1) == 0)
01382                 break;
01383         loc1 = exp->startp[0]; loc2 = exp->endp[0];
01384         if (pos-- > 1) {
01385                 while (src < loc2)
01386                         *dst++ = *src++;
01387                 goto next;
01388         }
01389         while (src < loc1)
01390                 *dst++ = *src++;
01391         for (s = pat; c = *s++; ) {
01392                 if (c == '&')
01393                         i = 0;
01394                 else if (c == '\\' && '0' <= *s && *s <= '9')
01395                         i = *s++ - '0';
01396                 else {
01397                         if (c == '\\' && (*s == '\\' || *s == '&'))
01398                                 c = *s++;
01399                         *dst++ = c;
01400                         continue;
01401                 }
01402                 if ((t = exp->startp[i]) != NULL
01403                         && (u = exp->endp[i]) != NULL) {
01404                         while (t < u)
01405                                 *dst++ = *t++;
01406                 }
01407         }
01408         src = loc2;
01409         n++;
01410         if (pos == 0)
01411                 break;
01412   }
01413   while (*src)
01414         *dst++ = *src++;
01415   *dst++ = 0;
01416   return n;
01417 }
01418 
01419 static ushort kbuf1[BUFSIZ], kbuf2[BUFSIZ];
01420 
01421 Sub(u, exp, str, s, t, pos) char *exp; char *s, *t, *u;
01422 {
01423   register int i;
01424   regexp *r;
01425 
01426   if (str) {
01427         sjtok(kbuf, exp);
01428         r = regcomp(kbuf);
01429   }
01430   else
01431         r = (regexp *) exp;
01432   sjtok(kbuf, s);
01433   sjtok(kbuf1, t);
01434   i = regsub(kbuf2, r, kbuf1, kbuf, pos);
01435   ktosj(u, kbuf2);
01436   if (str)
01437         sfree(r);
01438 
01439   return i;
01440 }

Generated on Fri Apr 14 22:56:39 2006 for minix by  doxygen 1.4.6