00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040 #if defined(LIBC_SCCS) && !defined(lint)
00041 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
00042 #endif
00043
00044 #include <sys/types.h>
00045 #include <stdio.h>
00046 #include <string.h>
00047 #ifdef __minix_vmd
00048 #include <bsd/asciictype.h>
00049 #else
00050 #include <ctype.h>
00051 #endif
00052 #include <limits.h>
00053 #include <stdlib.h>
00054 #include <regex.h>
00055
00056 #include "utils.h"
00057 #include "regex2.h"
00058
00059 #include "cclass.h"
00060 #include "cname.h"
00061
00062
00063
00064
00065
00066 struct parse {
00067 char *next;
00068 char *end;
00069 int error;
00070 sop *strip;
00071 sopno ssize;
00072 sopno slen;
00073 int ncsalloc;
00074 struct re_guts *g;
00075 # define NPAREN 10
00076 sopno pbegin[NPAREN];
00077 sopno pend[NPAREN];
00078 };
00079
00080
00081 #ifdef __cplusplus
00082 extern "C" {
00083 #endif
00084
00085
00086 static void p_ere(struct parse *p, int stop);
00087 static void p_ere_exp(struct parse *p);
00088 static void p_str(struct parse *p);
00089 static void p_bre(struct parse *p, int end1, int end2);
00090 static int p_simp_re(struct parse *p, int starordinary);
00091 static int p_count(struct parse *p);
00092 static void p_bracket(struct parse *p);
00093 static void p_b_term(struct parse *p, cset *cs);
00094 static void p_b_cclass(struct parse *p, cset *cs);
00095 static void p_b_eclass(struct parse *p, cset *cs);
00096 static char p_b_symbol(struct parse *p);
00097 static char p_b_coll_elem(struct parse *p, int endc);
00098 static char othercase(int ch);
00099 static void bothcases(struct parse *p, int ch);
00100 static void ordinary(struct parse *p, int ch);
00101 static void nonnewline(struct parse *p);
00102 static void repeat(struct parse *p, sopno start, int from, int to);
00103 static int seterr(struct parse *p, int e);
00104 static cset *allocset(struct parse *p);
00105 static void freeset(struct parse *p, cset *cs);
00106 static int freezeset(struct parse *p, cset *cs);
00107 static int firstch(struct parse *p, cset *cs);
00108 static int nch(struct parse *p, cset *cs);
00109 static void mcadd(struct parse *p, cset *cs, char *cp);
00110 static void mcsub(cset *cs, char *cp);
00111 static int mcin(cset *cs, char *cp);
00112 static char *mcfind(cset *cs, char *cp);
00113 static void mcinvert(struct parse *p, cset *cs);
00114 static void mccase(struct parse *p, cset *cs);
00115 static int isinsets(struct re_guts *g, int c);
00116 static int samesets(struct re_guts *g, int c1, int c2);
00117 static void categorize(struct parse *p, struct re_guts *g);
00118 static sopno dupl(struct parse *p, sopno start, sopno finish);
00119 static void doemit(struct parse *p, sop op, size_t opnd);
00120 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
00121 static void dofwd(struct parse *p, sopno pos, sop value);
00122 static void enlarge(struct parse *p, sopno size);
00123 static void stripsnug(struct parse *p, struct re_guts *g);
00124 static void findmust(struct parse *p, struct re_guts *g);
00125 static sopno pluscount(struct parse *p, struct re_guts *g);
00126
00127 #ifdef __cplusplus
00128 }
00129 #endif
00130
00131
00132 static char nuls[10];
00133
00134
00135
00136
00137
00138 #define PEEK() (*p->next)
00139 #define PEEK2() (*(p->next+1))
00140 #define MORE() (p->next < p->end)
00141 #define MORE2() (p->next+1 < p->end)
00142 #define SEE(c) (MORE() && PEEK() == (c))
00143 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
00144 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
00145 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
00146 #define NEXT() (p->next++)
00147 #define NEXT2() (p->next += 2)
00148 #define NEXTn(n) (p->next += (n))
00149 #define GETNEXT() (*p->next++)
00150 #define SETERROR(e) seterr(p, (e))
00151 #define REQUIRE(co, e) ((co) || SETERROR(e))
00152 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
00153 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
00154 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
00155 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
00156 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
00157 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
00158 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
00159 #define HERE() (p->slen)
00160 #define THERE() (p->slen - 1)
00161 #define THERETHERE() (p->slen - 2)
00162 #define DROP(n) (p->slen -= (n))
00163
00164 #ifndef NDEBUG
00165 static int never = 0;
00166 #else
00167 #define never 0
00168 #endif
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182 int
00183 regcomp(preg, pattern, cflags)
00184 regex_t *preg;
00185 const char *pattern;
00186 int cflags;
00187 {
00188 struct parse pa;
00189 register struct re_guts *g;
00190 register struct parse *p = &pa;
00191 register int i;
00192 register size_t len;
00193 #ifdef REDEBUG
00194 # define GOODFLAGS(f) (f)
00195 #else
00196 # define GOODFLAGS(f) ((f)&~REG_DUMP)
00197 #endif
00198
00199 cflags = GOODFLAGS(cflags);
00200 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
00201 return(REG_INVARG);
00202
00203 if (cflags®_PEND) {
00204 if (preg->re_endp < pattern)
00205 return(REG_INVARG);
00206 len = preg->re_endp - pattern;
00207 } else
00208 len = strlen((char *)pattern);
00209
00210
00211 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
00212 (NC-1)*sizeof(cat_t));
00213 if (g == NULL)
00214 return(REG_ESPACE);
00215 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;
00216 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
00217 p->slen = 0;
00218 if (p->strip == NULL) {
00219 free((char *)g);
00220 return(REG_ESPACE);
00221 }
00222
00223
00224 p->g = g;
00225 p->next = (char *)pattern;
00226 p->end = p->next + len;
00227 p->error = 0;
00228 p->ncsalloc = 0;
00229 for (i = 0; i < NPAREN; i++) {
00230 p->pbegin[i] = 0;
00231 p->pend[i] = 0;
00232 }
00233 g->csetsize = NC;
00234 g->sets = NULL;
00235 g->setbits = NULL;
00236 g->ncsets = 0;
00237 g->cflags = cflags;
00238 g->iflags = 0;
00239 g->nbol = 0;
00240 g->neol = 0;
00241 g->must = NULL;
00242 g->mlen = 0;
00243 g->nsub = 0;
00244 g->ncategories = 1;
00245 g->categories = &g->catspace[-(CHAR_MIN)];
00246 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
00247 g->backrefs = 0;
00248
00249
00250 EMIT(OEND, 0);
00251 g->firststate = THERE();
00252 if (cflags®_EXTENDED)
00253 p_ere(p, OUT);
00254 else if (cflags®_NOSPEC)
00255 p_str(p);
00256 else
00257 p_bre(p, OUT, OUT);
00258 EMIT(OEND, 0);
00259 g->laststate = THERE();
00260
00261
00262 categorize(p, g);
00263 stripsnug(p, g);
00264 findmust(p, g);
00265 g->nplus = pluscount(p, g);
00266 g->magic = MAGIC2;
00267 preg->re_nsub = g->nsub;
00268 preg->re_g = g;
00269 preg->re_magic = MAGIC1;
00270 #ifndef REDEBUG
00271
00272 if (g->iflags&BAD)
00273 SETERROR(REG_ASSERT);
00274 #endif
00275
00276
00277 if (p->error != 0)
00278 regfree(preg);
00279 return(p->error);
00280 }
00281
00282
00283
00284
00285
00286 static void
00287 p_ere(p, stop)
00288 register struct parse *p;
00289 int stop;
00290 {
00291 register char c;
00292 register sopno prevback;
00293 register sopno prevfwd;
00294 register sopno conc;
00295 register int first = 1;
00296
00297 for (;;) {
00298
00299 conc = HERE();
00300 while (MORE() && (c = PEEK()) != '|' && c != stop)
00301 p_ere_exp(p);
00302 REQUIRE(HERE() != conc, REG_EMPTY);
00303
00304 if (!EAT('|'))
00305 break;
00306
00307 if (first) {
00308 INSERT(OCH_, conc);
00309 prevfwd = conc;
00310 prevback = conc;
00311 first = 0;
00312 }
00313 ASTERN(OOR1, prevback);
00314 prevback = THERE();
00315 AHEAD(prevfwd);
00316 prevfwd = HERE();
00317 EMIT(OOR2, 0);
00318 }
00319
00320 if (!first) {
00321 AHEAD(prevfwd);
00322 ASTERN(O_CH, prevback);
00323 }
00324
00325 assert(!MORE() || SEE(stop));
00326 }
00327
00328
00329
00330
00331
00332 static void
00333 p_ere_exp(p)
00334 register struct parse *p;
00335 {
00336 register char c;
00337 register sopno pos;
00338 register int count;
00339 register int count2;
00340 register sopno subno;
00341 int wascaret = 0;
00342
00343 assert(MORE());
00344 c = GETNEXT();
00345
00346 pos = HERE();
00347 switch (c) {
00348 case '(':
00349 REQUIRE(MORE(), REG_EPAREN);
00350 p->g->nsub++;
00351 subno = p->g->nsub;
00352 if (subno < NPAREN)
00353 p->pbegin[subno] = HERE();
00354 EMIT(OLPAREN, subno);
00355 if (!SEE(')'))
00356 p_ere(p, ')');
00357 if (subno < NPAREN) {
00358 p->pend[subno] = HERE();
00359 assert(p->pend[subno] != 0);
00360 }
00361 EMIT(ORPAREN, subno);
00362 MUSTEAT(')', REG_EPAREN);
00363 break;
00364 #ifndef POSIX_MISTAKE
00365 case ')':
00366
00367
00368
00369
00370
00371
00372
00373 SETERROR(REG_EPAREN);
00374 break;
00375 #endif
00376 case '^':
00377 EMIT(OBOL, 0);
00378 p->g->iflags |= USEBOL;
00379 p->g->nbol++;
00380 wascaret = 1;
00381 break;
00382 case '$':
00383 EMIT(OEOL, 0);
00384 p->g->iflags |= USEEOL;
00385 p->g->neol++;
00386 break;
00387 case '|':
00388 SETERROR(REG_EMPTY);
00389 break;
00390 case '*':
00391 case '+':
00392 case '?':
00393 SETERROR(REG_BADRPT);
00394 break;
00395 case '.':
00396 if (p->g->cflags®_NEWLINE)
00397 nonnewline(p);
00398 else
00399 EMIT(OANY, 0);
00400 break;
00401 case '[':
00402 p_bracket(p);
00403 break;
00404 case '\\':
00405 REQUIRE(MORE(), REG_EESCAPE);
00406 c = GETNEXT();
00407 ordinary(p, c);
00408 break;
00409 case '{':
00410 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
00411
00412 default:
00413 ordinary(p, c);
00414 break;
00415 }
00416
00417 if (!MORE())
00418 return;
00419 c = PEEK();
00420
00421 if (!( c == '*' || c == '+' || c == '?' ||
00422 (c == '{' && MORE2() && isdigit(PEEK2())) ))
00423 return;
00424 NEXT();
00425
00426 REQUIRE(!wascaret, REG_BADRPT);
00427 switch (c) {
00428 case '*':
00429
00430 INSERT(OPLUS_, pos);
00431 ASTERN(O_PLUS, pos);
00432 INSERT(OQUEST_, pos);
00433 ASTERN(O_QUEST, pos);
00434 break;
00435 case '+':
00436 INSERT(OPLUS_, pos);
00437 ASTERN(O_PLUS, pos);
00438 break;
00439 case '?':
00440
00441 INSERT(OCH_, pos);
00442 ASTERN(OOR1, pos);
00443 AHEAD(pos);
00444 EMIT(OOR2, 0);
00445 AHEAD(THERE());
00446 ASTERN(O_CH, THERETHERE());
00447 break;
00448 case '{':
00449 count = p_count(p);
00450 if (EAT(',')) {
00451 if (isdigit(PEEK())) {
00452 count2 = p_count(p);
00453 REQUIRE(count <= count2, REG_BADBR);
00454 } else
00455 count2 = INFINITY;
00456 } else
00457 count2 = count;
00458 repeat(p, pos, count, count2);
00459 if (!EAT('}')) {
00460 while (MORE() && PEEK() != '}')
00461 NEXT();
00462 REQUIRE(MORE(), REG_EBRACE);
00463 SETERROR(REG_BADBR);
00464 }
00465 break;
00466 }
00467
00468 if (!MORE())
00469 return;
00470 c = PEEK();
00471 if (!( c == '*' || c == '+' || c == '?' ||
00472 (c == '{' && MORE2() && isdigit(PEEK2())) ) )
00473 return;
00474 SETERROR(REG_BADRPT);
00475 }
00476
00477
00478
00479
00480
00481 static void
00482 p_str(p)
00483 register struct parse *p;
00484 {
00485 REQUIRE(MORE(), REG_EMPTY);
00486 while (MORE())
00487 ordinary(p, GETNEXT());
00488 }
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502 static void
00503 p_bre(p, end1, end2)
00504 register struct parse *p;
00505 register int end1;
00506 register int end2;
00507 {
00508 register sopno start = HERE();
00509 register int first = 1;
00510 register int wasdollar = 0;
00511
00512 if (EAT('^')) {
00513 EMIT(OBOL, 0);
00514 p->g->iflags |= USEBOL;
00515 p->g->nbol++;
00516 }
00517 while (MORE() && !SEETWO(end1, end2)) {
00518 wasdollar = p_simp_re(p, first);
00519 first = 0;
00520 }
00521 if (wasdollar) {
00522 DROP(1);
00523 EMIT(OEOL, 0);
00524 p->g->iflags |= USEEOL;
00525 p->g->neol++;
00526 }
00527
00528 REQUIRE(HERE() != start, REG_EMPTY);
00529 }
00530
00531
00532
00533
00534
00535 static int
00536 p_simp_re(p, starordinary)
00537 register struct parse *p;
00538 int starordinary;
00539 {
00540 register int c;
00541 register int count;
00542 register int count2;
00543 register sopno pos;
00544 register int i;
00545 register sopno subno;
00546 # define BACKSL (1<<CHAR_BIT)
00547
00548 pos = HERE();
00549
00550 assert(MORE());
00551 c = GETNEXT();
00552 if (c == '\\') {
00553 REQUIRE(MORE(), REG_EESCAPE);
00554 c = BACKSL | (unsigned char)GETNEXT();
00555 }
00556 switch (c) {
00557 case '.':
00558 if (p->g->cflags®_NEWLINE)
00559 nonnewline(p);
00560 else
00561 EMIT(OANY, 0);
00562 break;
00563 case '[':
00564 p_bracket(p);
00565 break;
00566 case BACKSL|'{':
00567 SETERROR(REG_BADRPT);
00568 break;
00569 case BACKSL|'(':
00570 p->g->nsub++;
00571 subno = p->g->nsub;
00572 if (subno < NPAREN)
00573 p->pbegin[subno] = HERE();
00574 EMIT(OLPAREN, subno);
00575
00576 if (MORE() && !SEETWO('\\', ')'))
00577 p_bre(p, '\\', ')');
00578 if (subno < NPAREN) {
00579 p->pend[subno] = HERE();
00580 assert(p->pend[subno] != 0);
00581 }
00582 EMIT(ORPAREN, subno);
00583 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
00584 break;
00585 case BACKSL|')':
00586 case BACKSL|'}':
00587 SETERROR(REG_EPAREN);
00588 break;
00589 case BACKSL|'1':
00590 case BACKSL|'2':
00591 case BACKSL|'3':
00592 case BACKSL|'4':
00593 case BACKSL|'5':
00594 case BACKSL|'6':
00595 case BACKSL|'7':
00596 case BACKSL|'8':
00597 case BACKSL|'9':
00598 i = (c&~BACKSL) - '0';
00599 assert(i < NPAREN);
00600 if (p->pend[i] != 0) {
00601 assert(i <= p->g->nsub);
00602 EMIT(OBACK_, i);
00603 assert(p->pbegin[i] != 0);
00604 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
00605 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
00606 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
00607 EMIT(O_BACK, i);
00608 } else
00609 SETERROR(REG_ESUBREG);
00610 p->g->backrefs = 1;
00611 break;
00612 case '*':
00613 REQUIRE(starordinary, REG_BADRPT);
00614
00615 default:
00616 ordinary(p, c &~ BACKSL);
00617 break;
00618 }
00619
00620 if (EAT('*')) {
00621
00622 INSERT(OPLUS_, pos);
00623 ASTERN(O_PLUS, pos);
00624 INSERT(OQUEST_, pos);
00625 ASTERN(O_QUEST, pos);
00626 } else if (EATTWO('\\', '{')) {
00627 count = p_count(p);
00628 if (EAT(',')) {
00629 if (MORE() && isdigit(PEEK())) {
00630 count2 = p_count(p);
00631 REQUIRE(count <= count2, REG_BADBR);
00632 } else
00633 count2 = INFINITY;
00634 } else
00635 count2 = count;
00636 repeat(p, pos, count, count2);
00637 if (!EATTWO('\\', '}')) {
00638 while (MORE() && !SEETWO('\\', '}'))
00639 NEXT();
00640 REQUIRE(MORE(), REG_EBRACE);
00641 SETERROR(REG_BADBR);
00642 }
00643 } else if (c == (unsigned char)'$')
00644 return(1);
00645
00646 return(0);
00647 }
00648
00649
00650
00651
00652
00653 static int
00654 p_count(p)
00655 register struct parse *p;
00656 {
00657 register int count = 0;
00658 register int ndigits = 0;
00659
00660 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
00661 count = count*10 + (GETNEXT() - '0');
00662 ndigits++;
00663 }
00664
00665 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
00666 return(count);
00667 }
00668
00669
00670
00671
00672
00673
00674
00675
00676 static void
00677 p_bracket(p)
00678 register struct parse *p;
00679 {
00680 register char c;
00681 register cset *cs = allocset(p);
00682 register int invert = 0;
00683
00684
00685 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
00686 EMIT(OBOW, 0);
00687 NEXTn(6);
00688 return;
00689 }
00690 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
00691 EMIT(OEOW, 0);
00692 NEXTn(6);
00693 return;
00694 }
00695
00696 if (EAT('^'))
00697 invert++;
00698 if (EAT(']'))
00699 CHadd(cs, ']');
00700 else if (EAT('-'))
00701 CHadd(cs, '-');
00702 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
00703 p_b_term(p, cs);
00704 if (EAT('-'))
00705 CHadd(cs, '-');
00706 MUSTEAT(']', REG_EBRACK);
00707
00708 if (p->error != 0)
00709 return;
00710
00711 if (p->g->cflags®_ICASE) {
00712 register int i;
00713 register int ci;
00714
00715 for (i = p->g->csetsize - 1; i >= 0; i--)
00716 if (CHIN(cs, i) && isalpha(i)) {
00717 ci = othercase(i);
00718 if (ci != i)
00719 CHadd(cs, ci);
00720 }
00721 if (cs->multis != NULL)
00722 mccase(p, cs);
00723 }
00724 if (invert) {
00725 register int i;
00726
00727 for (i = p->g->csetsize - 1; i >= 0; i--)
00728 if (CHIN(cs, i))
00729 CHsub(cs, i);
00730 else
00731 CHadd(cs, i);
00732 if (p->g->cflags®_NEWLINE)
00733 CHsub(cs, '\n');
00734 if (cs->multis != NULL)
00735 mcinvert(p, cs);
00736 }
00737
00738 assert(cs->multis == NULL);
00739
00740 if (nch(p, cs) == 1) {
00741 ordinary(p, firstch(p, cs));
00742 freeset(p, cs);
00743 } else
00744 EMIT(OANYOF, freezeset(p, cs));
00745 }
00746
00747
00748
00749
00750
00751 static void
00752 p_b_term(p, cs)
00753 register struct parse *p;
00754 register cset *cs;
00755 {
00756 register char c;
00757 register char start, finish;
00758 register int i;
00759
00760
00761 switch ((MORE()) ? PEEK() : '\0') {
00762 case '[':
00763 c = (MORE2()) ? PEEK2() : '\0';
00764 break;
00765 case '-':
00766 SETERROR(REG_ERANGE);
00767 return;
00768 break;
00769 default:
00770 c = '\0';
00771 break;
00772 }
00773
00774 switch (c) {
00775 case ':':
00776 NEXT2();
00777 REQUIRE(MORE(), REG_EBRACK);
00778 c = PEEK();
00779 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
00780 p_b_cclass(p, cs);
00781 REQUIRE(MORE(), REG_EBRACK);
00782 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
00783 break;
00784 case '=':
00785 NEXT2();
00786 REQUIRE(MORE(), REG_EBRACK);
00787 c = PEEK();
00788 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
00789 p_b_eclass(p, cs);
00790 REQUIRE(MORE(), REG_EBRACK);
00791 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
00792 break;
00793 default:
00794
00795 start = p_b_symbol(p);
00796 if (SEE('-') && MORE2() && PEEK2() != ']') {
00797
00798 NEXT();
00799 if (EAT('-'))
00800 finish = '-';
00801 else
00802 finish = p_b_symbol(p);
00803 } else
00804 finish = start;
00805
00806 REQUIRE(start <= finish, REG_ERANGE);
00807 for (i = start; i <= finish; i++)
00808 CHadd(cs, i);
00809 break;
00810 }
00811 }
00812
00813
00814
00815
00816
00817 static void
00818 p_b_cclass(p, cs)
00819 register struct parse *p;
00820 register cset *cs;
00821 {
00822 register char *sp = p->next;
00823 register struct cclass *cp;
00824 register size_t len;
00825 register char *u;
00826 register char c;
00827
00828 while (MORE() && isalpha(PEEK()))
00829 NEXT();
00830 len = p->next - sp;
00831 for (cp = cclasses; cp->name != NULL; cp++)
00832 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
00833 break;
00834 if (cp->name == NULL) {
00835
00836 SETERROR(REG_ECTYPE);
00837 return;
00838 }
00839
00840 u = cp->chars;
00841 while ((c = *u++) != '\0')
00842 CHadd(cs, c);
00843 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
00844 MCadd(p, cs, u);
00845 }
00846
00847
00848
00849
00850
00851
00852
00853 static void
00854 p_b_eclass(p, cs)
00855 register struct parse *p;
00856 register cset *cs;
00857 {
00858 register char c;
00859
00860 c = p_b_coll_elem(p, '=');
00861 CHadd(cs, c);
00862 }
00863
00864
00865
00866
00867
00868 static char
00869 p_b_symbol(p)
00870 register struct parse *p;
00871 {
00872 register char value;
00873
00874 REQUIRE(MORE(), REG_EBRACK);
00875 if (!EATTWO('[', '.'))
00876 return(GETNEXT());
00877
00878
00879 value = p_b_coll_elem(p, '.');
00880 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
00881 return(value);
00882 }
00883
00884
00885
00886
00887
00888 static char
00889 p_b_coll_elem(p, endc)
00890 register struct parse *p;
00891 int endc;
00892 {
00893 register char *sp = p->next;
00894 register struct cname *cp;
00895 register int len;
00896 register char c;
00897
00898 while (MORE() && !SEETWO(endc, ']'))
00899 NEXT();
00900 if (!MORE()) {
00901 SETERROR(REG_EBRACK);
00902 return(0);
00903 }
00904 len = p->next - sp;
00905 for (cp = cnames; cp->name != NULL; cp++)
00906 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
00907 return(cp->code);
00908 if (len == 1)
00909 return(*sp);
00910 SETERROR(REG_ECOLLATE);
00911 return(0);
00912 }
00913
00914
00915
00916
00917
00918 static char
00919 othercase(ch)
00920 int ch;
00921 {
00922 assert(isalpha(ch));
00923 if (isupper(ch))
00924 return(tolower(ch));
00925 else if (islower(ch))
00926 return(toupper(ch));
00927 else
00928 return(ch);
00929 }
00930
00931
00932
00933
00934
00935
00936
00937 static void
00938 bothcases(p, ch)
00939 register struct parse *p;
00940 int ch;
00941 {
00942 register char *oldnext = p->next;
00943 register char *oldend = p->end;
00944 char bracket[3];
00945
00946 assert(othercase(ch) != ch);
00947 p->next = bracket;
00948 p->end = bracket+2;
00949 bracket[0] = ch;
00950 bracket[1] = ']';
00951 bracket[2] = '\0';
00952 p_bracket(p);
00953 assert(p->next == bracket+2);
00954 p->next = oldnext;
00955 p->end = oldend;
00956 }
00957
00958
00959
00960
00961
00962 static void
00963 ordinary(p, ch)
00964 register struct parse *p;
00965 register int ch;
00966 {
00967 register cat_t *cap = p->g->categories;
00968
00969 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch)
00970 bothcases(p, ch);
00971 else {
00972 EMIT(OCHAR, (unsigned char)ch);
00973 if (cap[ch] == 0)
00974 cap[ch] = p->g->ncategories++;
00975 }
00976 }
00977
00978
00979
00980
00981
00982
00983
00984 static void
00985 nonnewline(p)
00986 register struct parse *p;
00987 {
00988 register char *oldnext = p->next;
00989 register char *oldend = p->end;
00990 char bracket[4];
00991
00992 p->next = bracket;
00993 p->end = bracket+3;
00994 bracket[0] = '^';
00995 bracket[1] = '\n';
00996 bracket[2] = ']';
00997 bracket[3] = '\0';
00998 p_bracket(p);
00999 assert(p->next == bracket+3);
01000 p->next = oldnext;
01001 p->end = oldend;
01002 }
01003
01004
01005
01006
01007
01008 static void
01009 repeat(p, start, from, to)
01010 register struct parse *p;
01011 sopno start;
01012 int from;
01013 int to;
01014 {
01015 register sopno finish = HERE();
01016 # define N 2
01017 # define INF 3
01018 # define REP(f, t) ((f)*8 + (t))
01019 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
01020 register sopno copy;
01021
01022 if (p->error != 0)
01023 return;
01024
01025 assert(from <= to);
01026
01027 switch (REP(MAP(from), MAP(to))) {
01028 case REP(0, 0):
01029 DROP(finish-start);
01030 break;
01031 case REP(0, 1):
01032 case REP(0, N):
01033 case REP(0, INF):
01034
01035 INSERT(OCH_, start);
01036 repeat(p, start+1, 1, to);
01037 ASTERN(OOR1, start);
01038 AHEAD(start);
01039 EMIT(OOR2, 0);
01040 AHEAD(THERE());
01041 ASTERN(O_CH, THERETHERE());
01042 break;
01043 case REP(1, 1):
01044
01045 break;
01046 case REP(1, N):
01047
01048 INSERT(OCH_, start);
01049 ASTERN(OOR1, start);
01050 AHEAD(start);
01051 EMIT(OOR2, 0);
01052 AHEAD(THERE());
01053 ASTERN(O_CH, THERETHERE());
01054 copy = dupl(p, start+1, finish+1);
01055 assert(copy == finish+4);
01056 repeat(p, copy, 1, to-1);
01057 break;
01058 case REP(1, INF):
01059 INSERT(OPLUS_, start);
01060 ASTERN(O_PLUS, start);
01061 break;
01062 case REP(N, N):
01063 copy = dupl(p, start, finish);
01064 repeat(p, copy, from-1, to-1);
01065 break;
01066 case REP(N, INF):
01067 copy = dupl(p, start, finish);
01068 repeat(p, copy, from-1, to);
01069 break;
01070 default:
01071 SETERROR(REG_ASSERT);
01072 break;
01073 }
01074 }
01075
01076
01077
01078
01079
01080 static int
01081 seterr(p, e)
01082 register struct parse *p;
01083 int e;
01084 {
01085 if (p->error == 0)
01086 p->error = e;
01087 p->next = nuls;
01088 p->end = nuls;
01089 return(0);
01090 }
01091
01092
01093
01094
01095
01096 static cset *
01097 allocset(p)
01098 register struct parse *p;
01099 {
01100 register int no = p->g->ncsets++;
01101 register size_t nc;
01102 register size_t nbytes;
01103 register cset *cs;
01104 register size_t css = (size_t)p->g->csetsize;
01105 register int i;
01106
01107 if (no >= p->ncsalloc) {
01108 p->ncsalloc += CHAR_BIT;
01109 nc = p->ncsalloc;
01110 assert(nc % CHAR_BIT == 0);
01111 nbytes = nc / CHAR_BIT * css;
01112 if (p->g->sets == NULL)
01113 p->g->sets = (cset *)malloc(nc * sizeof(cset));
01114 else
01115 p->g->sets = (cset *)realloc((char *)p->g->sets,
01116 nc * sizeof(cset));
01117 if (p->g->setbits == NULL)
01118 p->g->setbits = (uch *)malloc(nbytes);
01119 else {
01120 p->g->setbits = (uch *)realloc((char *)p->g->setbits,
01121 nbytes);
01122
01123 for (i = 0; i < no; i++)
01124 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
01125 }
01126 if (p->g->sets != NULL && p->g->setbits != NULL)
01127 (void) memset((char *)p->g->setbits + (nbytes - css),
01128 0, css);
01129 else {
01130 no = 0;
01131 SETERROR(REG_ESPACE);
01132
01133 }
01134 }
01135
01136 assert(p->g->sets != NULL);
01137 cs = &p->g->sets[no];
01138 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
01139 cs->mask = 1 << ((no) % CHAR_BIT);
01140 cs->hash = 0;
01141 cs->smultis = 0;
01142 cs->multis = NULL;
01143
01144 return(cs);
01145 }
01146
01147
01148
01149
01150
01151 static void
01152 freeset(p, cs)
01153 register struct parse *p;
01154 register cset *cs;
01155 {
01156 register int i;
01157 register cset *top = &p->g->sets[p->g->ncsets];
01158 register size_t css = (size_t)p->g->csetsize;
01159
01160 for (i = 0; i < css; i++)
01161 CHsub(cs, i);
01162 if (cs == top-1)
01163 p->g->ncsets--;
01164 }
01165
01166
01167
01168
01169
01170
01171
01172
01173
01174
01175
01176 static int
01177 freezeset(p, cs)
01178 register struct parse *p;
01179 register cset *cs;
01180 {
01181 register uch h = cs->hash;
01182 register int i;
01183 register cset *top = &p->g->sets[p->g->ncsets];
01184 register cset *cs2;
01185 register size_t css = (size_t)p->g->csetsize;
01186
01187
01188 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
01189 if (cs2->hash == h && cs2 != cs) {
01190
01191 for (i = 0; i < css; i++)
01192 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
01193 break;
01194 if (i == css)
01195 break;
01196 }
01197
01198 if (cs2 < top) {
01199 freeset(p, cs);
01200 cs = cs2;
01201 }
01202
01203 return((int)(cs - p->g->sets));
01204 }
01205
01206
01207
01208
01209
01210 static int
01211 firstch(p, cs)
01212 register struct parse *p;
01213 register cset *cs;
01214 {
01215 register int i;
01216 register size_t css = (size_t)p->g->csetsize;
01217
01218 for (i = 0; i < css; i++)
01219 if (CHIN(cs, i))
01220 return((char)i);
01221 assert(never);
01222 return(0);
01223 }
01224
01225
01226
01227
01228
01229 static int
01230 nch(p, cs)
01231 register struct parse *p;
01232 register cset *cs;
01233 {
01234 register int i;
01235 register size_t css = (size_t)p->g->csetsize;
01236 register int n = 0;
01237
01238 for (i = 0; i < css; i++)
01239 if (CHIN(cs, i))
01240 n++;
01241 return(n);
01242 }
01243
01244
01245
01246
01247
01248
01249 static void
01250 mcadd(p, cs, cp)
01251 register struct parse *p;
01252 register cset *cs;
01253 register char *cp;
01254 {
01255 register size_t oldend = cs->smultis;
01256
01257 cs->smultis += strlen(cp) + 1;
01258 if (cs->multis == NULL)
01259 cs->multis = malloc(cs->smultis);
01260 else
01261 cs->multis = realloc(cs->multis, cs->smultis);
01262 if (cs->multis == NULL) {
01263 SETERROR(REG_ESPACE);
01264 return;
01265 }
01266
01267 (void) strcpy(cs->multis + oldend - 1, cp);
01268 cs->multis[cs->smultis - 1] = '\0';
01269 }
01270
01271
01272
01273
01274
01275 static void
01276 mcsub(cs, cp)
01277 register cset *cs;
01278 register char *cp;
01279 {
01280 register char *fp = mcfind(cs, cp);
01281 register size_t len = strlen(fp);
01282
01283 assert(fp != NULL);
01284 (void) memmove(fp, fp + len + 1,
01285 cs->smultis - (fp + len + 1 - cs->multis));
01286 cs->smultis -= len;
01287
01288 if (cs->smultis == 0) {
01289 free(cs->multis);
01290 cs->multis = NULL;
01291 return;
01292 }
01293
01294 cs->multis = realloc(cs->multis, cs->smultis);
01295 assert(cs->multis != NULL);
01296 }
01297
01298
01299
01300
01301
01302 static int
01303 mcin(cs, cp)
01304 register cset *cs;
01305 register char *cp;
01306 {
01307 return(mcfind(cs, cp) != NULL);
01308 }
01309
01310
01311
01312
01313
01314 static char *
01315 mcfind(cs, cp)
01316 register cset *cs;
01317 register char *cp;
01318 {
01319 register char *p;
01320
01321 if (cs->multis == NULL)
01322 return(NULL);
01323 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
01324 if (strcmp(cp, p) == 0)
01325 return(p);
01326 return(NULL);
01327 }
01328
01329
01330
01331
01332
01333
01334
01335
01336 static void
01337 mcinvert(p, cs)
01338 register struct parse *p;
01339 register cset *cs;
01340 {
01341 assert(cs->multis == NULL);
01342 }
01343
01344
01345
01346
01347
01348
01349
01350
01351 static void
01352 mccase(p, cs)
01353 register struct parse *p;
01354 register cset *cs;
01355 {
01356 assert(cs->multis == NULL);
01357 }
01358
01359
01360
01361
01362
01363 static int
01364 isinsets(g, c)
01365 register struct re_guts *g;
01366 int c;
01367 {
01368 register uch *col;
01369 register int i;
01370 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
01371 register unsigned uc = (unsigned char)c;
01372
01373 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
01374 if (col[uc] != 0)
01375 return(1);
01376 return(0);
01377 }
01378
01379
01380
01381
01382
01383 static int
01384 samesets(g, c1, c2)
01385 register struct re_guts *g;
01386 int c1;
01387 int c2;
01388 {
01389 register uch *col;
01390 register int i;
01391 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
01392 register unsigned uc1 = (unsigned char)c1;
01393 register unsigned uc2 = (unsigned char)c2;
01394
01395 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
01396 if (col[uc1] != col[uc2])
01397 return(0);
01398 return(1);
01399 }
01400
01401
01402
01403
01404
01405 static void
01406 categorize(p, g)
01407 struct parse *p;
01408 register struct re_guts *g;
01409 {
01410 register cat_t *cats = g->categories;
01411 register int c;
01412 register int c2;
01413 register cat_t cat;
01414
01415
01416 if (p->error != 0)
01417 return;
01418
01419 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
01420 if (cats[c] == 0 && isinsets(g, c)) {
01421 cat = g->ncategories++;
01422 cats[c] = cat;
01423 for (c2 = c+1; c2 <= CHAR_MAX; c2++)
01424 if (cats[c2] == 0 && samesets(g, c, c2))
01425 cats[c2] = cat;
01426 }
01427 }
01428
01429
01430
01431
01432
01433 static sopno
01434 dupl(p, start, finish)
01435 register struct parse *p;
01436 sopno start;
01437 sopno finish;
01438 {
01439 register sopno ret = HERE();
01440 register sopno len = finish - start;
01441
01442 assert(finish >= start);
01443 if (len == 0)
01444 return(ret);
01445 enlarge(p, p->ssize + len);
01446 assert(p->ssize >= p->slen + len);
01447 (void) memcpy((char *)(p->strip + p->slen),
01448 (char *)(p->strip + start), (size_t)len*sizeof(sop));
01449 p->slen += len;
01450 return(ret);
01451 }
01452
01453
01454
01455
01456
01457
01458
01459
01460
01461 static void
01462 doemit(p, op, opnd)
01463 register struct parse *p;
01464 sop op;
01465 size_t opnd;
01466 {
01467
01468 if (p->error != 0)
01469 return;
01470
01471
01472 assert(opnd < 1<<OPSHIFT);
01473
01474
01475 if (p->slen >= p->ssize)
01476 enlarge(p, (p->ssize+1) / 2 * 3);
01477 assert(p->slen < p->ssize);
01478
01479
01480 p->strip[p->slen++] = SOP(op, opnd);
01481 }
01482
01483
01484
01485
01486
01487 static void
01488 doinsert(p, op, opnd, pos)
01489 register struct parse *p;
01490 sop op;
01491 size_t opnd;
01492 sopno pos;
01493 {
01494 register sopno sn;
01495 register sop s;
01496 register int i;
01497
01498
01499 if (p->error != 0)
01500 return;
01501
01502 sn = HERE();
01503 EMIT(op, opnd);
01504 assert(HERE() == sn+1);
01505 s = p->strip[sn];
01506
01507
01508 assert(pos > 0);
01509 for (i = 1; i < NPAREN; i++) {
01510 if (p->pbegin[i] >= pos) {
01511 p->pbegin[i]++;
01512 }
01513 if (p->pend[i] >= pos) {
01514 p->pend[i]++;
01515 }
01516 }
01517
01518 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
01519 (HERE()-pos-1)*sizeof(sop));
01520 p->strip[pos] = s;
01521 }
01522
01523
01524
01525
01526
01527 static void
01528 dofwd(p, pos, value)
01529 register struct parse *p;
01530 register sopno pos;
01531 sop value;
01532 {
01533
01534 if (p->error != 0)
01535 return;
01536
01537 assert(value < 1<<OPSHIFT);
01538 p->strip[pos] = OP(p->strip[pos]) | value;
01539 }
01540
01541
01542
01543
01544
01545 static void
01546 enlarge(p, size)
01547 register struct parse *p;
01548 register sopno size;
01549 {
01550 register sop *sp;
01551
01552 if (p->ssize >= size)
01553 return;
01554
01555 sp = (sop *)realloc(p->strip, size*sizeof(sop));
01556 if (sp == NULL) {
01557 SETERROR(REG_ESPACE);
01558 return;
01559 }
01560 p->strip = sp;
01561 p->ssize = size;
01562 }
01563
01564
01565
01566
01567
01568 static void
01569 stripsnug(p, g)
01570 register struct parse *p;
01571 register struct re_guts *g;
01572 {
01573 g->nstates = p->slen;
01574 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
01575 if (g->strip == NULL) {
01576 SETERROR(REG_ESPACE);
01577 g->strip = p->strip;
01578 }
01579 }
01580
01581
01582
01583
01584
01585
01586
01587
01588
01589
01590
01591 static void
01592 findmust(p, g)
01593 struct parse *p;
01594 register struct re_guts *g;
01595 {
01596 register sop *scan;
01597 sop *start;
01598 register sop *newstart;
01599 register sopno newlen;
01600 register sop s;
01601 register char *cp;
01602 register sopno i;
01603
01604
01605 if (p->error != 0)
01606 return;
01607
01608
01609 newlen = 0;
01610 scan = g->strip + 1;
01611 do {
01612 s = *scan++;
01613 switch (OP(s)) {
01614 case OCHAR:
01615 if (newlen == 0)
01616 newstart = scan - 1;
01617 newlen++;
01618 break;
01619 case OPLUS_:
01620 case OLPAREN:
01621 case ORPAREN:
01622 break;
01623 case OQUEST_:
01624 case OCH_:
01625 scan--;
01626 do {
01627 scan += OPND(s);
01628 s = *scan;
01629
01630 if (OP(s) != O_QUEST && OP(s) != O_CH &&
01631 OP(s) != OOR2) {
01632 g->iflags |= BAD;
01633 return;
01634 }
01635 } while (OP(s) != O_QUEST && OP(s) != O_CH);
01636
01637 default:
01638 if (newlen > g->mlen) {
01639 start = newstart;
01640 g->mlen = newlen;
01641 }
01642 newlen = 0;
01643 break;
01644 }
01645 } while (OP(s) != OEND);
01646
01647 if (g->mlen == 0)
01648 return;
01649
01650
01651 g->must = malloc((size_t)g->mlen + 1);
01652 if (g->must == NULL) {
01653 g->mlen = 0;
01654 return;
01655 }
01656 cp = g->must;
01657 scan = start;
01658 for (i = g->mlen; i > 0; i--) {
01659 while (OP(s = *scan++) != OCHAR)
01660 continue;
01661 assert(cp < g->must + g->mlen);
01662 *cp++ = (char)OPND(s);
01663 }
01664 assert(cp == g->must + g->mlen);
01665 *cp++ = '\0';
01666 }
01667
01668
01669
01670
01671
01672 static sopno
01673 pluscount(p, g)
01674 struct parse *p;
01675 register struct re_guts *g;
01676 {
01677 register sop *scan;
01678 register sop s;
01679 register sopno plusnest = 0;
01680 register sopno maxnest = 0;
01681
01682 if (p->error != 0)
01683 return(0);
01684
01685 scan = g->strip + 1;
01686 do {
01687 s = *scan++;
01688 switch (OP(s)) {
01689 case OPLUS_:
01690 plusnest++;
01691 break;
01692 case O_PLUS:
01693 if (plusnest > maxnest)
01694 maxnest = plusnest;
01695 plusnest--;
01696 break;
01697 }
01698 } while (OP(s) != OEND);
01699 if (plusnest != 0)
01700 g->iflags |= BAD;
01701 return(maxnest);
01702 }
01703
01704
01705
01706