00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #include <setjmp.h>
00033 #include "config.h"
00034 #include "ctype.h"
00035 #include "vi.h"
00036 #include "regexp.h"
00037
00038
00039
00040 static char *previous;
00041
00042
00043 #ifndef NO_MAGIC
00044
00045
00046
00047 #define META '\0'
00048 #define BASE_META(m) ((m) - 256)
00049 #define INT_META(c) ((c) + 256)
00050 #define IS_META(m) ((m) >= 256)
00051 #define IS_CLASS(m) ((m) >= M_CLASS(0) && (m) <= M_CLASS(9))
00052 #define IS_START(m) ((m) >= M_START(0) && (m) <= M_START(9))
00053 #define IS_END(m) ((m) >= M_END(0) && (m) <= M_END(9))
00054 #define IS_CLOSURE(m) ((m) >= M_SPLAT && (m) <= M_RANGE)
00055 #define ADD_META(s,m) (*(s)++ = META, *(s)++ = BASE_META(m))
00056 #define GET_META(s) (*(s) == META ? INT_META(*++(s)) : *s)
00057
00058
00059 #define M_BEGLINE 256
00060 #define M_ENDLINE 257
00061 #define M_BEGWORD 258
00062 #define M_ENDWORD 259
00063 #define M_ANY 260
00064 #define M_SPLAT 261
00065 #define M_PLUS 262
00066 #define M_QMARK 263
00067 #define M_RANGE 264
00068 #define M_CLASS(n) (265+(n))
00069 #define M_START(n) (275+(n))
00070 #define M_END(n) (285+(n))
00071
00072
00073 static int class_cnt;
00074 static int start_cnt;
00075 static int end_stk[NSUBEXP];
00076 static int end_sp;
00077 static char *retext;
00078
00079
00080 jmp_buf errorhandler;
00081 #define FAIL(why) regerror(why); longjmp(errorhandler, 1)
00082
00083
00084
00085
00086
00087
00088 static char *makeclass(text, bmap)
00089 REG char *text;
00090 REG char *bmap;
00091 {
00092 REG int i;
00093 int complement = 0;
00094
00095
00096
00097 for (i = 0; bmap && i < 32; i++)
00098 {
00099 bmap[i] = 0;
00100 }
00101
00102
00103 if (*text == '^')
00104 {
00105 text++;
00106 complement = 1;
00107 }
00108
00109
00110 while (*text && *text != ']')
00111 {
00112
00113 if (text[1] == '-' && text[2])
00114 {
00115
00116 if (text[0] > text[2])
00117 {
00118 FAIL("Backwards span in []");
00119 }
00120
00121
00122 for (i = text[0]; bmap && i <= text[2]; i++)
00123 {
00124 bmap[i >> 3] |= (1 << (i & 7));
00125 }
00126
00127
00128 text += 3;
00129 }
00130 else
00131 {
00132
00133 i = *text++;
00134 if (bmap)
00135 {
00136 bmap[i >> 3] |= (1 << (i & 7));
00137 }
00138 }
00139 }
00140
00141
00142 if (*text++ != ']')
00143 {
00144 FAIL("] missing");
00145 }
00146
00147
00148 if (complement && bmap)
00149 {
00150 for (i = 0; i < 32; i++)
00151 {
00152 bmap[i] = ~bmap[i];
00153 }
00154 }
00155
00156 return text;
00157 }
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167 static int gettoken(sptr, re)
00168 char **sptr;
00169 regexp *re;
00170 {
00171 int c;
00172
00173 c = **sptr;
00174 ++*sptr;
00175 if (c == '\\')
00176 {
00177 c = **sptr;
00178 ++*sptr;
00179 switch (c)
00180 {
00181 case '<':
00182 return M_BEGWORD;
00183
00184 case '>':
00185 return M_ENDWORD;
00186
00187 case '(':
00188 if (start_cnt >= NSUBEXP)
00189 {
00190 FAIL("Too many \\(s");
00191 }
00192 end_stk[end_sp++] = start_cnt;
00193 return M_START(start_cnt++);
00194
00195 case ')':
00196 if (end_sp <= 0)
00197 {
00198 FAIL("Mismatched \\)");
00199 }
00200 return M_END(end_stk[--end_sp]);
00201
00202 case '*':
00203 return (*o_magic ? c : M_SPLAT);
00204
00205 case '.':
00206 return (*o_magic ? c : M_ANY);
00207
00208 case '+':
00209 return M_PLUS;
00210
00211 case '?':
00212 return M_QMARK;
00213 #ifndef CRUNCH
00214 case '{':
00215 return M_RANGE;
00216 #endif
00217 default:
00218 return c;
00219 }
00220 }
00221 else if (*o_magic)
00222 {
00223 switch (c)
00224 {
00225 case '^':
00226 if (*sptr == retext + 1)
00227 {
00228 return M_BEGLINE;
00229 }
00230 return c;
00231
00232 case '$':
00233 if (!**sptr)
00234 {
00235 return M_ENDLINE;
00236 }
00237 return c;
00238
00239 case '.':
00240 return M_ANY;
00241
00242 case '*':
00243 return M_SPLAT;
00244
00245 case '[':
00246
00247 if (class_cnt >= 10)
00248 {
00249 FAIL("Too many []s");
00250 }
00251
00252
00253 if (re)
00254 {
00255
00256 *sptr = makeclass(*sptr, re->program + 1 + 32 * class_cnt);
00257 }
00258 else
00259 {
00260
00261 *sptr = makeclass(*sptr, (char *)0);
00262 }
00263 return M_CLASS(class_cnt++);
00264
00265 default:
00266 return c;
00267 }
00268 }
00269 else
00270 {
00271 switch (c)
00272 {
00273 case '^':
00274 if (*sptr == retext + 1)
00275 {
00276 return M_BEGLINE;
00277 }
00278 return c;
00279
00280 case '$':
00281 if (!**sptr)
00282 {
00283 return M_ENDLINE;
00284 }
00285 return c;
00286
00287 default:
00288 return c;
00289 }
00290 }
00291
00292 }
00293
00294
00295
00296
00297
00298
00299
00300
00301 static unsigned calcsize(text)
00302 char *text;
00303 {
00304 unsigned size;
00305 int token;
00306
00307 retext = text;
00308 class_cnt = 0;
00309 start_cnt = 1;
00310 end_sp = 0;
00311 size = 5;
00312 while ((token = gettoken(&text, (regexp *)0)) != 0)
00313 {
00314 if (IS_CLASS(token))
00315 {
00316 size += 34;
00317 }
00318 #ifndef CRUNCH
00319 else if (token == M_RANGE)
00320 {
00321 size += 4;
00322 while ((token = gettoken(&text, (regexp *)0)) != 0
00323 && token != '}')
00324 {
00325 }
00326 if (!token)
00327 {
00328 return size;
00329 }
00330 }
00331 #endif
00332 else if (IS_META(token))
00333 {
00334 size += 2;
00335 }
00336 else
00337 {
00338 size++;
00339 }
00340 }
00341
00342 return size;
00343 }
00344
00345
00346
00347
00348 regexp *regcomp(exp)
00349 char *exp;
00350 {
00351 int needfirst;
00352 unsigned size;
00353 int token;
00354 int peek;
00355 char *build;
00356 regexp *re;
00357 #ifndef CRUNCH
00358 int from;
00359 int to;
00360 int digit;
00361 #endif
00362
00363
00364
00365 re = (regexp *)0;
00366 if (setjmp(errorhandler))
00367 {
00368 if (re)
00369 {
00370 free(re);
00371 }
00372 return (regexp *)0;
00373 }
00374
00375
00376 if (*exp == 0)
00377 {
00378 if (!previous)
00379 {
00380 FAIL("No previous RE");
00381 }
00382 exp = previous;
00383 }
00384 else
00385 {
00386 if (previous)
00387 free(previous);
00388 previous = (char *)malloc((unsigned)(strlen(exp) + 1));
00389 if (previous)
00390 strcpy(previous, exp);
00391 }
00392
00393
00394 class_cnt = 0;
00395 start_cnt = 1;
00396 end_sp = 0;
00397 retext = exp;
00398 size = calcsize(exp) + sizeof(regexp) + 10;
00399 #ifdef lint
00400 re = ((regexp *)0) + size;
00401 #else
00402 re = (regexp *)malloc((unsigned)size);
00403 #endif
00404 if (!re)
00405 {
00406 FAIL("Not enough memory for this RE");
00407 }
00408
00409
00410 build = &re->program[1 + 32 * class_cnt];
00411 re->program[0] = class_cnt;
00412 for (token = 0; token < NSUBEXP; token++)
00413 {
00414 re->startp[token] = re->endp[token] = (char *)0;
00415 }
00416 re->first = 0;
00417 re->bol = 0;
00418 re->minlen = 0;
00419 needfirst = 1;
00420 class_cnt = 0;
00421 start_cnt = 1;
00422 end_sp = 0;
00423 retext = exp;
00424 for (token = M_START(0), peek = gettoken(&exp, re);
00425 token;
00426 token = peek, peek = gettoken(&exp, re))
00427 {
00428
00429 if (IS_CLOSURE(peek))
00430 {
00431
00432 if (IS_START(token))
00433 {
00434 FAIL("Closure operator follows nothing");
00435 }
00436 else if (IS_META(token) && token != M_ANY && !IS_CLASS(token))
00437 {
00438 FAIL("Closure operators can only follow a normal character or . or []");
00439 }
00440
00441 #ifndef CRUNCH
00442
00443 if (peek == M_RANGE)
00444 {
00445 from = 0;
00446 for (digit = gettoken(&exp, re);
00447 !IS_META(digit) && isdigit(digit);
00448 digit = gettoken(&exp, re))
00449 {
00450 from = from * 10 + digit - '0';
00451 }
00452 if (digit == '}')
00453 {
00454 to = from;
00455 }
00456 else if (digit == ',')
00457 {
00458 to = 0;
00459 for (digit = gettoken(&exp, re);
00460 !IS_META(digit) && isdigit(digit);
00461 digit = gettoken(&exp, re))
00462 {
00463 to = to * 10 + digit - '0';
00464 }
00465 if (to == 0)
00466 {
00467 to = 255;
00468 }
00469 }
00470 if (digit != '}')
00471 {
00472 FAIL("Bad characters after \\{");
00473 }
00474 else if (to < from || to == 0 || from >= 255)
00475 {
00476 FAIL("Invalid range for \\{ \\}");
00477 }
00478 re->minlen += from;
00479 }
00480 else
00481 #endif
00482 if (peek != M_SPLAT)
00483 {
00484 re->minlen++;
00485 }
00486
00487
00488 ADD_META(build, peek);
00489 #ifndef CRUNCH
00490 if (peek == M_RANGE)
00491 {
00492 *build++ = from;
00493 *build++ = (to < 255 ? to : 255);
00494 }
00495 #endif
00496
00497
00498
00499 if (needfirst && peek == M_PLUS && !IS_META(token))
00500 {
00501 re->first = token;
00502 }
00503 needfirst = 0;
00504
00505
00506 peek = gettoken(&exp, re);
00507 if (IS_CLOSURE(peek))
00508 {
00509 FAIL("* or \\+ or \\? doubled up");
00510 }
00511 }
00512 else if (!IS_META(token))
00513 {
00514
00515 if (needfirst)
00516 {
00517 re->first = token;
00518 needfirst = 0;
00519 }
00520 re->minlen++;
00521 }
00522 else if (token == M_ANY || IS_CLASS(token))
00523 {
00524
00525 needfirst = 0;
00526 re->minlen++;
00527 }
00528
00529
00530 if (token == M_BEGLINE)
00531 {
00532
00533 re->bol = 1;
00534 }
00535 else if (IS_META(token))
00536 {
00537 ADD_META(build, token);
00538 }
00539 else
00540 {
00541 *build++ = token;
00542 }
00543 }
00544
00545
00546 ADD_META(build, M_END(0));
00547 if (end_sp > 0)
00548 {
00549 FAIL("Not enough \\)s");
00550 }
00551
00552 return re;
00553 }
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564 int match1(re, ch, token)
00565 regexp *re;
00566 REG char ch;
00567 REG int token;
00568 {
00569 if (!ch)
00570 {
00571
00572 return 1;
00573 }
00574 if (token == M_ANY)
00575 {
00576 return 0;
00577 }
00578 else if (IS_CLASS(token))
00579 {
00580 if (re->program[1 + 32 * (token - M_CLASS(0)) + (ch >> 3)] & (1 << (ch & 7)))
00581 return 0;
00582 }
00583 else if (ch == token || *o_ignorecase && tolower(ch) == tolower(token))
00584 {
00585 return 0;
00586 }
00587 return 1;
00588 }
00589
00590
00591
00592
00593
00594
00595
00596 int match(re, str, prog, here)
00597 regexp *re;
00598 char *str;
00599 REG char *prog;
00600 REG char *here;
00601 {
00602 REG int token;
00603 REG int nmatched;
00604 REG int closure;
00605 int from;
00606 int to;
00607
00608 for (token = GET_META(prog); !IS_CLOSURE(token); prog++, token = GET_META(prog))
00609 {
00610 switch (token)
00611 {
00612
00613 case M_ENDLINE:
00614 if (*here)
00615 return 1;
00616 break;
00617
00618 case M_BEGWORD:
00619 if (here != str &&
00620 (here[-1] == '_' || isalnum(here[-1])))
00621 return 1;
00622 break;
00623
00624 case M_ENDWORD:
00625 if (here[0] == '_' || isalnum(here[0]))
00626 return 1;
00627 break;
00628
00629 case M_START(0):
00630 case M_START(1):
00631 case M_START(2):
00632 case M_START(3):
00633 case M_START(4):
00634 case M_START(5):
00635 case M_START(6):
00636 case M_START(7):
00637 case M_START(8):
00638 case M_START(9):
00639 re->startp[token - M_START(0)] = (char *)here;
00640 break;
00641
00642 case M_END(0):
00643 case M_END(1):
00644 case M_END(2):
00645 case M_END(3):
00646 case M_END(4):
00647 case M_END(5):
00648 case M_END(6):
00649 case M_END(7):
00650 case M_END(8):
00651 case M_END(9):
00652 re->endp[token - M_END(0)] = (char *)here;
00653 if (token == M_END(0))
00654 {
00655 return 0;
00656 }
00657 break;
00658
00659 default:
00660 if (match1(re, *here, token) != 0)
00661 return 1;
00662 here++;
00663 }
00664 }
00665
00666
00667
00668
00669
00670
00671 closure = token;
00672 prog++;
00673 switch (closure)
00674 {
00675 case M_SPLAT:
00676 from = 0;
00677 to = strlen(str);
00678 break;
00679
00680 case M_PLUS:
00681 from = 1;
00682 to = strlen(str);
00683 break;
00684
00685 case M_QMARK:
00686 from = 0;
00687 to = 1;
00688 break;
00689
00690 #ifndef CRUNCH
00691 case M_RANGE:
00692 from = UCHAR(*prog++);
00693 to = UCHAR(*prog++);
00694 if (to == 255)
00695 {
00696 to = strlen(str);
00697 }
00698 break;
00699 #endif
00700 }
00701 token = GET_META(prog);
00702 prog++;
00703
00704
00705 for (nmatched = 0;
00706 nmatched < to && *here && match1(re, *here, token) == 0;
00707 nmatched++, here++)
00708 {
00709 }
00710
00711
00712 while (nmatched >= from && match(re, str, prog, here) != 0)
00713 {
00714 nmatched--;
00715 here--;
00716 }
00717
00718
00719 if (nmatched >= from)
00720 return 0;
00721 return 1;
00722 }
00723
00724
00725
00726
00727 int regexec(re, str, bol)
00728 regexp *re;
00729 char *str;
00730 int bol;
00731 {
00732 char *prog;
00733 int len;
00734 REG char *here;
00735
00736
00737 if (re->bol && !bol)
00738 {
00739 return 0;
00740 }
00741
00742 len = strlen(str);
00743 prog = re->program + 1 + 32 * re->program[0];
00744
00745
00746 if (re->bol)
00747 {
00748
00749 if ((re->first
00750 && match1(re, *(char *)str, re->first))
00751 || len < re->minlen
00752 || match(re, (char *)str, prog, str))
00753 return 0;
00754 }
00755 #ifndef CRUNCH
00756 else if (!*o_ignorecase)
00757 {
00758
00759 for (here = (char *)str;
00760 (re->first && re->first != *here)
00761 || match(re, (char *)str, prog, here);
00762 here++, len--)
00763 {
00764 if (len < re->minlen)
00765 return 0;
00766 }
00767 }
00768 #endif
00769 else
00770 {
00771
00772 for (here = (char *)str;
00773 (re->first && match1(re, *here, (int)re->first))
00774 || match(re, (char *)str, prog, here);
00775 here++, len--)
00776 {
00777 if (len < re->minlen)
00778 return 0;
00779 }
00780 }
00781
00782
00783 return 1;
00784 }
00785
00786
00787 #else
00788
00789 regexp *regcomp(exp)
00790 char *exp;
00791 {
00792 char *src;
00793 char *dest;
00794 regexp *re;
00795 int i;
00796
00797
00798 #ifdef lint
00799 re = (regexp *)0;
00800 #else
00801 re = (regexp *)malloc((unsigned)(strlen(exp) + 1 + sizeof(struct regexp)));
00802 #endif
00803 if (!re)
00804 {
00805 regerror("Could not malloc a regexp structure");
00806 return (regexp *)0;
00807 }
00808
00809
00810 for (i = 0; i < NSUBEXP; i++)
00811 {
00812 re->startp[i] = re->endp[i] = (char *)0;
00813 }
00814 re->minlen = 0;
00815 re->first = 0;
00816 re->bol = 0;
00817
00818
00819 for (src = exp, dest = re->program + 1; *src; src++)
00820 {
00821 switch (*src)
00822 {
00823 case '^':
00824 if (src == exp)
00825 {
00826 re->bol += 1;
00827 }
00828 else
00829 {
00830 *dest++ = '^';
00831 re->minlen++;
00832 }
00833 break;
00834
00835 case '$':
00836 if (!src[1])
00837 {
00838 re->bol += 2;
00839 }
00840 else
00841 {
00842 *dest++ = '$';
00843 re->minlen++;
00844 }
00845 break;
00846
00847 case '\\':
00848 if (src[1])
00849 {
00850 *dest++ = *++src;
00851 re->minlen++;
00852 }
00853 else
00854 {
00855 regerror("extra \\ at end of regular expression");
00856 }
00857 break;
00858
00859 default:
00860 *dest++ = *src;
00861 re->minlen++;
00862 }
00863 }
00864 *dest = '\0';
00865
00866 return re;
00867 }
00868
00869
00870
00871
00872
00873
00874 static int reghelp(prog, string, bolflag)
00875 struct regexp *prog;
00876 char *string;
00877 int bolflag;
00878 {
00879 char *scan;
00880 char *str;
00881
00882
00883 if ((prog->bol & 1) && !bolflag)
00884 {
00885 return -1;
00886 }
00887
00888
00889 prog->startp[0] = string;
00890
00891
00892 if (*o_ignorecase)
00893 {
00894 for (scan = &prog->program[1]; *scan; scan++, string++)
00895 if (tolower(*scan) != tolower(*string))
00896 return *string ? 0 : -1;
00897 }
00898 else
00899 {
00900 for (scan = &prog->program[1]; *scan; scan++, string++)
00901 if (*scan != *string)
00902 return *string ? 0 : -1;
00903 }
00904
00905
00906 if ((prog->bol & 2) && *string)
00907 {
00908 return 0;
00909 }
00910
00911
00912 prog->endp[0] = string;
00913 return 1;
00914 }
00915
00916
00917
00918 int regexec(prog, string, bolflag)
00919 struct regexp *prog;
00920 char *string;
00921 int bolflag;
00922 {
00923 int rc;
00924
00925
00926 for (rc = reghelp(prog, string, bolflag); rc == 0; rc = reghelp(prog, string, 0))
00927 {
00928 string++;
00929 }
00930
00931
00932 return rc == 1;
00933 }
00934 #endif