|
|
1.1 root 1: /* Extended regular expression matching and search.
2: Copyright (C) 1985 Free Software Foundation, Inc.
3:
4: NO WARRANTY
5:
6: BECAUSE THIS PROGRAM IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY
7: NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT
8: WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC,
9: RICHARD M. STALLMAN AND/OR OTHER PARTIES PROVIDE THIS PROGRAM "AS IS"
10: WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
11: BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
12: FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY
13: AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
14: DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR
15: CORRECTION.
16:
17: IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M.
18: STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., AND/OR ANY OTHER PARTY
19: WHO MAY MODIFY AND REDISTRIBUTE THIS PROGRAM AS PERMITTED BELOW, BE
20: LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR
21: OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
22: USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR
23: DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR
24: A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) THIS
25: PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH
26: DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY.
27:
28: GENERAL PUBLIC LICENSE TO COPY
29:
30: 1. You may copy and distribute verbatim copies of this source file
31: as you receive it, in any medium, provided that you conspicuously and
32: appropriately publish on each copy a valid copyright notice "Copyright
33: (C) 1985 Free Software Foundation, Inc."; and include following the
34: copyright notice a verbatim copy of the above disclaimer of warranty
35: and of this License. You may charge a distribution fee for the
36: physical act of transferring a copy.
37:
38: 2. You may modify your copy or copies of this source file or
39: any portion of it, and copy and distribute such modifications under
40: the terms of Paragraph 1 above, provided that you also do the following:
41:
42: a) cause the modified files to carry prominent notices stating
43: that you changed the files and the date of any change; and
44:
45: b) cause the whole of any work that you distribute or publish,
46: that in whole or in part contains or is a derivative of this
47: program or any part thereof, to be licensed at no charge to all
48: third parties on terms identical to those contained in this
49: License Agreement (except that you may choose to grant more extensive
50: warranty protection to some or all third parties, at your option).
51:
52: c) You may charge a distribution fee for the physical act of
53: transferring a copy, and you may at your option offer warranty
54: protection in exchange for a fee.
55:
56: Mere aggregation of another unrelated program with this program (or its
57: derivative) on a volume of a storage or distribution medium does not bring
58: the other program under the scope of these terms.
59:
60: 3. You may copy and distribute this program (or a portion or derivative
61: of it, under Paragraph 2) in object code or executable form under the terms
62: of Paragraphs 1 and 2 above provided that you also do one of the following:
63:
64: a) accompany it with the complete corresponding machine-readable
65: source code, which must be distributed under the terms of
66: Paragraphs 1 and 2 above; or,
67:
68: b) accompany it with a written offer, valid for at least three
69: years, to give any third party free (except for a nominal
70: shipping charge) a complete machine-readable copy of the
71: corresponding source code, to be distributed under the terms of
72: Paragraphs 1 and 2 above; or,
73:
74: c) accompany it with the information you received as to where the
75: corresponding source code may be obtained. (This alternative is
76: allowed only for noncommercial distribution and only if you
77: received the program in object code or executable form alone.)
78:
79: For an executable file, complete source code means all the source code for
80: all modules it contains; but, as a special exception, it need not include
81: source code for modules which are standard libraries that accompany the
82: operating system on which the executable file runs.
83:
84: 4. You may not copy, sublicense, distribute or transfer this program
85: except as expressly provided under this License Agreement. Any attempt
86: otherwise to copy, sublicense, distribute or transfer this program is void and
87: your rights to use the program under this License agreement shall be
88: automatically terminated. However, parties who have received computer
89: software programs from you with this License Agreement will not have
90: their licenses terminated so long as such parties remain in full compliance.
91:
92: 5. If you wish to incorporate parts of this program into other free
93: programs whose distribution conditions are different, write to the Free
94: Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not yet
95: worked out a simple rule that can be stated here, but we will often permit
96: this. We will be guided by the two goals of preserving the free status of
97: all derivatives of our free software and of promoting the sharing and reuse of
98: software.
99:
100:
101: In other words, you are welcome to use, share and improve this program.
102: You are forbidden to forbid anyone else to use, share and improve
103: what you give them. Help stamp out software-hoarding! */
104:
105:
106: /* To test, compile with -Dtest.
107: This Dtestable feature turns this into a self-contained program
108: which reads a pattern, describes how it compiles,
109: then reads a string and searches for it. */
110:
111:
112: #ifdef emacs
113:
114: /* The `emacs' switch turns on certain special matching commands
115: that make sense only in emacs. */
116:
117: #include "config.h"
118: #include "lisp.h"
119: #include "buffer.h"
120: #include "syntax.h"
121:
122: #else /* not emacs */
123:
124: /*
125: * Define the syntax stuff, so we can do the \<...\> things.
126: */
127:
128: #ifndef Sword /* must be non-zero in some of the tests below... */
129: #define Sword 1
130: #endif
131:
132: #define SYNTAX(c) re_syntax_table[c]
133:
134: #ifdef SYNTAX_TABLE
135:
136: char *re_syntax_table;
137:
138: #else
139:
140: static char re_syntax_table[256];
141:
142: static void
143: init_syntax_once ()
144: {
145: register int c;
146: static int done = 0;
147:
148: if (done)
149: return;
150:
151: bzero (re_syntax_table, sizeof re_syntax_table);
152:
153: for (c = 'a'; c <= 'z'; c++)
154: re_syntax_table[c] = Sword;
155:
156: for (c = 'A'; c <= 'Z'; c++)
157: re_syntax_table[c] = Sword;
158:
159: for (c = '0'; c <= '9'; c++)
160: re_syntax_table[c] = Sword;
161:
162: done = 1;
163: }
164:
165: #endif /* SYNTAX_TABLE */
166: #endif /* not emacs */
167:
168: #include "regex.h"
169:
170: /* Number of failure points to allocate space for initially,
171: when matching. If this number is exceeded, more space is allocated,
172: so it is not a hard limit. */
173:
174: #ifndef NFAILURES
175: #define NFAILURES 80
176: #endif NFAILURES
177:
178: /* width of a byte in bits */
179:
180: #define BYTEWIDTH 8
181:
182: #ifndef SIGN_EXTEND_CHAR
183: #define SIGN_EXTEND_CHAR(x) (x)
184: #endif
185:
186: static int obscure_syntax = 0;
187:
188: /* Specify the precise syntax of regexp for compilation.
189: This provides for compatibility for various utilities
190: which historically have different, incompatible syntaxes.
191:
192: The argument SYNTAX is a bit-mask containing the two bits
193: RE_NO_BK_PARENS and RE_NO_BK_VBAR. */
194:
195: int
196: re_set_syntax (syntax)
197: {
198: int ret;
199:
200: ret = obscure_syntax;
201: obscure_syntax = syntax;
202: return ret;
203: }
204:
205: /* re_compile_pattern takes a regular-expression string
206: and converts it into a buffer full of byte commands for matching.
207:
208: PATTERN is the address of the pattern string
209: SIZE is the length of it.
210: BUFP is a struct re_pattern_buffer * which points to the info
211: on where to store the byte commands.
212: This structure contains a char * which points to the
213: actual space, which should have been obtained with malloc.
214: re_compile_pattern may use realloc to grow the buffer space.
215:
216: The number of bytes of commands can be found out by looking in
217: the struct re_pattern_buffer that bufp pointed to,
218: after re_compile_pattern returns.
219: */
220:
221: #define PATPUSH(ch) (*b++ = (char) (ch))
222:
223: #define PATFETCH(c) \
224: {if (p == pend) goto end_of_pattern; \
225: c = * (unsigned char *) p++; \
226: if (translate) c = translate[c]; }
227:
228: #define PATFETCH_RAW(c) \
229: {if (p == pend) goto end_of_pattern; \
230: c = * (unsigned char *) p++; }
231:
232: #define PATUNFETCH p--
233:
234: #define EXTEND_BUFFER \
235: { char *old_buffer = bufp->buffer; \
236: if (bufp->allocated == (1<<16)) goto too_big; \
237: bufp->allocated *= 2; \
238: if (bufp->allocated > (1<<16)) bufp->allocated = (1<<16); \
239: if (!(bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated))) \
240: goto memory_exhausted; \
241: c = bufp->buffer - old_buffer; \
242: b += c; \
243: if (fixup_jump) \
244: fixup_jump += c; \
245: if (laststart) \
246: laststart += c; \
247: begalt += c; \
248: if (pending_exact) \
249: pending_exact += c; \
250: }
251:
252: static int store_jump (), insert_jump ();
253:
254: char *
255: re_compile_pattern (pattern, size, bufp)
256: char *pattern;
257: int size;
258: struct re_pattern_buffer *bufp;
259: {
260: register char *b = bufp->buffer;
261: register char *p = pattern;
262: char *pend = pattern + size;
263: register unsigned c, c1;
264: char *p1;
265: unsigned char *translate = (unsigned char *) bufp->translate;
266:
267: /* address of the count-byte of the most recently inserted "exactn" command.
268: This makes it possible to tell whether a new exact-match character
269: can be added to that command or requires a new "exactn" command. */
270:
271: char *pending_exact = 0;
272:
273: /* address of the place where a forward-jump should go
274: to the end of the containing expression.
275: Each alternative of an "or", except the last, ends with a forward-jump
276: of this sort. */
277:
278: char *fixup_jump = 0;
279:
280: /* address of start of the most recently finished expression.
281: This tells postfix * where to find the start of its operand. */
282:
283: char *laststart = 0;
284:
285: /* In processing a repeat, 1 means zero matches is allowed */
286:
287: char zero_times_ok;
288:
289: /* In processing a repeat, 1 means many matches is allowed */
290:
291: char many_times_ok;
292:
293: /* address of beginning of regexp, or inside of last \( */
294:
295: char *begalt = b;
296:
297: /* Stack of information saved by \( and restored by \).
298: Four stack elements are pushed by each \(:
299: First, the value of b.
300: Second, the value of fixup_jump.
301: Third, the value of regnum.
302: Fourth, the value of begalt. */
303:
304: int stackb[40];
305: int *stackp = stackb;
306: int *stacke = stackb + 40;
307: int *stackt;
308:
309: /* Counts \('s as they are encountered. Remembered for the matching \),
310: where it becomes the "register number" to put in the stop_memory command */
311:
312: int regnum = 1;
313:
314: bufp->fastmap_accurate = 0;
315:
316: #ifndef emacs
317: #ifndef SYNTAX_TABLE
318: /*
319: * Initialize the syntax table.
320: */
321: init_syntax_once();
322: #endif
323: #endif
324:
325: if (bufp->allocated == 0)
326: {
327: bufp->allocated = 28;
328: if (bufp->buffer)
329: /* EXTEND_BUFFER loses when bufp->allocated is 0 */
330: bufp->buffer = (char *) realloc (bufp->buffer, 28);
331: else
332: /* Caller did not allocate a buffer. Do it for him */
333: bufp->buffer = (char *) malloc (28);
334: if (!bufp->buffer) goto memory_exhausted;
335: begalt = b = bufp->buffer;
336: }
337:
338: while (p != pend)
339: {
340: if (b - bufp->buffer > bufp->allocated - 10)
341: /* Note that EXTEND_BUFFER clobbers c */
342: EXTEND_BUFFER;
343:
344: PATFETCH (c);
345:
346: switch (c)
347: {
348: case '$':
349: if (obscure_syntax & RE_TIGHT_VBAR)
350: {
351: if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p != pend)
352: goto normal_char;
353: /* Make operand of last vbar end before this `$'. */
354: if (fixup_jump)
355: store_jump (fixup_jump, jump, b);
356: fixup_jump = 0;
357: PATPUSH (endline);
358: break;
359: }
360:
361: /* $ means succeed if at end of line, but only in special contexts.
362: If randomly in the middle of a pattern, it is a normal character. */
363: if (p == pend || *p == '\n'
364: || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
365: || (obscure_syntax & RE_NO_BK_PARENS
366: ? *p == ')'
367: : *p == '\\' && p[1] == ')')
368: || (obscure_syntax & RE_NO_BK_VBAR
369: ? *p == '|'
370: : *p == '\\' && p[1] == '|'))
371: {
372: PATPUSH (endline);
373: break;
374: }
375: goto normal_char;
376:
377: case '^':
378: /* ^ means succeed if at beg of line, but only if no preceding pattern. */
379:
380: if (laststart && p[-2] != '\n'
381: && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
382: goto normal_char;
383: if (obscure_syntax & RE_TIGHT_VBAR)
384: {
385: if (p != pattern + 1
386: && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
387: goto normal_char;
388: PATPUSH (begline);
389: begalt = b;
390: }
391: else
392: PATPUSH (begline);
393: break;
394:
395: case '+':
396: case '?':
397: if (obscure_syntax & RE_BK_PLUS_QM)
398: goto normal_char;
399: handle_plus:
400: case '*':
401: /* If there is no previous pattern, char not special. */
402: if (!laststart && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
403: goto normal_char;
404: /* If there is a sequence of repetition chars,
405: collapse it down to equivalent to just one. */
406: zero_times_ok = 0;
407: many_times_ok = 0;
408: while (1)
409: {
410: zero_times_ok |= c != '+';
411: many_times_ok |= c != '?';
412: if (p == pend)
413: break;
414: PATFETCH (c);
415: if (c == '*')
416: ;
417: else if (!(obscure_syntax & RE_BK_PLUS_QM)
418: && (c == '+' || c == '?'))
419: ;
420: else if ((obscure_syntax & RE_BK_PLUS_QM)
421: && c == '\\')
422: {
423: int c1;
424: PATFETCH (c1);
425: if (!(c1 == '+' || c1 == '?'))
426: {
427: PATUNFETCH;
428: PATUNFETCH;
429: break;
430: }
431: c = c1;
432: }
433: else
434: {
435: PATUNFETCH;
436: break;
437: }
438: }
439:
440: /* Star, etc. applied to an empty pattern is equivalent
441: to an empty pattern. */
442: if (!laststart)
443: break;
444:
445: /* Now we know whether 0 matches is allowed,
446: and whether 2 or more matches is allowed. */
447: if (many_times_ok)
448: {
449: /* If more than one repetition is allowed,
450: put in a backward jump at the end. */
451: store_jump (b, maybe_finalize_jump, laststart - 3);
452: b += 3;
453: }
454: insert_jump (on_failure_jump, laststart, b + 3, b);
455: pending_exact = 0;
456: b += 3;
457: if (!zero_times_ok)
458: {
459: /* At least one repetition required: insert before the loop
460: a skip over the initial on-failure-jump instruction */
461: insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
462: b += 3;
463: }
464: break;
465:
466: case '.':
467: laststart = b;
468: PATPUSH (anychar);
469: break;
470:
471: case '[':
472: while (b - bufp->buffer
473: > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
474: /* Note that EXTEND_BUFFER clobbers c */
475: EXTEND_BUFFER;
476:
477: laststart = b;
478: if (*p == '^')
479: PATPUSH (charset_not), p++;
480: else
481: PATPUSH (charset);
482: p1 = p;
483:
484: PATPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
485: /* Clear the whole map */
486: bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
487: /* Read in characters and ranges, setting map bits */
488: while (1)
489: {
490: PATFETCH (c);
491: if (c == ']' && p != p1 + 1) break;
492: if (*p == '-' && p[1] != ']')
493: {
494: PATFETCH (c1);
495: PATFETCH (c1);
496: while (c <= c1)
497: b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH), c++;
498: }
499: else
500: {
501: b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH);
502: }
503: }
504: /* Discard any bitmap bytes that are all 0 at the end of the map.
505: Decrement the map-length byte too. */
506: while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
507: b[-1]--;
508: b += b[-1];
509: break;
510:
511: case '(':
512: if (! (obscure_syntax & RE_NO_BK_PARENS))
513: goto normal_char;
514: else
515: goto handle_open;
516:
517: case ')':
518: if (! (obscure_syntax & RE_NO_BK_PARENS))
519: goto normal_char;
520: else
521: goto handle_close;
522:
523: case '\n':
524: if (! (obscure_syntax & RE_NEWLINE_OR))
525: goto normal_char;
526: else
527: goto handle_bar;
528:
529: case '|':
530: if (! (obscure_syntax & RE_NO_BK_VBAR))
531: goto normal_char;
532: else
533: goto handle_bar;
534:
535: case '\\':
536: if (p == pend) goto invalid_pattern;
537: PATFETCH_RAW (c);
538: switch (c)
539: {
540: case '(':
541: if (obscure_syntax & RE_NO_BK_PARENS)
542: goto normal_backsl;
543: handle_open:
544: if (stackp == stacke) goto nesting_too_deep;
545: if (regnum < RE_NREGS)
546: {
547: PATPUSH (start_memory);
548: PATPUSH (regnum);
549: }
550: *stackp++ = b - bufp->buffer;
551: *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
552: *stackp++ = regnum++;
553: *stackp++ = begalt - bufp->buffer;
554: fixup_jump = 0;
555: laststart = 0;
556: begalt = b;
557: break;
558:
559: case ')':
560: if (obscure_syntax & RE_NO_BK_PARENS)
561: goto normal_backsl;
562: handle_close:
563: if (stackp == stackb) goto unmatched_close;
564: begalt = *--stackp + bufp->buffer;
565: if (fixup_jump)
566: store_jump (fixup_jump, jump, b);
567: if (stackp[-1] < RE_NREGS)
568: {
569: PATPUSH (stop_memory);
570: PATPUSH (stackp[-1]);
571: }
572: stackp -= 2;
573: fixup_jump = 0;
574: if (*stackp)
575: fixup_jump = *stackp + bufp->buffer - 1;
576: laststart = *--stackp + bufp->buffer;
577: break;
578:
579: case '|':
580: if (obscure_syntax & RE_NO_BK_VBAR)
581: goto normal_backsl;
582: handle_bar:
583: insert_jump (on_failure_jump, begalt, b + 6, b);
584: pending_exact = 0;
585: b += 3;
586: if (fixup_jump)
587: store_jump (fixup_jump, jump, b);
588: fixup_jump = b;
589: b += 3;
590: laststart = 0;
591: begalt = b;
592: break;
593:
594: #ifdef emacs
595: case '=':
596: PATPUSH (at_dot);
597: break;
598:
599: case 's':
600: laststart = b;
601: PATPUSH (syntaxspec);
602: PATFETCH (c);
603: PATPUSH (syntax_spec_code[c]);
604: break;
605:
606: case 'S':
607: laststart = b;
608: PATPUSH (notsyntaxspec);
609: PATFETCH (c);
610: PATPUSH (syntax_spec_code[c]);
611: break;
612: #endif emacs
613:
614: case 'w':
615: laststart = b;
616: PATPUSH (wordchar);
617: break;
618:
619: case 'W':
620: laststart = b;
621: PATPUSH (notwordchar);
622: break;
623:
624: case '<':
625: PATPUSH (wordbeg);
626: break;
627:
628: case '>':
629: PATPUSH (wordend);
630: break;
631:
632: case 'b':
633: PATPUSH (wordbound);
634: break;
635:
636: case 'B':
637: PATPUSH (notwordbound);
638: break;
639:
640: case '`':
641: PATPUSH (begbuf);
642: break;
643:
644: case '\'':
645: PATPUSH (endbuf);
646: break;
647:
648: case '1':
649: case '2':
650: case '3':
651: case '4':
652: case '5':
653: case '6':
654: case '7':
655: case '8':
656: case '9':
657: c1 = c - '0';
658: if (c1 >= regnum)
659: goto normal_char;
660: for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
661: if (*stackt == c1)
662: goto normal_char;
663: laststart = b;
664: PATPUSH (duplicate);
665: PATPUSH (c1);
666: break;
667:
668: case '+':
669: case '?':
670: if (obscure_syntax & RE_BK_PLUS_QM)
671: goto handle_plus;
672:
673: default:
674: normal_backsl:
675: /* You might think it would be useful for \ to mean
676: not to translate; but if we don't translate it
677: it will never match anything. */
678: if (translate) c = translate[c];
679: goto normal_char;
680: }
681: break;
682:
683: default:
684: normal_char:
685: if (!pending_exact || pending_exact + *pending_exact + 1 != b
686: || *pending_exact == 0177 || *p == '*' || *p == '^'
687: || ((obscure_syntax & RE_BK_PLUS_QM)
688: ? *p == '\\' && (p[1] == '+' || p[1] == '?')
689: : (*p == '+' || *p == '?')))
690: {
691: laststart = b;
692: PATPUSH (exactn);
693: pending_exact = b;
694: PATPUSH (0);
695: }
696: PATPUSH (c);
697: (*pending_exact)++;
698: }
699: }
700:
701: if (fixup_jump)
702: store_jump (fixup_jump, jump, b);
703:
704: if (stackp != stackb) goto unmatched_open;
705:
706: bufp->used = b - bufp->buffer;
707: return 0;
708:
709: invalid_pattern:
710: return "Invalid regular expression";
711:
712: unmatched_open:
713: return "Unmatched \\(";
714:
715: unmatched_close:
716: return "Unmatched \\)";
717:
718: end_of_pattern:
719: return "Premature end of regular expression";
720:
721: nesting_too_deep:
722: return "Nesting too deep";
723:
724: too_big:
725: return "Regular expression too big";
726:
727: memory_exhausted:
728: return "Memory exhausted";
729: }
730:
731: /* Store where `from' points a jump operation to jump to where `to' points.
732: `opcode' is the opcode to store. */
733:
734: static int
735: store_jump (from, opcode, to)
736: char *from, *to;
737: char opcode;
738: {
739: from[0] = opcode;
740: from[1] = (to - (from + 3)) & 0377;
741: from[2] = (to - (from + 3)) >> 8;
742: }
743:
744: /* Open up space at char FROM, and insert there a jump to TO.
745: CURRENT_END gives te end of the storage no in use,
746: so we know how much data to copy up.
747: OP is the opcode of the jump to insert.
748:
749: If you call this function, you must zero out pending_exact. */
750:
751: static int
752: insert_jump (op, from, to, current_end)
753: char op;
754: char *from, *to, *current_end;
755: {
756: register char *pto = current_end + 3;
757: register char *pfrom = current_end;
758: while (pfrom != from)
759: *--pto = *--pfrom;
760: store_jump (from, op, to);
761: }
762:
763: /* Given a pattern, compute a fastmap from it.
764: The fastmap records which of the (1 << BYTEWIDTH) possible characters
765: can start a string that matches the pattern.
766: This fastmap is used by re_search to skip quickly over totally implausible text.
767:
768: The caller must supply the address of a (1 << BYTEWIDTH)-byte data area
769: as bufp->fastmap.
770: The other components of bufp describe the pattern to be used. */
771:
772: void
773: re_compile_fastmap (bufp)
774: struct re_pattern_buffer *bufp;
775: {
776: unsigned char *pattern = (unsigned char *) bufp->buffer;
777: int size = bufp->used;
778: register char *fastmap = bufp->fastmap;
779: register unsigned char *p = pattern;
780: register unsigned char *pend = pattern + size;
781: register int j, k;
782: unsigned char *translate = (unsigned char *) bufp->translate;
783:
784: unsigned char *stackb[NFAILURES];
785: unsigned char **stackp = stackb;
786:
787: bzero (fastmap, (1 << BYTEWIDTH));
788: bufp->fastmap_accurate = 1;
789: bufp->can_be_null = 0;
790:
791: while (p)
792: {
793: if (p == pend)
794: {
795: bufp->can_be_null = 1;
796: break;
797: }
798: #ifdef SWITCH_ENUM_BUG
799: switch ((int) ((enum regexpcode) *p++))
800: #else
801: switch ((enum regexpcode) *p++)
802: #endif
803: {
804: case exactn:
805: if (translate)
806: fastmap[translate[p[1]]] = 1;
807: else
808: fastmap[p[1]] = 1;
809: break;
810:
811: case begline:
812: case before_dot:
813: case at_dot:
814: case after_dot:
815: case begbuf:
816: case endbuf:
817: case wordbound:
818: case notwordbound:
819: case wordbeg:
820: case wordend:
821: continue;
822:
823: case endline:
824: if (translate)
825: fastmap[translate['\n']] = 1;
826: else
827: fastmap['\n'] = 1;
828: if (bufp->can_be_null != 1)
829: bufp->can_be_null = 2;
830: break;
831:
832: case finalize_jump:
833: case maybe_finalize_jump:
834: case jump:
835: case dummy_failure_jump:
836: bufp->can_be_null = 1;
837: j = *p++ & 0377;
838: j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
839: p += j + 1; /* The 1 compensates for missing ++ above */
840: if (j > 0)
841: continue;
842: /* Jump backward reached implies we just went through
843: the body of a loop and matched nothing.
844: Opcode jumped to should be an on_failure_jump.
845: Just treat it like an ordinary jump.
846: For a * loop, it has pushed its failure point already;
847: if so, discard that as redundant. */
848: if ((enum regexpcode) *p != on_failure_jump)
849: continue;
850: p++;
851: j = *p++ & 0377;
852: j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
853: p += j + 1; /* The 1 compensates for missing ++ above */
854: if (stackp != stackb && *stackp == p)
855: stackp--;
856: continue;
857:
858: case on_failure_jump:
859: j = *p++ & 0377;
860: j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
861: p++;
862: *++stackp = p + j;
863: continue;
864:
865: case start_memory:
866: case stop_memory:
867: p++;
868: continue;
869:
870: case duplicate:
871: bufp->can_be_null = 1;
872: fastmap['\n'] = 1;
873: case anychar:
874: for (j = 0; j < (1 << BYTEWIDTH); j++)
875: if (j != '\n')
876: fastmap[j] = 1;
877: if (bufp->can_be_null)
878: return;
879: /* Don't return; check the alternative paths
880: so we can set can_be_null if appropriate. */
881: break;
882:
883: case wordchar:
884: for (j = 0; j < (1 << BYTEWIDTH); j++)
885: if (SYNTAX (j) == Sword)
886: fastmap[j] = 1;
887: break;
888:
889: case notwordchar:
890: for (j = 0; j < (1 << BYTEWIDTH); j++)
891: if (SYNTAX (j) != Sword)
892: fastmap[j] = 1;
893: break;
894:
895: #ifdef emacs
896: case syntaxspec:
897: k = *p++;
898: for (j = 0; j < (1 << BYTEWIDTH); j++)
899: if (SYNTAX (j) == (enum syntaxcode) k)
900: fastmap[j] = 1;
901: break;
902:
903: case notsyntaxspec:
904: k = *p++;
905: for (j = 0; j < (1 << BYTEWIDTH); j++)
906: if (SYNTAX (j) != (enum syntaxcode) k)
907: fastmap[j] = 1;
908: break;
909: #endif emacs
910:
911: case charset:
912: for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
913: if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
914: {
915: if (translate)
916: fastmap[translate[j]] = 1;
917: else
918: fastmap[j] = 1;
919: }
920: break;
921:
922: case charset_not:
923: /* Chars beyond end of map must be allowed */
924: for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
925: if (translate)
926: fastmap[translate[j]] = 1;
927: else
928: fastmap[j] = 1;
929:
930: for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
931: if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
932: {
933: if (translate)
934: fastmap[translate[j]] = 1;
935: else
936: fastmap[j] = 1;
937: }
938: break;
939: }
940:
941: /* Get here means we have successfully found the possible starting characters
942: of one path of the pattern. We need not follow this path any farther.
943: Instead, look at the next alternative remembered in the stack. */
944: if (stackp != stackb)
945: p = *stackp--;
946: else
947: break;
948: }
949: }
950:
951: /* Like re_search_2, below, but only one string is specified. */
952:
953: int
954: re_search (pbufp, string, size, startpos, range, regs)
955: struct re_pattern_buffer *pbufp;
956: char *string;
957: int size, startpos, range;
958: struct re_registers *regs;
959: {
960: return re_search_2 (pbufp, 0, 0, string, size, startpos, range, regs, size);
961: }
962:
963: /* Like re_match_2 but tries first a match starting at index STARTPOS,
964: then at STARTPOS + 1, and so on.
965: RANGE is the number of places to try before giving up.
966: If RANGE is negative, the starting positions tried are
967: STARTPOS, STARTPOS - 1, etc.
968: It is up to the caller to make sure that range is not so large
969: as to take the starting position outside of the input strings.
970:
971: The value returned is the position at which the match was found,
972: or -1 if no match was found,
973: or -2 if error (such as failure stack overflow). */
974:
975: int
976: re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop)
977: struct re_pattern_buffer *pbufp;
978: char *string1, *string2;
979: int size1, size2;
980: int startpos;
981: register int range;
982: struct re_registers *regs;
983: int mstop;
984: {
985: register char *fastmap = pbufp->fastmap;
986: register unsigned char *translate = (unsigned char *) pbufp->translate;
987: int total = size1 + size2;
988: int val;
989:
990: /* Update the fastmap now if not correct already */
991: if (fastmap && !pbufp->fastmap_accurate)
992: re_compile_fastmap (pbufp);
993:
994: /* Don't waste time in a long search for a pattern
995: that says it is anchored. */
996: if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
997: && range > 0)
998: {
999: if (startpos > 0)
1000: return -1;
1001: else
1002: range = 1;
1003: }
1004:
1005: while (1)
1006: {
1007: /* If a fastmap is supplied, skip quickly over characters
1008: that cannot possibly be the start of a match.
1009: Note, however, that if the pattern can possibly match
1010: the null string, we must test it at each starting point
1011: so that we take the first null string we get. */
1012:
1013: if (fastmap && startpos < total && pbufp->can_be_null != 1)
1014: {
1015: if (range > 0)
1016: {
1017: register int lim = 0;
1018: register unsigned char *p;
1019: int irange = range;
1020: if (startpos < size1 && startpos + range >= size1)
1021: lim = range - (size1 - startpos);
1022:
1023: p = ((unsigned char *)
1024: &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
1025:
1026: if (translate)
1027: {
1028: while (range > lim && !fastmap[translate[*p++]])
1029: range--;
1030: }
1031: else
1032: {
1033: while (range > lim && !fastmap[*p++])
1034: range--;
1035: }
1036: startpos += irange - range;
1037: }
1038: else
1039: {
1040: register unsigned char c;
1041: if (startpos >= size1)
1042: c = string2[startpos - size1];
1043: else
1044: c = string1[startpos];
1045: c &= 0xff;
1046: if (translate ? !fastmap[translate[c]] : !fastmap[c])
1047: goto advance;
1048: }
1049: }
1050:
1051: if (range >= 0 && startpos == total
1052: && fastmap && pbufp->can_be_null == 0)
1053: return -1;
1054:
1055: val = re_match_2 (pbufp, string1, size1, string2, size2, startpos, regs, mstop);
1056: if (0 <= val)
1057: {
1058: if (val == -2)
1059: return -2;
1060: return startpos;
1061: }
1062:
1063: #ifdef C_ALLOCA
1064: alloca (0);
1065: #endif /* C_ALLOCA */
1066:
1067: advance:
1068: if (!range) break;
1069: if (range > 0) range--, startpos++; else range++, startpos--;
1070: }
1071: return -1;
1072: }
1073:
1074: #ifndef emacs /* emacs never uses this */
1075: int
1076: re_match (pbufp, string, size, pos, regs)
1077: struct re_pattern_buffer *pbufp;
1078: char *string;
1079: int size, pos;
1080: struct re_registers *regs;
1081: {
1082: return re_match_2 (pbufp, 0, 0, string, size, pos, regs, size);
1083: }
1084: #endif /* emacs */
1085:
1086: /* Maximum size of failure stack. Beyond this, overflow is an error. */
1087:
1088: int re_max_failures = 2000;
1089:
1090: static int bcmp_translate();
1091: /* Match the pattern described by PBUFP
1092: against data which is the virtual concatenation of STRING1 and STRING2.
1093: SIZE1 and SIZE2 are the sizes of the two data strings.
1094: Start the match at position POS.
1095: Do not consider matching past the position MSTOP.
1096:
1097: If pbufp->fastmap is nonzero, then it had better be up to date.
1098:
1099: The reason that the data to match are specified as two components
1100: which are to be regarded as concatenated
1101: is so this function can be used directly on the contents of an Emacs buffer.
1102:
1103: -1 is returned if there is no match. -2 is returned if there is
1104: an error (such as match stack overflow). Otherwise the value is the length
1105: of the substring which was matched. */
1106:
1107: int
1108: re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
1109: struct re_pattern_buffer *pbufp;
1110: unsigned char *string1, *string2;
1111: int size1, size2;
1112: int pos;
1113: struct re_registers *regs;
1114: int mstop;
1115: {
1116: register unsigned char *p = (unsigned char *) pbufp->buffer;
1117: register unsigned char *pend = p + pbufp->used;
1118: /* End of first string */
1119: unsigned char *end1;
1120: /* End of second string */
1121: unsigned char *end2;
1122: /* Pointer just past last char to consider matching */
1123: unsigned char *end_match_1, *end_match_2;
1124: register unsigned char *d, *dend;
1125: register int mcnt;
1126: unsigned char *translate = (unsigned char *) pbufp->translate;
1127:
1128: /* Failure point stack. Each place that can handle a failure further down the line
1129: pushes a failure point on this stack. It consists of two char *'s.
1130: The first one pushed is where to resume scanning the pattern;
1131: the second pushed is where to resume scanning the strings.
1132: If the latter is zero, the failure point is a "dummy".
1133: If a failure happens and the innermost failure point is dormant,
1134: it discards that failure point and tries the next one. */
1135:
1136: unsigned char *initial_stack[2 * NFAILURES];
1137: unsigned char **stackb = initial_stack;
1138: unsigned char **stackp = stackb, **stacke = &stackb[2 * NFAILURES];
1139:
1140: /* Information on the "contents" of registers.
1141: These are pointers into the input strings; they record
1142: just what was matched (on this attempt) by some part of the pattern.
1143: The start_memory command stores the start of a register's contents
1144: and the stop_memory command stores the end.
1145:
1146: At that point, regstart[regnum] points to the first character in the register,
1147: regend[regnum] points to the first character beyond the end of the register,
1148: regstart_seg1[regnum] is true iff regstart[regnum] points into string1,
1149: and regend_seg1[regnum] is true iff regend[regnum] points into string1. */
1150:
1151: unsigned char *regstart[RE_NREGS];
1152: unsigned char *regend[RE_NREGS];
1153: unsigned char regstart_seg1[RE_NREGS], regend_seg1[RE_NREGS];
1154:
1155: /* Set up pointers to ends of strings.
1156: Don't allow the second string to be empty unless both are empty. */
1157: if (!size2)
1158: {
1159: string2 = string1;
1160: size2 = size1;
1161: string1 = 0;
1162: size1 = 0;
1163: }
1164: end1 = string1 + size1;
1165: end2 = string2 + size2;
1166:
1167: /* Compute where to stop matching, within the two strings */
1168: if (mstop <= size1)
1169: {
1170: end_match_1 = string1 + mstop;
1171: end_match_2 = string2;
1172: }
1173: else
1174: {
1175: end_match_1 = end1;
1176: end_match_2 = string2 + mstop - size1;
1177: }
1178:
1179: /* Initialize \) text positions to -1
1180: to mark ones that no \( or \) has been seen for. */
1181:
1182: for (mcnt = 0; mcnt < sizeof (regend) / sizeof (*regend); mcnt++)
1183: regend[mcnt] = (unsigned char *) -1;
1184:
1185: /* `p' scans through the pattern as `d' scans through the data.
1186: `dend' is the end of the input string that `d' points within.
1187: `d' is advanced into the following input string whenever necessary,
1188: but this happens before fetching;
1189: therefore, at the beginning of the loop,
1190: `d' can be pointing at the end of a string,
1191: but it cannot equal string2. */
1192:
1193: if (pos <= size1)
1194: d = string1 + pos, dend = end_match_1;
1195: else
1196: d = string2 + pos - size1, dend = end_match_2;
1197:
1198: /* Write PREFETCH; just before fetching a character with *d. */
1199: #define PREFETCH \
1200: while (d == dend) \
1201: { if (dend == end_match_2) goto fail; /* end of string2 => failure */ \
1202: d = string2; /* end of string1 => advance to string2. */ \
1203: dend = end_match_2; }
1204:
1205: /* This loop loops over pattern commands.
1206: It exits by returning from the function if match is complete,
1207: or it drops through if match fails at this starting point in the input data. */
1208:
1209: while (1)
1210: {
1211: if (p == pend)
1212: /* End of pattern means we have succeeded! */
1213: {
1214: /* If caller wants register contents data back, convert it to indices */
1215: if (regs)
1216: {
1217: regs->start[0] = pos;
1218: if (dend == end_match_1)
1219: regs->end[0] = d - string1;
1220: else
1221: regs->end[0] = d - string2 + size1;
1222: for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
1223: {
1224: if (regend[mcnt] == (unsigned char *) -1)
1225: {
1226: regs->start[mcnt] = -1;
1227: regs->end[mcnt] = -1;
1228: continue;
1229: }
1230: if (regstart_seg1[mcnt])
1231: regs->start[mcnt] = regstart[mcnt] - string1;
1232: else
1233: regs->start[mcnt] = regstart[mcnt] - string2 + size1;
1234: if (regend_seg1[mcnt])
1235: regs->end[mcnt] = regend[mcnt] - string1;
1236: else
1237: regs->end[mcnt] = regend[mcnt] - string2 + size1;
1238: }
1239: }
1240: if (dend == end_match_1)
1241: return (d - string1 - pos);
1242: else
1243: return d - string2 + size1 - pos;
1244: }
1245:
1246: /* Otherwise match next pattern command */
1247: #ifdef SWITCH_ENUM_BUG
1248: switch ((int) ((enum regexpcode) *p++))
1249: #else
1250: switch ((enum regexpcode) *p++)
1251: #endif
1252: {
1253:
1254: /* \( is represented by a start_memory, \) by a stop_memory.
1255: Both of those commands contain a "register number" argument.
1256: The text matched within the \( and \) is recorded under that number.
1257: Then, \<digit> turns into a `duplicate' command which
1258: is followed by the numeric value of <digit> as the register number. */
1259:
1260: case start_memory:
1261: regstart[*p] = d;
1262: regstart_seg1[*p++] = (dend == end_match_1);
1263: break;
1264:
1265: case stop_memory:
1266: regend[*p] = d;
1267: regend_seg1[*p++] = (dend == end_match_1);
1268: break;
1269:
1270: case duplicate:
1271: {
1272: int regno = *p++; /* Get which register to match against */
1273: register unsigned char *d2, *dend2;
1274:
1275: d2 = regstart[regno];
1276: dend2 = ((regstart_seg1[regno] == regend_seg1[regno])
1277: ? regend[regno] : end_match_1);
1278: while (1)
1279: {
1280: /* Advance to next segment in register contents, if necessary */
1281: while (d2 == dend2)
1282: {
1283: if (dend2 == end_match_2) break;
1284: if (dend2 == regend[regno]) break;
1285: d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */
1286: }
1287: /* At end of register contents => success */
1288: if (d2 == dend2) break;
1289:
1290: /* Advance to next segment in data being matched, if necessary */
1291: PREFETCH;
1292:
1293: /* mcnt gets # consecutive chars to compare */
1294: mcnt = dend - d;
1295: if (mcnt > dend2 - d2)
1296: mcnt = dend2 - d2;
1297: /* Compare that many; failure if mismatch, else skip them. */
1298: if (translate ? bcmp_translate (d, d2, mcnt, translate) : bcmp (d, d2, mcnt))
1299: goto fail;
1300: d += mcnt, d2 += mcnt;
1301: }
1302: }
1303: break;
1304:
1305: case anychar:
1306: /* fetch a data character */
1307: PREFETCH;
1308: /* Match anything but a newline. */
1309: if ((translate ? translate[*d++] : *d++) == '\n')
1310: goto fail;
1311: break;
1312:
1313: case charset:
1314: case charset_not:
1315: {
1316: /* Nonzero for charset_not */
1317: int not = 0;
1318: register int c;
1319: if (*(p - 1) == (unsigned char) charset_not)
1320: not = 1;
1321:
1322: /* fetch a data character */
1323: PREFETCH;
1324:
1325: if (translate)
1326: c = translate [*d];
1327: else
1328: c = *d;
1329:
1330: if (c < *p * BYTEWIDTH
1331: && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1332: not = !not;
1333:
1334: p += 1 + *p;
1335:
1336: if (!not) goto fail;
1337: d++;
1338: break;
1339: }
1340:
1341: case begline:
1342: if (d == string1 || d[-1] == '\n')
1343: break;
1344: goto fail;
1345:
1346: case endline:
1347: if (d == end2
1348: || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
1349: break;
1350: goto fail;
1351:
1352: /* "or" constructs ("|") are handled by starting each alternative
1353: with an on_failure_jump that points to the start of the next alternative.
1354: Each alternative except the last ends with a jump to the joining point.
1355: (Actually, each jump except for the last one really jumps
1356: to the following jump, because tensioning the jumps is a hassle.) */
1357:
1358: /* The start of a stupid repeat has an on_failure_jump that points
1359: past the end of the repeat text.
1360: This makes a failure point so that, on failure to match a repetition,
1361: matching restarts past as many repetitions have been found
1362: with no way to fail and look for another one. */
1363:
1364: /* A smart repeat is similar but loops back to the on_failure_jump
1365: so that each repetition makes another failure point. */
1366:
1367: case on_failure_jump:
1368: if (stackp == stacke)
1369: {
1370: unsigned char **stackx;
1371: if (stacke - stackb > re_max_failures)
1372: return -2;
1373: stackx = (unsigned char **) alloca (2 * (stacke - stackb)
1374: * sizeof (char *));
1375: bcopy (stackb, stackx, (stacke - stackb) * sizeof (char *));
1376: stackp = stackx + (stackp - stackb);
1377: stacke = stackx + 2 * (stacke - stackb);
1378: stackb = stackx;
1379: }
1380: mcnt = *p++ & 0377;
1381: mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1382: p++;
1383: *stackp++ = mcnt + p;
1384: *stackp++ = d;
1385: break;
1386:
1387: /* The end of a smart repeat has an maybe_finalize_jump back.
1388: Change it either to a finalize_jump or an ordinary jump. */
1389:
1390: case maybe_finalize_jump:
1391: mcnt = *p++ & 0377;
1392: mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1393: p++;
1394: /* Compare what follows with the begining of the repeat.
1395: If we can establish that there is nothing that they would
1396: both match, we can change to finalize_jump */
1397: if (p == pend)
1398: p[-3] = (unsigned char) finalize_jump;
1399: else if (*p == (unsigned char) exactn
1400: || *p == (unsigned char) endline)
1401: {
1402: register int c = *p == (unsigned char) endline ? '\n' : p[2];
1403: register unsigned char *p1 = p + mcnt;
1404: /* p1[0] ... p1[2] are an on_failure_jump.
1405: Examine what follows that */
1406: if (p1[3] == (unsigned char) exactn && p1[5] != c)
1407: p[-3] = (unsigned char) finalize_jump;
1408: else if (p1[3] == (unsigned char) charset
1409: || p1[3] == (unsigned char) charset_not)
1410: {
1411: int not = p1[3] == (unsigned char) charset_not;
1412: if (c < p1[4] * BYTEWIDTH
1413: && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1414: not = !not;
1415: /* not is 1 if c would match */
1416: /* That means it is not safe to finalize */
1417: if (!not)
1418: p[-3] = (unsigned char) finalize_jump;
1419: }
1420: }
1421: p -= 2;
1422: if (p[-1] != (unsigned char) finalize_jump)
1423: {
1424: p[-1] = (unsigned char) jump;
1425: goto nofinalize;
1426: }
1427:
1428: /* The end of a stupid repeat has a finalize-jump
1429: back to the start, where another failure point will be made
1430: which will point after all the repetitions found so far. */
1431:
1432: case finalize_jump:
1433: stackp -= 2;
1434:
1435: case jump:
1436: nofinalize:
1437: mcnt = *p++ & 0377;
1438: mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1439: p += mcnt + 1; /* The 1 compensates for missing ++ above */
1440: break;
1441:
1442: case dummy_failure_jump:
1443: if (stackp == stacke)
1444: {
1445: unsigned char **stackx
1446: = (unsigned char **) alloca (2 * (stacke - stackb)
1447: * sizeof (char *));
1448: bcopy (stackb, stackx, (stacke - stackb) * sizeof (char *));
1449: stackp = stackx + (stackp - stackb);
1450: stacke = stackx + 2 * (stacke - stackb);
1451: stackb = stackx;
1452: }
1453: *stackp++ = 0;
1454: *stackp++ = 0;
1455: goto nofinalize;
1456:
1457: case wordbound:
1458: if (d == string1 /* Points to first char */
1459: || d == end2 /* Points to end */
1460: || (d == end1 && size2 == 0)) /* Points to end */
1461: break;
1462: if ((SYNTAX (d[-1]) == Sword)
1463: != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1464: break;
1465: goto fail;
1466:
1467: case notwordbound:
1468: if (d == string1 /* Points to first char */
1469: || d == end2 /* Points to end */
1470: || (d == end1 && size2 == 0)) /* Points to end */
1471: goto fail;
1472: if ((SYNTAX (d[-1]) == Sword)
1473: != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1474: goto fail;
1475: break;
1476:
1477: case wordbeg:
1478: if (d == end2 /* Points to end */
1479: || (d == end1 && size2 == 0) /* Points to end */
1480: || SYNTAX (* (d == end1 ? string2 : d)) != Sword) /* Next char not a letter */
1481: goto fail;
1482: if (d == string1 /* Points to first char */
1483: || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1484: break;
1485: goto fail;
1486:
1487: case wordend:
1488: if (d == string1 /* Points to first char */
1489: || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1490: goto fail;
1491: if (d == end2 /* Points to end */
1492: || (d == end1 && size2 == 0) /* Points to end */
1493: || SYNTAX (d == end1 ? *string2 : *d) != Sword) /* Next char not a letter */
1494: break;
1495: goto fail;
1496:
1497: #ifdef emacs
1498: case before_dot:
1499: if (((d - string2 <= (unsigned) size2)
1500: ? d - bf_p2 : d - bf_p1)
1501: <= point)
1502: goto fail;
1503: break;
1504:
1505: case at_dot:
1506: if (((d - string2 <= (unsigned) size2)
1507: ? d - bf_p2 : d - bf_p1)
1508: == point)
1509: goto fail;
1510: break;
1511:
1512: case after_dot:
1513: if (((d - string2 <= (unsigned) size2)
1514: ? d - bf_p2 : d - bf_p1)
1515: >= point)
1516: goto fail;
1517: break;
1518:
1519: case wordchar:
1520: mcnt = (int) Sword;
1521: goto matchsyntax;
1522:
1523: case syntaxspec:
1524: mcnt = *p++;
1525: matchsyntax:
1526: PREFETCH;
1527: if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
1528: break;
1529:
1530: case notwordchar:
1531: mcnt = (int) Sword;
1532: goto matchnotsyntax;
1533:
1534: case notsyntaxspec:
1535: mcnt = *p++;
1536: matchnotsyntax:
1537: PREFETCH;
1538: if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
1539: break;
1540: #else
1541: case wordchar:
1542: PREFETCH;
1543: if (SYNTAX (*d++) == 0) goto fail;
1544: break;
1545:
1546: case notwordchar:
1547: PREFETCH;
1548: if (SYNTAX (*d++) != 0) goto fail;
1549: break;
1550: #endif not emacs
1551:
1552: case begbuf:
1553: if (d == string1) /* Note, d cannot equal string2 */
1554: break; /* unless string1 == string2. */
1555: goto fail;
1556:
1557: case endbuf:
1558: if (d == end2 || (d == end1 && size2 == 0))
1559: break;
1560: goto fail;
1561:
1562: case exactn:
1563: /* Match the next few pattern characters exactly.
1564: mcnt is how many characters to match. */
1565: mcnt = *p++;
1566: if (translate)
1567: {
1568: do
1569: {
1570: PREFETCH;
1571: if (translate[*d++] != *p++) goto fail;
1572: }
1573: while (--mcnt);
1574: }
1575: else
1576: {
1577: do
1578: {
1579: PREFETCH;
1580: if (*d++ != *p++) goto fail;
1581: }
1582: while (--mcnt);
1583: }
1584: break;
1585: }
1586: continue; /* Successfully matched one pattern command; keep matching */
1587:
1588: /* Jump here if any matching operation fails. */
1589: fail:
1590: if (stackp != stackb)
1591: /* A restart point is known. Restart there and pop it. */
1592: {
1593: if (!stackp[-2])
1594: { /* If innermost failure point is dormant, flush it and keep looking */
1595: stackp -= 2;
1596: goto fail;
1597: }
1598: d = *--stackp;
1599: p = *--stackp;
1600: if (d >= string1 && d <= end1)
1601: dend = end_match_1;
1602: }
1603: else break; /* Matching at this starting point really fails! */
1604: }
1605: return -1; /* Failure to match */
1606: }
1607:
1608: static int
1609: bcmp_translate (s1, s2, len, translate)
1610: unsigned char *s1, *s2;
1611: register int len;
1612: unsigned char *translate;
1613: {
1614: register unsigned char *p1 = s1, *p2 = s2;
1615: while (len)
1616: {
1617: if (translate [*p1++] != translate [*p2++]) return 1;
1618: len--;
1619: }
1620: return 0;
1621: }
1622:
1623: /* Entry points compatible with bsd4.2 regex library */
1624:
1625: #ifndef emacs
1626:
1627: static struct re_pattern_buffer re_comp_buf;
1628:
1629: char *
1630: re_comp (s)
1631: char *s;
1632: {
1633: if (!s)
1634: {
1635: if (!re_comp_buf.buffer)
1636: return "No previous regular expression";
1637: return 0;
1638: }
1639:
1640: if (!re_comp_buf.buffer)
1641: {
1642: if (!(re_comp_buf.buffer = (char *) malloc (200)))
1643: return "Memory exhausted";
1644: re_comp_buf.allocated = 200;
1645: if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH)))
1646: return "Memory exhausted";
1647: }
1648: return re_compile_pattern (s, strlen (s), &re_comp_buf);
1649: }
1650:
1651: int
1652: re_exec (s)
1653: char *s;
1654: {
1655: int len = strlen (s);
1656: return 0 <= re_search (&re_comp_buf, s, len, 0, len, 0);
1657: }
1658:
1659: #endif /* emacs */
1660:
1661: #ifdef test
1662:
1663: #include <stdio.h>
1664:
1665: /* Indexed by a character, gives the upper case equivalent of the character */
1666:
1667: static char upcase[0400] =
1668: { 000, 001, 002, 003, 004, 005, 006, 007,
1669: 010, 011, 012, 013, 014, 015, 016, 017,
1670: 020, 021, 022, 023, 024, 025, 026, 027,
1671: 030, 031, 032, 033, 034, 035, 036, 037,
1672: 040, 041, 042, 043, 044, 045, 046, 047,
1673: 050, 051, 052, 053, 054, 055, 056, 057,
1674: 060, 061, 062, 063, 064, 065, 066, 067,
1675: 070, 071, 072, 073, 074, 075, 076, 077,
1676: 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1677: 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1678: 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1679: 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
1680: 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1681: 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1682: 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1683: 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
1684: 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
1685: 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
1686: 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
1687: 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
1688: 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
1689: 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
1690: 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
1691: 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
1692: 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
1693: 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
1694: 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
1695: 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
1696: 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
1697: 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
1698: 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
1699: 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
1700: };
1701:
1702: main (argc, argv)
1703: int argc;
1704: char **argv;
1705: {
1706: char pat[80];
1707: struct re_pattern_buffer buf;
1708: int i;
1709: char c;
1710: char fastmap[(1 << BYTEWIDTH)];
1711:
1712: /* Allow a command argument to specify the style of syntax. */
1713: if (argc > 1)
1714: obscure_syntax = atoi (argv[1]);
1715:
1716: buf.allocated = 40;
1717: buf.buffer = (char *) malloc (buf.allocated);
1718: buf.fastmap = fastmap;
1719: buf.translate = upcase;
1720:
1721: while (1)
1722: {
1723: gets (pat);
1724:
1725: if (*pat)
1726: {
1727: re_compile_pattern (pat, strlen(pat), &buf);
1728:
1729: for (i = 0; i < buf.used; i++)
1730: printchar (buf.buffer[i]);
1731:
1732: putchar ('\n');
1733:
1734: printf ("%d allocated, %d used.\n", buf.allocated, buf.used);
1735:
1736: re_compile_fastmap (&buf);
1737: printf ("Allowed by fastmap: ");
1738: for (i = 0; i < (1 << BYTEWIDTH); i++)
1739: if (fastmap[i]) printchar (i);
1740: putchar ('\n');
1741: }
1742:
1743: gets (pat); /* Now read the string to match against */
1744:
1745: i = re_match (&buf, pat, strlen (pat), 0, 0);
1746: printf ("Match value %d.\n", i);
1747: }
1748: }
1749:
1750: #ifdef NOTDEF
1751: print_buf (bufp)
1752: struct re_pattern_buffer *bufp;
1753: {
1754: int i;
1755:
1756: printf ("buf is :\n----------------\n");
1757: for (i = 0; i < bufp->used; i++)
1758: printchar (bufp->buffer[i]);
1759:
1760: printf ("\n%d allocated, %d used.\n", bufp->allocated, bufp->used);
1761:
1762: printf ("Allowed by fastmap: ");
1763: for (i = 0; i < (1 << BYTEWIDTH); i++)
1764: if (bufp->fastmap[i])
1765: printchar (i);
1766: printf ("\nAllowed by translate: ");
1767: if (bufp->translate)
1768: for (i = 0; i < (1 << BYTEWIDTH); i++)
1769: if (bufp->translate[i])
1770: printchar (i);
1771: printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
1772: printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
1773: }
1774: #endif
1775:
1776: printchar (c)
1777: char c;
1778: {
1779: if (c < 041 || c >= 0177)
1780: {
1781: putchar ('\\');
1782: putchar (((c >> 6) & 3) + '0');
1783: putchar (((c >> 3) & 7) + '0');
1784: putchar ((c & 7) + '0');
1785: }
1786: else
1787: putchar (c);
1788: }
1789:
1790: error (string)
1791: char *string;
1792: {
1793: puts (string);
1794: exit (1);
1795: }
1796:
1797: #endif test
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.