|
|
1.1 ! root 1: .TH RE 3 ! 2: .CT 2 data_man ! 3: .SH NAME ! 4: re_bm, re_cw, re_re \(mi string and pattern matching ! 5: .SH SYNOPSIS ! 6: .nf ! 7: .2C ! 8: .B "#include <re.h>" ! 9: .PP ! 10: .B "re_bm *re_bmcomp(b, e, map)" ! 11: .B "char *b, *e;" ! 12: .B "unsigned char map[256];" ! 13: .PP ! 14: .B "int re_bmexec(pat, rdfn, matchfn)" ! 15: .B re_bm *pat; ! 16: .B int (*rdfn)(), (*matchfn)(); ! 17: .PP ! 18: .B void re_bmfree(pat); ! 19: .B re_bm *pat; ! 20: .PP ! 21: .BR "re_cw *re_cwinit(map)" ! 22: .B unsigned char map[256]; ! 23: .PP ! 24: .BR "void re_cwadd(pat, b, e)" ! 25: .B re_cw *pat; ! 26: .B char *b, *e; ! 27: .PP ! 28: .BR "void re_cwcomp(pat)" ! 29: .B re_cw *pat; ! 30: .PP ! 31: .B "int re_cwexec(pat, rdfn, matchfn)" ! 32: .B re_cw *pat; ! 33: .B int (*rdfn)(), (*matchfn)(); ! 34: .PP ! 35: .B void re_cwfree(pat); ! 36: .B re_cw *pat; ! 37: .PP ! 38: .BR "re_re *re_recomp(b, e, map)" ! 39: .B char *b, *e; ! 40: .B unsigned char map[256]; ! 41: .PP ! 42: .B "re_reexec(pat, b, e, match)" ! 43: .B re_re *pat; ! 44: .B char *b, *e, *match[10][2]; ! 45: .PP ! 46: .B void re_refree(pat); ! 47: .B re_re *pat; ! 48: .PP ! 49: .B void re_error(str); ! 50: .B char *str; ! 51: .1C ! 52: .fi ! 53: .SH DESCRIPTION ! 54: These routines search for patterns in strings. ! 55: The ! 56: .I re_re ! 57: routines search for general regular expressions (defined below) ! 58: using a lazily evaluated deterministic finite automaton. ! 59: The more specialized and faster ! 60: .I re_cw ! 61: routines search for multiple literal strings ! 62: using the Commentz-Walter algorithm. ! 63: The still more specialized and efficient ! 64: .I re_bm ! 65: routines search for a single string using the Boyer-Moore algorithm. ! 66: The routines handle strings designated by pointers to ! 67: the first character of the string ! 68: and to the character following the string. ! 69: .PP ! 70: To use the ! 71: .I re_bm ! 72: routines, first build a recognizer by calling ! 73: .I re_bmcomp, ! 74: which takes the search string and a character map; ! 75: all characters are compared after mapping. ! 76: Typically, ! 77: .I map ! 78: is initialized by a loop similar to ! 79: .EE ! 80: for(i = 0; i < 256; i++) map[i] = i; ! 81: .EX ! 82: and its value is no longer required after the call to ! 83: .I re_bmcomp. ! 84: The recognizer can be run (multiple times) by calling ! 85: .I re_bmexec, ! 86: which stops and returns the first non-positive return from either ! 87: .I rdfn ! 88: or ! 89: .IR matchfn . ! 90: The recognizer calls the supplied function ! 91: .I rdfn ! 92: to obtain input and ! 93: .I matchfn ! 94: to report text matching the search string. ! 95: .PP ! 96: .I Rdfn ! 97: should be declared as ! 98: .IP ! 99: .EX ! 100: int rdfn(pb, pe) ! 101: char **pb, **pe; ! 102: .EE ! 103: .LP ! 104: where ! 105: .B *pb ! 106: and ! 107: .B *pe ! 108: delimit an as yet unprocessed text fragment ! 109: (none if ! 110: .LR *pb==*pe ) ! 111: to be saved across the call to ! 112: .IR rdfn . ! 113: On return, ! 114: .B *pb ! 115: and ! 116: .B *pe ! 117: point to the new text, including the saved fragment. ! 118: .I Rdfn ! 119: returns 0 for EOF, negative for error, and positive otherwise. ! 120: The first call to ! 121: .I rdfn ! 122: from each invocation of ! 123: .I re_bmexec ! 124: has ! 125: .BR *pb==0 . ! 126: .PP ! 127: .I Matchfn ! 128: should be declared as ! 129: .IP ! 130: .EX ! 131: int matchfn(pb, pe) ! 132: char **pb, **pe; ! 133: .EE ! 134: .LP ! 135: where ! 136: .B *pb ! 137: and ! 138: .B *pe ! 139: delimit the matched text. ! 140: .I Matchfn ! 141: sets ! 142: .BR *pb , ! 143: .BR *pe , ! 144: and returns a value in the same way as ! 145: .I rdfn. ! 146: .PP ! 147: To use the ! 148: .I re_cw ! 149: routines, first build the recognizer by calling ! 150: .IR re_cwinit , ! 151: then ! 152: .I re_cwadd ! 153: for each string, and finally ! 154: .IR re_cwcomp . ! 155: The recognizer is run by ! 156: .I re_cwexec ! 157: analogously to ! 158: .IR re_bmexec . ! 159: .PP ! 160: A full regular expression recognizer is compiled by ! 161: .I re_recomp ! 162: and executed by ! 163: .I re_reexec, ! 164: which returns 1 if there was a match and 0 if there wasn't. ! 165: The strings that match subexpressions are returned in array ! 166: .I match ! 167: using the above convention. ! 168: .L match[0] ! 169: refers to the whole matched expression. ! 170: If ! 171: .I match ! 172: is zero, then no match delimiters are set. ! 173: .PP ! 174: The routine ! 175: .I re_error ! 176: prints its argument on standard error and exits. ! 177: You may supply your own version for specialized error handling. ! 178: If ! 179: .I re_error ! 180: returns rather than exits, the compiling routines (e.g. ! 181: .IR re_bmcomp ) ! 182: will return 0. ! 183: .PP ! 184: The recognizers that these routines construct occupy storage ! 185: obtained from ! 186: .IR malloc (3). ! 187: The storage can be deallocated by ! 188: .I re_refree. ! 189: .SS Regular Expressions ! 190: The syntax for a regular expression ! 191: .B e0 ! 192: is ! 193: .EX ! 194: e3: literal | charclass | '.' | '^' | '$' | '\e'\fIn\fP | '(' e0 ')' ! 195: ! 196: e2: e3 ! 197: | e2 REP ! 198: REP: '*' | '+' | '?' | '\e{' RANGE '\e}' ! 199: RANGE: int | int ',' | int ',' int ! 200: ! 201: e1: e2 ! 202: | e1 e2 ! 203: ! 204: e0: e1 ! 205: | e0 ALT e1 ! 206: ALT: '|' | newline ! 207: .EE ! 208: .PP ! 209: A literal is any non-metacharacter or a metacharacter ! 210: (one of ! 211: .BR .*+?[]()|\e^$ ) ! 212: preceded by ! 213: .LR \e . ! 214: .PP ! 215: A charclass is a nonempty string ! 216: .I s ! 217: bracketed ! 218: .BI [ \|s\| ] ! 219: (or ! 220: .BI [^ s\| ]\fR); ! 221: it matches any character in (or not in) ! 222: .I s. ! 223: In ! 224: .I s, ! 225: the metacharacters other than ! 226: .L ] ! 227: have no special meaning, and ! 228: .L ] ! 229: may only appear as ! 230: the first letter. ! 231: A substring ! 232: .IB a - b , ! 233: with ! 234: .I a ! 235: and ! 236: .I b ! 237: in ascending ! 238: .SM ASCII ! 239: order, stands for the inclusive ! 240: range of ! 241: .SM ASCII ! 242: characters between ! 243: .I a ! 244: and ! 245: .IR b . ! 246: .PP ! 247: A ! 248: .L \e ! 249: followed by a digit ! 250: .I n ! 251: matches a copy of the string that the ! 252: parenthesized subexpression beginning with the ! 253: .IR n th ! 254: .LR ( , ! 255: counting from 1, matched. ! 256: .PP ! 257: A ! 258: .L . ! 259: matches any character. ! 260: .PP ! 261: A ! 262: .L ^ ! 263: matches the beginning of the input string; ! 264: .L $ ! 265: matches the end. ! 266: .PP ! 267: The ! 268: .B REP ! 269: operators match zero or more ! 270: .RB ( * ), ! 271: one or more ! 272: .RB ( + ), ! 273: zero or one ! 274: .RB ( ? ), ! 275: exactly ! 276: .I m ! 277: .BI \f1(\fP\e{ m \e}\f1),\fP ! 278: .I m ! 279: or more ! 280: .BI \f1(\fP\e{ m ,\e}\f1),\fP ! 281: and any number between ! 282: .I m ! 283: and ! 284: .I n ! 285: inclusive ! 286: .BI \f1(\fP\e{ m , n \e}\f1),\fP ! 287: instances respectively of the preceding regular expression ! 288: .BR e2 . ! 289: .PP ! 290: A concatenated regular expression, ! 291: .BR "e1 e2" , ! 292: matches a match to ! 293: .B e1 ! 294: followed by a match to ! 295: .BR e2 . ! 296: .PP ! 297: An alternative regular expression, ! 298: .BR "e0 ALT e1" , ! 299: matches either a match to ! 300: .B e0 ! 301: or a match to ! 302: .BR e1 . ! 303: .PP ! 304: A match to any part of a regular expression ! 305: extends as far as possible without preventing ! 306: a match to the remainder of the regular expression. ! 307: .SH SEE ALSO ! 308: .IR regexp (3), ! 309: .IR gre (1) ! 310: .SH DIAGNOSTICS ! 311: Routines that return pointers return 0 on error. ! 312: .SH BUGS ! 313: Between ! 314: .IR re (3) ! 315: and ! 316: .IR regexp (3) ! 317: there are too many routines.
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.