|
|
1.1 ! root 1: .TH RE 3 ! 2: .CT 2 data_man ! 3: .SH NAME ! 4: re_bm, re_cw, re_re \(mi string and pattern matching ! 5: .SH SYNOPSIS ! 6: .nf ! 7: .2C ! 8: .B "#include <re.h>" ! 9: .PP ! 10: .B "re_bm *re_bmcomp(b, e, map)" ! 11: .B "char *b, *e;" ! 12: .B "unsigned char map[256];" ! 13: .PP ! 14: .B "int re_bmexec(pat, rdfn, matchfn)" ! 15: .B re_bm *pat; ! 16: .B int (*rdfn)(), (*matchfn)(); ! 17: .PP ! 18: .B void re_bmfree(pat); ! 19: .B re_bm *pat; ! 20: .PP ! 21: .BR "re_cw *re_cwinit(map)" ! 22: .B unsigned char map[256]; ! 23: .PP ! 24: .BR "void re_cwadd(pat, b, e)" ! 25: .B re_cw *pat; ! 26: .B char *b, *e; ! 27: .PP ! 28: .BR "void re_cwcomp(pat)" ! 29: .B re_cw *pat; ! 30: .PP ! 31: .B "int re_cwexec(pat, rdfn, matchfn)" ! 32: .B re_cw *pat; ! 33: .B int (*rdfn)(), (*matchfn)(); ! 34: .PP ! 35: .B void re_cwfree(pat); ! 36: .B re_cw *pat; ! 37: .PP ! 38: .BR "re_re *re_recomp(b, e, map)" ! 39: .B char *b, *e; ! 40: .B unsigned char map[256]; ! 41: .PP ! 42: .B "int re_reexec(pat, b, e, match)" ! 43: .B re_re *pat; ! 44: .B char *b, *e, *match[10][2]; ! 45: .PP ! 46: .B void re_refree(pat); ! 47: .B re_re *pat; ! 48: .PP ! 49: .B void re_error(str); ! 50: .B char *str; ! 51: .1C ! 52: .fi ! 53: .SH DESCRIPTION ! 54: These routines search for patterns in strings. ! 55: The ! 56: .I re_re ! 57: routines search for general regular expressions (defined below) ! 58: using a lazily evaluated deterministic finite automaton. ! 59: The more specialized and faster ! 60: .I re_cw ! 61: routines search for multiple literal strings ! 62: using the Commentz-Walter algorithm. ! 63: The still more specialized and efficient ! 64: .I re_bm ! 65: routines search for a single string using the Boyer-Moore algorithm. ! 66: The routines handle strings designated by pointers to ! 67: the first character of the string ! 68: and to the character following the string. ! 69: .PP ! 70: To use the ! 71: .I re_bm ! 72: routines, first build a recognizer by calling ! 73: .IR re_bmcomp , ! 
74: which takes the search string and a character map; ! 75: all characters are compared after mapping. ! 76: The recognizer can be run (multiple times) by calling ! 77: .IR re_bmexec , ! 78: which stops and returns the first non-positive return from either ! 79: .I rdfn ! 80: or ! 81: .IR matchfn . ! 82: The recognizer calls the supplied function ! 83: .I rdfn ! 84: to obtain input and ! 85: .I matchfn ! 86: to report text matching the search string. ! 87: .PP ! 88: .I Rdfn ! 89: should be declared as ! 90: .IP ! 91: .EX ! 92: int rdfn(pb, pe) ! 93: char **pb, **pe; ! 94: .EE ! 95: .LP ! 96: where ! 97: .B *pb ! 98: and ! 99: .B *pe ! 100: delimit an as yet unprocessed text fragment ! 101: (none if ! 102: .LR *pb==*pe ) ! 103: to be saved across the call to ! 104: .IR rdfn . ! 105: On return, ! 106: .B *pb ! 107: and ! 108: .B *pe ! 109: point to the new text, including the saved fragment. ! 110: .I Rdfn ! 111: returns 0 for EOF, negative for error, and positive otherwise. ! 112: The first call to ! 113: .I rdfn ! 114: from each invocation of ! 115: .I re_bmexec ! 116: has ! 117: .BR *pb==0 . ! 118: .PP ! 119: .I Matchfn ! 120: should be declared as ! 121: .IP ! 122: .EX ! 123: int matchfn(pb, pe) ! 124: char **pb, **pe; ! 125: .EE ! 126: .LP ! 127: where ! 128: .B *pb ! 129: and ! 130: .B *pe ! 131: delimit the matched text. ! 132: .I Matchfn ! 133: sets ! 134: .BR *pb , ! 135: .BR *pe , ! 136: and returns a value in the same way as ! 137: .IR rdfn . ! 138: .PP ! 139: To use the ! 140: .I re_cw ! 141: routines, first build the recognizer by calling ! 142: .IR re_cwinit , ! 143: then ! 144: .I re_cwadd ! 145: for each string, and finally ! 146: .IR re_cwcomp . ! 147: The recognizer is run by ! 148: .I re_cwexec ! 149: analogously to ! 150: .IR re_bmexec . ! 151: .PP ! 152: A full regular expression recognizer is compiled by ! 153: .I re_recomp ! 154: and executed by ! 155: .IR re_reexec , ! 156: which returns 1 if there was a match and 0 if there wasn't. ! 
157: The strings that match subexpressions are returned in array ! 158: .IR match . ! 159: .L match[0] ! 160: refers to the whole matched expression. ! 161: If ! 162: .I match ! 163: is zero, then no match delimiters are set. ! 164: .PP ! 165: The routine ! 166: .I re_error ! 167: prints its argument on standard error and exits. ! 168: You may supply your own version for specialized error handling. ! 169: .PP ! 170: The recognizers that these routines construct occupy storage ! 171: obtained from ! 172: .IR malloc (3). ! 173: The storage can be deallocated by ! 174: .IR free . ! 175: .SS Regular Expressions ! 176: The syntax for a regular expression ! 177: .B e0 ! 178: is ! 179: .EX ! 180: e3: literal | charclass | '.' | '^' | '$' | '\e'\fIn\fP | '(' e0 ')' ! 181: ! 182: e2: e3 ! 183: | e2 REP ! 184: REP: '*' | '+' | '?' ! 185: ! 186: e1: e2 ! 187: | e1 e2 ! 188: ! 189: e0: e1 ! 190: | e0 ALT e1 ! 191: ALT: '|' | newline ! 192: .EE ! 193: .PP ! 194: A literal is any non-metacharacter or a metacharacter ! 195: (one of ! 196: .BR .*+?[]()|\e^$ ) ! 197: preceded by ! 198: .LR \e . ! 199: .PP ! 200: A charclass is a nonempty string ! 201: .I s ! 202: bracketed ! 203: .BI [ \|s\| ] ! 204: (or ! 205: .BI [^ s\| ]\fR); ! 206: it matches any character in (or not in) ! 207: .IR s . ! 208: In ! 209: .IR s , ! 210: the metacharacters other than ! 211: .L ] ! 212: have no special meaning, and ! 213: .L ] ! 214: may only appear as ! 215: the first letter. ! 216: A substring ! 217: .IB a - b , ! 218: with ! 219: .I a ! 220: and ! 221: .I b ! 222: in ascending ! 223: .SM ASCII ! 224: order, stands for the inclusive ! 225: range of ! 226: .SM ASCII ! 227: characters between ! 228: .I a ! 229: and ! 230: .IR b . ! 231: .PP ! 232: A ! 233: .L \e ! 234: followed by a digit ! 235: .I n ! 236: matches a copy of the string that the ! 237: parenthesized subexpression beginning with the ! 238: .IR n th ! 239: .LR ( , ! 240: counting from 1, matched. ! 241: .PP ! 242: A ! 243: .L . ! 
244: matches any character. ! 245: .PP ! 246: A ! 247: .L ^ ! 248: matches the beginning of the input string; ! 249: .L $ ! 250: matches the end. ! 251: .PP ! 252: The ! 253: .B REP ! 254: operators match zero or more ! 255: .RB ( * ), ! 256: one or more ! 257: .RB ( + ), ! 258: and zero or one ! 259: .RB ( ? ) ! 260: instances respectively of the preceding regular expression ! 261: .BR e2 . ! 262: .PP ! 263: A concatenated regular expression, ! 264: .BR "e1 e2" , ! 265: matches a match to ! 266: .B e1 ! 267: followed by a match to ! 268: .BR e2 . ! 269: .PP ! 270: An alternative regular expression, ! 271: .BR "e0 ALT e1" , ! 272: matches either a match to ! 273: .B e0 ! 274: or a match to ! 275: .BR e1 . ! 276: .PP ! 277: A match to any part of a regular expression ! 278: extends as far as possible without preventing ! 279: a match to the remainder of the regular expression. ! 280: .SH SEE ALSO ! 281: .IR regexp (3), ! 282: .IR gre (1) ! 283: .SH DIAGNOSTICS ! 284: Routines that return pointers return 0 on error. ! 285: .SH BUGS ! 286: Between ! 287: .IR re (3) ! 288: and ! 289: .IR regexp (3) ! 290: there are too many routines.
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.