|
|
1.1 root 1: /* m_getfld.c - read/parse a message */
2:
3: #include "../h/mh.h"
4: #include <stdio.h>
5: #include "../zotnet/mts.h"
6: #include <ctype.h>
7:
8:
9: /* This module has a long and checkered history. First, it didn't burst
10: maildrops correctly because it considered two CTRL-A:s in a row to be
11: an inter-message delimiter. It really is four CTRL-A:s followed by a
12: newline. Unfortunately, MMDF will convert this delimiter *inside* a
13: message to a CTRL-B followed by three CTRL-A:s and a newline. This
14: caused the old version of m_getfld() to declare eom prematurely. The
15: fix was a lot slower than
16:
17: c == '\001' && peekc (iob) == '\001'
18:
19: but it worked, and to increase generality, UUCP style maildrops could
20: be parsed as well. Unfortunately the speed issue finally caught up with
21: us since this routine is at the very heart of MH.
22:
23: To speed things up considerably, the routine Eom() was made an auxiliary
24: function called by the macro eom(). Unless we are bursting a maildrop,
25: the eom() macro returns FALSE saying we aren't at the end of the
26: message.
27:
28: The next thing to do is to read the mtstailor file and initialize
29: delimiter[] and delimlen accordingly...
30:
31: After mhl was made a built-in in msh, m_getfld() worked just fine
32: (using m_unknown() at startup). Until one day: a message which was
33: the result of a bursting was shown. Then, since the burst boundaries
34: aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
35: Very sad. The solution: introduce m_eomsbr(). This hook gets called
36: after the end of each line (since testing for eom involves an fseek()).
37: This worked fine, until one day: a message with no body portion arrived.
38: Then the
39:
40: while (eom (c = Getc (iob), iob))
41: continue;
42:
43: loop caused m_getfld() to return FMTERR. So, that logic was changed to
44: check for (*eom_action) and act accordingly.
45:
46: This worked fine, until one day: someone didn't use four CTRL-A:s as
47: their delimiters. So, the bullet got bit and we read mts.h and
48: continue to struggle on. It's not that bad though, since the only time
49: the code gets executed is when inc (or msh) calls it, and both of these
50: have already called mts_init().
51:
52: ------------------------
53: (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
54:
55: This routine was accounting for 60% of the cpu time used by most mh
56: programs. I spent a bit of time tuning and it now accounts for <10%
57: of the time used. Like any heavily tuned routine, it's a bit
58: complex and you want to be sure you understand everything that it's
59: doing before you start hacking on it. Let me try to emphasize
60: that: every line in this atrocity depends on every other line,
61: sometimes in subtle ways. You should understand it all, in detail,
62: before trying to change any part. If you do change it, test the
63: result thoroughly (I use a hand-constructed test file that exercises
64: all the ways a header name, header body, header continuation,
65: header-body separator, body line and body eom can align themselves
66: with respect to a buffer boundary). "Minor" bugs in this routine
67: result in garbaged or lost mail.
68:
69: If you hack on this and slow it down, I, my children and my
70: children's children will curse you.
71:
72: This routine gets used on three different types of files: normal,
73: single msg files, "packed" unix or mmdf mailboxes (when used by inc)
74: and packed, directoried bulletin board files (when used by msh).
75: The biggest impact of different file types is in "eom" testing. The
76: code has been carefully organized to test for eom at appropriate
77: times and at no other times (since the check is quite expensive).
78: I have tried to arrange things so that the eom check need only be
79: done on entry to this routine. Since an eom can only occur after a
80: newline, this is easy to manage for header fields. For the msg
81: body, we try to efficiently search the input buffer to see if
82: it contains the eom delimiter. If it does, we take up to the
83: delimiter, otherwise we take everything in the buffer. (The change
84: to the body eom/copy processing produced the most noticeable
85: performance difference, particularly for "inc" and "show".)
86:
87: There are three qualitatively different things this routine busts
88: out of a message: field names, field text and msg bodies. Field
89: names are typically short (~8 char) and the loop that extracts them
90: might terminate on a colon, newline or max width. I considered
91: using a Vax "scanc" to locate the end of the field followed by a
92: "bcopy" but the routine call overhead on a Vax is too large for this
93: to work on short names. If Berkeley ever makes "inline" part of the
94: C optimiser (so things like "scanc" turn into inline instructions) a
95: change here would be worthwhile.
96:
97: Field text is typically 60 - 100 characters so there's (barely)
98: a win in doing a routine call to something that does a "locc"
99: followed by a "bmove". About 30% of the fields have continuations
100: (usually the 822 "received:" lines) and each continuation generates
101: another routine call. "Inline" would be a big win here, as well.
102:
103: Messages, as of this writing, seem to come in two flavors: small
104: (~1K) and long (>2K). Most messages have 400 - 600 bytes of headers
105: so message bodies average at least a few hundred characters.
106: Assuming your system uses reasonably sized stdio buffers (1K or
107: more), this routine should be able to remove the body in large
108: (>500 byte) chunks. This makes the cost of a call to "bcopy"
109: small but there is a premium on checking for the eom in packed
110: maildrops. The eom pattern is always a simple string so we can
111: construct an efficient pattern matcher for it (e.g., a Vax "matchc"
112: instruction). Some thought went into recognizing the start of
113: an eom that has been split across two buffers.
114:
115: This routine wants to deal with large chunks of data so, rather
116: than "getc" into a local buffer, it uses stdio's buffer. If
117: you try to use it on a non-buffered file, you'll get what you
118: deserve. This routine "knows" that struct FILEs have a _ptr
119: and a _cnt to describe the current state of the buffer and
120: it knows that _filbuf ignores the _ptr & _cnt and simply fills
121: the buffer. If stdio on your system doesn't work this way, you
122: may have to make small changes in this routine.
123:
124: This routine also "knows" that an EOF indication on a stream is
125: "sticky" (i.e., you will keep getting EOF until you reposition the
126: stream). If your system doesn't work this way it is broken and you
127: should complain to the vendor. As a consequence of the sticky
128: EOF, this routine will never return any kind of EOF status when
129: there is data in "name" or "buf".
130: */
131:
132:
133: #define Getc(iob) getc(iob)
134: #define eom(c,iob) (msg_style != MS_DEFAULT && \
135: (((c) == *msg_delim && m_Eom(c,iob)) ||\
136: (eom_action && (*eom_action)(c))))
137:
138: static char *matchc();
139: static char *locc();
140:
141: static char **pat_map;
142:
143: int msg_count = 0; /* disgusting hack for "inc" so it can
144: * know how many characters were stuffed
145: * in the buffer on the last call (see
146: * comments in uip/scansbr.c) */
147:
148: int msg_style = MS_DEFAULT;
149: /*
150: * The "full" delimiter string for a packed maildrop consists
151: * of a newline followed by the actual delimiter. E.g., the
152: * full string for a Unix maildrop would be: "\n\nFrom ".
153: * "Fdelim" points to the start of the full string and is used
154: * in the BODY case of the main routine to search the buffer for
155: * a possible eom. Msg_delim points to the first character of
156: * the actual delim. string (i.e., fdelim+1). Edelim
157: * points to the 2nd character of actual delimiter string. It
158: * is used in m_Eom because the first character of the string
159: * has been read and matched before m_Eom is called.
160: */
161: char *msg_delim = "";
162: static char *fdelim;
163: static char *delimend;
164: static int fdelimlen;
165: static char *edelim;
166: static int edelimlen;
167:
168: static int (*eom_action) () = NULL;
169:
170: /* */
171:
172: m_getfld (state, name, buf, bufsz, iob)
173: int state;
174: int bufsz;
175: char *name,
176: *buf;
177: register FILE *iob;
178: {
179: register char *cp;
180: register char *bp;
181: register int cnt;
182: register int c;
183: register int i;
184: register int j;
185: register char *ep;
186: register char *sp;
187:
188: if ((c = Getc(iob)) < 0) {
189: msg_count = 0;
190: *buf = 0;
191: return FILEEOF;
192: }
193: if (eom (c, iob)) {
194: if (! eom_action) {
195: /* flush null messages */
196: while ((c = Getc(iob)) >= 0 && eom (c, iob))
197: ;
198: if (c >= 0)
199: (void) ungetc(c, iob);
200: }
201: msg_count = 0;
202: *buf = 0;
203: return FILEEOF;
204: }
205:
206: switch (state) {
207: case FLDEOF:
208: case BODYEOF:
209: case FLD:
210: if (c == '\n' || c == '-') {
211: /* we hit the header/body separator */
212: while (c != '\n' && (c = Getc(iob)) >= 0)
213: ;
214:
215: if (c < 0 || (c = Getc(iob)) < 0 || eom (c, iob)) {
216: if (! eom_action) {
217: /* flush null messages */
218: while ((c = Getc(iob)) >= 0 && eom (c, iob))
219: ;
220: if (c >= 0)
221: (void) ungetc(c, iob);
222: }
223: msg_count = 0;
224: *buf = 0;
225: return FILEEOF;
226: }
227: state = BODY;
228: goto body;
229: }
230: /*
231: * get the name of this component. take characters up
232: * to a ':', a newline or NAMESZ-1 characters, whichever
233: * comes first.
234: */
235: cp = name; i = NAMESZ - 1;
236: for (;;) {
237: bp = sp = iob->_ptr - 1;
238: j = (cnt = iob->_cnt+1) < i ? cnt : i;
239: while ((c = *bp++) != ':' && c != '\n' && --j >= 0)
240: *cp++ = c;
241:
242: j = bp - sp;
243: if ((cnt -= j) <= 0) {
244: if (_filbuf(iob) == EOF) {
245: *cp = *buf = NULL;
246: advise (NULLCP, "eof encountered in field \"%s\"",
247: name);
248: return FMTERR;
249: }
250: } else {
251: iob->_ptr = bp + 1;
252: iob->_cnt = cnt - 1;
253: }
254: if (c == ':')
255: break;
256:
257: /*
258: * something went wrong. possibilities are:
259: * . hit a newline (error)
260: * . got more than namesz chars. (error)
261: * . hit the end of the buffer. (loop)
262: */
263: if (c == '\n') {
264: *cp = *buf = NULL;
265: advise (NULLCP, "eol encountered in field \"%s\"", name);
266: state = FMTERR;
267: goto finish;
268: }
269: if ((i -= j) <= 0) {
270: *cp = *buf = NULL;
271: advise (NULLCP, "field name \"%s\" exceeds %d bytes",
272: name, NAMESZ - 1);
273: state = LENERR;
274: goto finish;
275: }
276: }
277:
278: while (isspace (*--cp) && cp >= name)
279: ;
280: *++cp = NULL;
281: /* fall through */
282:
283: case FLDPLUS:
284: /*
285: * get (more of) the text of a field. take
286: * characters up to the end of this field (newline
287: * followed by non-blank) or bufsz-1 characters.
288: */
289: cp = buf; i = bufsz-1;
290: for (;;) {
291: cnt = iob->_cnt++; bp = --iob->_ptr;
292: c = cnt < i ? cnt : i;
293: while (ep = locc( c, bp, '\n' )) {
294: /*
295: * if we hit the end of this field, return.
296: */
297: if ((j = *++ep) != ' ' && j != '\t') {
298: j = ep - iob->_ptr;
299: (void) bcopy( iob->_ptr, cp, j);
300: iob->_ptr = ep; iob->_cnt -= j;
301: cp += j;
302: state = FLD;
303: goto finish;
304: }
305: c -= ep - bp; bp = ep;
306: }
307: /*
308: * end of input or dest buffer - copy what we've found.
309: */
310: c += bp - iob->_ptr;
311: (void) bcopy( iob->_ptr, cp, c);
312: i -= c; cp += c;
313: if (i <= 0) {
314: /* the dest buffer is full */
315: iob->_cnt -= c; iob->_ptr += c;
316: state = FLDPLUS;
317: break;
318: }
319: /*
320: * There's one character left in the input buffer.
321: * Copy it & fill the buffer. If the last char
322: * was a newline and the next char is not whitespace,
323: * this is the end of the field. Otherwise loop.
324: */
325: --i;
326: *cp++ = j = *(iob->_ptr + c);
327: c = _filbuf(iob);
328: if (j == '\n' && c != ' ' && c != '\t') {
329: if (c != EOF)
330: --iob->_ptr, ++iob->_cnt;
331: state = FLD;
332: break;
333: }
334: }
335: break;
336:
337: case BODY:
338: body:
339: /*
340: * get the message body up to bufsz characters or the
341: * end of the message. Sleazy hack: if bufsz is negative
342: * we assume that we were called to copy directly into
343: * the output buffer and we don't add an eos.
344: */
345: i = (bufsz < 0) ? -bufsz : bufsz-1;
346: bp = --iob->_ptr; cnt = ++iob->_cnt;
347: c = (cnt < i ? cnt : i);
348: if (msg_style != MS_DEFAULT && c > 1) {
349: /*
350: * packed maildrop - only take up to the (possible)
351: * start of the next message. This "matchc" should
352: * probably be a Boyer-Moore matcher for non-vaxen,
353: * particularly since we have the alignment table
354: * all built for the end-of-buffer test (next).
355: * But our vax timings indicate that the "matchc"
356: * instruction is 50% faster than a carefully coded
357: * B.M. matcher for most strings. (So much for elegant
358: * algorithms vs. brute force.) Since I (currently)
359: * run MH on a vax, we use the matchc instruction. --vj
360: */
361: if (ep = matchc( fdelimlen, fdelim, c, bp ) )
362: c = ep - bp + 1;
363: else {
364: /*
365: * There's no delim in the buffer but there may be
366: * a partial one at the end. If so, we want to leave
367: * it so the "eom" check on the next call picks it up.
368: * Use a modified Boyer-Moore matcher to make this
369: * check relatively cheap. The first "while" figures
370: * out what position in the pattern matches the last
371: * character in the buffer. The inner "while" matches
372: * the pattern against the buffer, backwards starting
373: * at that position. Note that unless the buffer
374: * ends with one of the characters in the pattern
375: * (excluding the first and last), we do only one test.
376: */
377: sp = delimend;
378: ep = bp + c - 1;
379: while ((cp = pat_map[*ep]) < sp) {
380: ep = bp + c - 1; sp = cp;
381: while (*--ep == *--cp && cp > fdelim)
382: ;
383: if (cp == fdelim) {
384: if (*ep == *cp && ep > bp)
385: c = (ep - bp) + 1;
386: break;
387: }
388: }
389: }
390: }
391: (void) bcopy( bp, buf, c );
392: iob->_cnt -= c;
393: iob->_ptr += c;
394: if (bufsz < 0) {
395: msg_count = c;
396: return (state);
397: }
398: cp = buf + c;
399: break;
400:
401: default:
402: adios (NULLCP, "m_getfld() called with bogus state of %d", state);
403: }
404: finish:;
405: *cp = NULL;
406: msg_count = cp - buf;
407: return (state);
408: }
409:
410: /* */
411:
412: #ifdef RPATHS
413: static char unixbuf[BUFSIZ] = "";
414: #endif RPATHS
415:
416: void m_unknown (iob)
417: register FILE *iob;
418: {
419: register int c;
420: register long pos;
421: char text[10];
422: register char *cp;
423:
424: msg_style = MS_UNKNOWN;
425:
426: /* Figure out what the message delimitter string is for this
427: * maildrop. (This used to be part of m_Eom but I didn't like
428: * the idea of an "if" statement that could only succeed on the
429: * first call to m_Eom getting executed on each call, i.e., at
430: * every newline in the message).
431: *
432: * If the first line of the maildrop is a Unix "from" line, we say the
433: * style is UUCP and eat the rest of the line. Otherwise we say the style
434: * is MMDF & look for the delimiter string specified when MH was built
435: * (or from the mtstailor file).
436: */
437: pos = ftell (iob);
438: if (fread (text, sizeof *text, 5, iob) == 5
439: && strncmp (text, "From ", 5) == 0) {
440: msg_style = MS_UUCP;
441: fdelim = "\n\nFrom ";
442: #ifndef RPATHS
443: while ((c = getc (iob)) != '\n' && c >= 0)
444: ;
445: #else RPATHS
446: cp = unixbuf;
447: while ((c = getc (iob)) != '\n')
448: *cp++ = c;
449: *cp = NULL;
450: #endif RPATHS
451: } else {
452: /* not a Unix style maildrop */
453: (void) fseek (iob, pos, 0);
454: if (mmdlm2 == NULLCP || *mmdlm2 == NULL)
455: mmdlm2 = "\001\001\001\001\n";
456: fdelim = (char *)malloc((unsigned)strlen(mmdlm2)+2);
457: *fdelim = '\n';
458: (void)strcpy(fdelim+1, mmdlm2);
459: msg_style = MS_MMDF;
460: }
461: fdelimlen = strlen(fdelim);
462: msg_delim = fdelim+1;
463: edelim = msg_delim+1;
464: edelimlen = fdelimlen-2;
465: delimend = msg_delim + edelimlen;
466: if (edelimlen <= 1)
467: adios (NULLCP, "maildrop delimiter must be at least 2 bytes");
468: /*
469: * build a Boyer-Moore end-position map for the matcher in m_getfld.
470: * N.B. - we don't match just the first char (since it's the newline
471: * separator) or the last char (since the matchc would have found it
472: * if it was a real delim).
473: */
474: pat_map = (char **) malloc( 256 * sizeof (char *));
475: for (c = 256; c--; )
476: pat_map[c] = delimend + 1;
477:
478: for (cp = fdelim + 1; cp < delimend; cp++ )
479: pat_map[*cp] = cp;
480:
481: if (msg_style == MS_MMDF) {
482: /* flush extra msg hdrs */
483: while ((c = Getc(iob)) >= 0 && eom (c, iob))
484: ;
485: if (c >= 0)
486: (void) ungetc(c, iob);
487: }
488: }
489:
490:
491: void m_eomsbr (action)
492: int (*action) ();
493: {
494: if (eom_action = action) {
495: msg_style = MS_MSH;
496: *msg_delim = 0;
497: fdelimlen = 1;
498: delimend = fdelim;
499: } else {
500: msg_style = MS_MMDF;
501: msg_delim = fdelim + 1;
502: fdelimlen = strlen (fdelim);
503: delimend = msg_delim + edelimlen;
504: }
505: }
506:
507: /* */
508:
509: /* test for msg delimiter string */
510:
511: int m_Eom (c, iob)
512: register int c;
513: register FILE *iob;
514: {
515: register long pos = 0L;
516: register int i;
517: char text[10];
518: #ifdef RPATHS
519: register char *cp;
520: #endif RPATHS
521:
522: pos = ftell (iob);
523: if ((i = fread (text, sizeof *text, edelimlen, iob)) != edelimlen
524: || strncmp (text, edelim, edelimlen)) {
525: if (i == 0 && msg_style == MS_UUCP)
526: /* the final newline in the (brain damaged) unix-format
527: * maildrop is part of the delimitter - delete it.
528: */
529: return 1;
530:
531: (void) fseek (iob, pos, 0);
532: return 0;
533: }
534:
535: if (msg_style == MS_UUCP) {
536: #ifndef RPATHS
537: while ((c = getc (iob)) != '\n')
538: if (c < 0)
539: break;
540: #else RPATHS
541: cp = unixbuf;
542: while ((c = getc (iob)) != '\n' && c >= 0)
543: *cp++ = c;
544: *cp = NULL;
545: #endif RPATHS
546: }
547:
548: return 1;
549: }
550:
551: /* */
552:
#ifdef RPATHS
/*
 * unixline - turn the saved Unix "From " line in unixbuf into a
 * return-path line.
 *
 * When the saved line contains a blank, a "remote from HOST" suffix
 * (if present) is cut off and emitted first as "HOST!"; the line is
 * then cut 25 characters before that point (or before its end when
 * there is no such suffix), provided the cut does not fall before the
 * first blank.  What is left of unixbuf is appended with a newline.
 * unixbuf is emptied afterwards.  The result lives in a static buffer
 * that is overwritten by the next call.
 */
char *unixline () {
    static char unixfrom[BUFSIZ];
    register char *out = unixfrom;
    register char *blank;
    register char *scan;

    blank = index (unixbuf, ' ');
    if (blank != NULL) {
        for (scan = index (blank + 1, 'r');
                scan != NULL;
                scan = index (scan + 1, 'r')) {
            if (strncmp (scan, "remote from ", 12) != 0)
                continue;
            *scan = '\0';
            (void) sprintf (out, "%s!", scan + 12);
            out += strlen (out);
            break;
        }
        if (scan == NULL)
            scan = unixbuf + strlen (unixbuf);
        scan -= 25;
        if (scan >= blank)
            *scan = '\0';
    }

    (void) sprintf (out, "%s\n", unixbuf);
    unixbuf[0] = '\0';
    return unixfrom;
}
#endif /* RPATHS */
580:
581: /* */
582:
/*
 * matchc - find the first occurrence of the "patln"-byte pattern "pat"
 * that lies entirely within the first "strln" bytes of "str".
 * Returns the address where the match starts, or 0 when there is none.
 *
 * The portable version never reads outside str[0..strln-1].  The
 * historical loop could carry on one position past the last legal
 * start after a failed partial match, and did no length check at all
 * when strln < patln, so it could report a "match" that ran beyond
 * the end of the data.  A pattern length of zero or less is treated
 * as "no match".
 */
#if (vax && !lint)
asm(".align 1");
asm("_matchc: .word 0");
asm(" movq 4(ap),r0");
asm(" movq 12(ap),r2");
asm(" matchc r0,(r1),r2,(r3)");
asm(" beql 1f");
asm(" movl 4(ap),r3");
asm("1: subl3 4(ap),r3,r0");
asm(" ret");
#else
static char *
matchc( patln, pat, strln, str )
    int patln;
    char *pat;
    int strln;
    register char *str;
{
    register char *last;	/* last position a match may start at */
    register char *sp;
    register char *pp;
    register char *ep;
    register char pc;

    if (patln <= 0 || strln < patln)
        return 0;

    last = str + strln - patln;
    ep = pat + patln;
    pc = *pat++;

    for (; str <= last; str++) {
        if (*str != pc)
            continue;

        /* first byte matches; compare the remainder of the pattern */
        sp = str + 1; pp = pat;
        while (pp < ep && *sp == *pp)
            sp++, pp++;
        if (pp >= ep)
            return str;
    }
    return 0;
}
#endif
620:
621: /* */
622:
/*
 * Locate character "term" in the next "cnt" characters of "src".
 * If found, return its address, otherwise return 0.
 *
 * The portable version looks at no byte of "src" when "cnt" is zero
 * or negative (the historical loop always read the first byte before
 * testing the count).
 */
#if (vax && !lint)
asm(".align 1");
asm("_locc: .word 0");
asm(" movq 4(ap),r0");
asm(" locc 12(ap),r0,(r1)");
asm(" beql 1f");
asm(" movl r1,r0");
asm("1: ret");
#else
static char *
locc( cnt, src, term )
    register int cnt;
    register char *src;
    register char term;
{
    for (; cnt > 0; cnt--, src++)
        if (*src == term)
            return src;

    return (char *) 0;
}
#endif
647:
648: /* */
649:
#if !defined (BSD42) && !defined (bcopy)
/*
 * Fallback versions of the BSD byte-string routines for systems that
 * do not supply them.  The return types are spelled out as int, which
 * is what the historical implicit-int definitions meant; bcopy and
 * bzero return 0 and callers are expected to ignore the value.
 */

/*
 * bcmp - compare the first "length" bytes of b1 and b2.
 * Returns 0 when they are identical (or length <= 0), 1 otherwise.
 */
int bcmp (b1, b2, length)
register char *b1,
              *b2;
register int length;
{
    while (length-- > 0)
        if (*b1++ != *b2++)
            return 1;

    return 0;
}


/*
 * bcopy - copy "length" bytes from b1 to b2.
 * The regions may overlap: when the destination starts inside the
 * source the copy runs from the end backwards so that no source byte
 * is overwritten before it has been read.
 */
int bcopy (b1, b2, length)
register char *b1,
              *b2;
register int length;
{
    if (b2 > b1 && b2 < b1 + length) {
        b1 += length;
        b2 += length;
        while (length-- > 0)
            *--b2 = *--b1;
    } else {
        while (length-- > 0)
            *b2++ = *b1++;
    }

    return 0;
}


/*
 * bzero - set the first "length" bytes of b to zero.
 */
int bzero (b, length)
register char *b;
register int length;
{
    while (length-- > 0)
        *b++ = 0;

    return 0;
}
#endif /* not BSD42 or SYS5 */
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.