|
|
1.1 root 1: /* m_getfld.c - read/parse a message */
2:
3: #include "../h/mh.h"
4: #include <stdio.h>
5: #include "../zotnet/mts.h"
6: #include <ctype.h>
7:
8:
9: /* This module has a long and checkered history. First, it didn't burst
10: maildrops correctly because it considered two CTRL-A:s in a row to be
11: an inter-message delimiter. It really is four CTRL-A:s followed by a
12: newline. Unfortunately, MMDF will convert this delimiter *inside* a
13: message to a CTRL-B followed by three CTRL-A:s and a newline. This
14: caused the old version of m_getfld() to declare eom prematurely. The
15: fix was a lot slower than
16:
17: c == '\001' && peekc (iob) == '\001'
18:
19: but it worked, and to increase generality, UUCP style maildrops could
20: be parsed as well. Unfortunately the speed issue finally caught up with
21: us since this routine is at the very heart of MH.
22:
23: To speed things up considerably, the routine Eom() was made an auxiliary
24: function called by the macro eom(). Unless we are bursting a maildrop,
25: the eom() macro returns FALSE saying we aren't at the end of the
26: message.
27:
28: The next thing to do is to read the mtstailor file and initialize
29: delimiter[] and delimlen accordingly...
30:
31: After mhl was made a built-in in msh, m_getfld() worked just fine
32: (using m_unknown() at startup). Until one day: a message which was
33: the result of a bursting was shown. Then, since the burst boundaries
34: aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
35: Very sad. The solution: introduce m_eomsbr(). This hook gets called
36: after the end of each line (since testing for eom involves an fseek()).
37: This worked fine, until one day: a message with no body portion arrived.
38: Then the
39:
40: while (eom (c = Getc (iob), iob))
41: continue;
42:
43: loop caused m_getfld() to return FMTERR. So, that logic was changed to
44: check for (*eom_action) and act accordingly.
45:
46: This worked fine, until one day: someone didn't use four CTRL:A's as
47: their delimiters. So, the bullet got bit and we read mts.h and
48: continue to struggle on. It's not that bad though, since the only time
49: the code gets executed is when inc (or msh) calls it, and both of these
50: have already called mts_init().
51:
52: ------------------------
53: (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
54:
55: This routine was accounting for 60% of the cpu time used by most mh
56: programs. I spent a bit of time tuning and it now accounts for <10%
57: of the time used. Like any heavily tuned routine, it's a bit
58: complex and you want to be sure you understand everything that it's
59: doing before you start hacking on it. Let me try to emphasize
60: that: every line in this atrocity depends on every other line,
61: sometimes in subtle ways. You should understand it all, in detail,
62: before trying to change any part. If you do change it, test the
63: result thoroughly (I use a hand-constructed test file that exercises
64: all the ways a header name, header body, header continuation,
65: header-body separator, body line and body eom can align themselves
66: with respect to a buffer boundary). "Minor" bugs in this routine
67: result in garbaged or lost mail.
68:
69: If you hack on this and slow it down, I, my children and my
70: children's children will curse you.
71:
72: This routine gets used on three different types of files: normal,
73: single msg files, "packed" unix or mmdf mailboxs (when used by inc)
74: and packed, directoried bulletin board files (when used by msh).
75: The biggest impact of different file types is in "eom" testing. The
76: code has been carefully organized to test for eom at appropriate
77: times and at no other times (since the check is quite expensive).
78: I have tried to arrange things so that the eom check need only be
79: done on entry to this routine. Since an eom can only occur after a
80: newline, this is easy to manage for header fields. For the msg
81: body, we try to efficiently search the input buffer to see if it
82: contains the eom delimiter. If it does, we take up to the
83: delimiter, otherwise we take everything in the buffer. (The change
84: to the body eom/copy processing produced the most noticeable
85: performance difference, particularly for "inc" and "show".)
86:
87: There are three qualitatively different things this routine busts
88: out of a message: field names, field text and msg bodies. Field
89: names are typically short (~8 char) and the loop that extracts them
90: might terminate on a colon, newline or max width. I considered
91: using a Vax "scanc" to locate the end of the field followed by a
92: "bcopy" but the routine call overhead on a Vax is too large for this
93: to work on short names. If Berkeley ever makes "inline" part of the
94: C optimiser (so things like "scanc" turn into inline instructions) a
95: change here would be worthwhile.
96:
97: Field text is typically 60 - 100 characters so there's (barely)
98: a win in doing a routine call to something that does a "locc"
99: followed by a "bmove". About 30% of the fields have continuations
100: (usually the 822 "received:" lines) and each continuation generates
101: another routine call. "Inline" would be a big win here, as well.
102:
103: Messages, as of this writing, seem to come in two flavors: small
104: (~1K) and long (>2K). Most messages have 400 - 600 bytes of headers
105: so message bodies average at least a few hundred characters.
106: Assuming your system uses reasonably sized stdio buffers (1K or
107: more), this routine should be able to remove the body in large
108: (>500 byte) chunks. This makes the cost of a call to "bcopy"
109: small but there is a premium on checking for the eom in packed
110: maildrops. The eom pattern is always a simple string so we can
111: construct an efficient pattern matcher for it (e.g., a Vax "matchc"
112: instruction). Some thought went into recognizing the start of
113: an eom that has been split across two buffers.
114:
115: This routine wants to deal with large chunks of data so, rather
116: than "getc" into a local buffer, it uses stdio's buffer. If
117: you try to use it on a non-buffered file, you'll get what you
118: deserve. This routine "knows" that struct FILEs have a _ptr
119: and a _cnt to describe the current state of the buffer and
120: it knows that _filbuf ignores the _ptr & _cnt and simply fills
121: the buffer. If stdio on your system doesn't work this way, you
122: may have to make small changes in this routine.
123:
124: This routine also "knows" that an EOF indication on a stream is
125: "sticky" (i.e., you will keep getting EOF until you reposition the
126: stream). If your system doesn't work this way it is broken and you
127: should complain to the vendor. As a consequence of the sticky
128: EOF, this routine will never return any kind of EOF status when
129: there is data in "name" or "buf").
130: */
131:
132:
133: #define Getc(iob) getc(iob)
134: #define eom(c,iob) (msg_style != MS_DEFAULT && \
135: (((c) == *msg_delim && m_Eom(c,iob)) ||\
136: (eom_action && (*eom_action)(c))))
137:
138: static unsigned char *matchc();
139: static unsigned char *locc();
140:
141: static unsigned char **pat_map;
142:
143: int msg_count = 0; /* disgusting hack for "inc" so it can
144: * know how many characters were stuffed
145: * in the buffer on the last call (see
146: * comments in uip/scansbr.c) */
147:
148: int msg_style = MS_DEFAULT;
149: /*
150: * The "full" delimiter string for a packed maildrop consists
151: * of a newline followed by the actual delimiter. E.g., the
152: * full string for a Unix maildrop would be: "\n\nFrom ".
153: * "Fdelim" points to the start of the full string and is used
154: * in the BODY case of the main routine to search the buffer for
155: * a possible eom. Msg_delim points to the first character of
156: * the actual delim. string (i.e., fdelim+1). Edelim
157: * points to the 2nd character of actual delimiter string. It
158: * is used in m_Eom because the first character of the string
159: * has been read and matched before m_Eom is called.
160: */
161: char *msg_delim = "";
162: static unsigned char *fdelim;
163: static unsigned char *delimend;
164: static int fdelimlen;
165: static unsigned char *edelim;
166: static int edelimlen;
167:
168: static int (*eom_action) () = NULL;
169:
170: /* */
171:
172: m_getfld(state, name, buf, bufsz, iob)
173: int state;
174: int bufsz;
175: unsigned char *name, *buf;
176: register FILE *iob;
177: {
178: register unsigned char *cp;
179: register unsigned char *bp;
180: register int cnt;
181: register int c;
182: register int i;
183: register int j;
184: register unsigned char *ep;
185: register unsigned char *sp;
186:
187: if ((c = Getc(iob)) < 0) {
188: msg_count = 0;
189: *buf = 0;
190: return FILEEOF;
191: }
192: if (eom (c, iob)) {
193: if (! eom_action) {
194: /* flush null messages */
195: while ((c = Getc(iob)) >= 0 && eom (c, iob))
196: ;
197: if (c >= 0)
198: (void) ungetc(c, iob);
199: }
200: msg_count = 0;
201: *buf = 0;
202: return FILEEOF;
203: }
204:
205: switch (state) {
206: case FLDEOF:
207: case BODYEOF:
208: case FLD:
209: if (c == '\n' || c == '-') {
210: /* we hit the header/body separator */
211: while (c != '\n' && (c = Getc(iob)) >= 0)
212: ;
213:
214: if (c < 0 || (c = Getc(iob)) < 0 || eom (c, iob)) {
215: if (! eom_action) {
216: /* flush null messages */
217: while ((c = Getc(iob)) >= 0 && eom (c, iob))
218: ;
219: if (c >= 0)
220: (void) ungetc(c, iob);
221: }
222: msg_count = 0;
223: *buf = 0;
224: return FILEEOF;
225: }
226: state = BODY;
227: goto body;
228: }
229: /*
230: * get the name of this component. take characters up
231: * to a ':', a newline or NAMESZ-1 characters, whichever
232: * comes first.
233: */
234: cp = name; i = NAMESZ - 1;
235: for (;;) {
236: bp = sp = (unsigned char *) iob->_ptr - 1;
237: j = (cnt = iob->_cnt+1) < i ? cnt : i;
238: while ((c = *bp++) != ':' && c != '\n' && --j >= 0)
239: *cp++ = c;
240:
241: j = bp - sp;
242: if ((cnt -= j) <= 0) {
243: if (_filbuf(iob) == EOF) {
244: *cp = *buf = NULL;
245: advise (NULLCP, "eof encountered in field \"%s\"",
246: name);
247: return FMTERR;
248: }
249: } else {
250: iob->_ptr = (char *) bp + 1;
251: iob->_cnt = cnt - 1;
252: }
253: if (c == ':')
254: break;
255:
256: /*
257: * something went wrong. possibilities are:
258: * . hit a newline (error)
259: * . got more than namesz chars. (error)
260: * . hit the end of the buffer. (loop)
261: */
262: if (c == '\n') {
263: *cp = *buf = NULL;
264: advise (NULLCP, "eol encountered in field \"%s\"", name);
265: state = FMTERR;
266: goto finish;
267: }
268: if ((i -= j) <= 0) {
269: *cp = *buf = NULL;
270: advise (NULLCP, "field name \"%s\" exceeds %d bytes",
271: name, NAMESZ - 1);
272: state = LENERR;
273: goto finish;
274: }
275: }
276:
277: while (isspace (*--cp) && cp >= name)
278: ;
279: *++cp = NULL;
280: /* fall through */
281:
282: case FLDPLUS:
283: /*
284: * get (more of) the text of a field. take
285: * characters up to the end of this field (newline
286: * followed by non-blank) or bufsz-1 characters.
287: */
288: cp = buf; i = bufsz-1;
289: for (;;) {
290: cnt = iob->_cnt++; bp = (unsigned char *) --iob->_ptr;
291: c = cnt < i ? cnt : i;
292: while (ep = locc( c, bp, '\n' )) {
293: /*
294: * if we hit the end of this field, return.
295: */
296: if ((j = *++ep) != ' ' && j != '\t') {
297: j = ep - (unsigned char *) iob->_ptr;
298: (void) bcopy( iob->_ptr, cp, j);
299: iob->_ptr = (char *) ep; iob->_cnt -= j;
300: cp += j;
301: state = FLD;
302: goto finish;
303: }
304: c -= ep - bp; bp = ep;
305: }
306: /*
307: * end of input or dest buffer - copy what we've found.
308: */
309: c += bp - (unsigned char *) iob->_ptr;
310: (void) bcopy( iob->_ptr, cp, c);
311: i -= c; cp += c;
312: if (i <= 0) {
313: /* the dest buffer is full */
314: iob->_cnt -= c; iob->_ptr += c;
315: state = FLDPLUS;
316: break;
317: }
318: /*
319: * There's one character left in the input buffer.
320: * Copy it & fill the buffer. If the last char
321: * was a newline and the next char is not whitespace,
322: * this is the end of the field. Otherwise loop.
323: */
324: --i;
325: *cp++ = j = *(iob->_ptr + c);
326: c = _filbuf(iob);
327: if (j == '\n' && c != ' ' && c != '\t') {
328: if (c != EOF)
329: --iob->_ptr, ++iob->_cnt;
330: state = FLD;
331: break;
332: }
333: }
334: break;
335:
336: case BODY:
337: body:
338: /*
339: * get the message body up to bufsz characters or the
340: * end of the message. Sleazy hack: if bufsz is negative
341: * we assume that we were called to copy directly into
342: * the output buffer and we don't add an eos.
343: */
344: i = (bufsz < 0) ? -bufsz : bufsz-1;
345: bp = (unsigned char *) --iob->_ptr; cnt = ++iob->_cnt;
346: c = (cnt < i ? cnt : i);
347: if (msg_style != MS_DEFAULT && c > 1) {
348: /*
349: * packed maildrop - only take up to the (possible)
350: * start of the next message. This "matchc" should
351: * probably be a Boyer-Moore matcher for non-vaxen,
352: * particularly since we have the alignment table
353: * all built for the end-of-buffer test (next).
354: * But our vax timings indicate that the "matchc"
355: * instruction is 50% faster than a carefully coded
356: * B.M. matcher for most strings. (So much for elegant
357: * algorithms vs. brute force.) Since I (currently)
358: * run MH on a vax, we use the matchc instruction. --vj
359: */
360: if (ep = matchc( fdelimlen, fdelim, c, bp ) )
361: c = ep - bp + 1;
362: else {
363: /*
364: * There's no delim in the buffer but there may be
365: * a partial one at the end. If so, we want to leave
366: * it so the "eom" check on the next call picks it up.
367: * Use a modified Boyer-Moore matcher to make this
368: * check relatively cheap. The first "if" figures
369: * out what position in the pattern matches the last
370: * character in the buffer. The inner "while" matches
371: * the pattern against the buffer, backwards starting
372: * at that position. Note that unless the buffer
373: * ends with one of the characters in the pattern
374: * (excluding the first and last), we do only one test.
375: */
376: ep = bp + c - 1;
377: if (sp = pat_map[*ep]) {
378: do {
379: cp = sp;
380: while (*--ep == *--cp)
381: ;
382: if (cp < fdelim) {
383: if (ep >= bp)
384: /*
385: * ep < bp means that all the buffer
386: * contains is a prefix of delim.
387: * If this prefix is really a delim, the
388: * m_eom call at entry should have found
389: * it. Thus it's not a delim and we can
390: * take all of it.
391: */
392: c = (ep - bp) + 2;
393: break;
394: }
395: /* try matching one less char of delim string */
396: ep = bp + c - 1;
397: } while (--sp > fdelim);
398: }
399: }
400: }
401: (void) bcopy( bp, buf, c );
402: iob->_cnt -= c;
403: iob->_ptr += c;
404: if (bufsz < 0) {
405: msg_count = c;
406: return (state);
407: }
408: cp = buf + c;
409: break;
410:
411: default:
412: adios (NULLCP, "m_getfld() called with bogus state of %d", state);
413: }
414: finish:;
415: *cp = NULL;
416: msg_count = cp - buf;
417: return (state);
418: }
419:
420: /* */
421:
422: #ifdef RPATHS
423: static char unixbuf[BUFSIZ] = "";
424: #endif RPATHS
425:
426: void
427: m_unknown(iob)
428: register FILE *iob;
429: {
430: register int c;
431: register long pos;
432: char text[10];
433: register char *cp;
434: register char *delimstr;
435:
436: msg_style = MS_UNKNOWN;
437:
438: /* Figure out what the message delimitter string is for this
439: * maildrop. (This used to be part of m_Eom but I didn't like
440: * the idea of an "if" statement that could only succeed on the
441: * first call to m_Eom getting executed on each call, i.e., at
442: * every newline in the message).
443: *
444: * If the first line of the maildrop is a Unix "from" line, we say the
445: * style is UUCP and eat the rest of the line. Otherwise we say the style
446: * is MMDF & look for the delimiter string specified when MH was built
447: * (or from the mtstailor file).
448: */
449: pos = ftell (iob);
450: if (fread (text, sizeof *text, 5, iob) == 5
451: && strncmp (text, "From ", 5) == 0) {
452: msg_style = MS_UUCP;
453: delimstr = "\nFrom ";
454: #ifndef RPATHS
455: while ((c = getc (iob)) != '\n' && c >= 0)
456: ;
457: #else RPATHS
458: cp = unixbuf;
459: while ((c = getc (iob)) != '\n')
460: *cp++ = c;
461: *cp = NULL;
462: #endif RPATHS
463: } else {
464: /* not a Unix style maildrop */
465: (void) fseek (iob, pos, 0);
466: if (mmdlm2 == NULLCP || *mmdlm2 == NULL)
467: mmdlm2 = "\001\001\001\001\n";
468: delimstr = mmdlm2;
469: msg_style = MS_MMDF;
470: }
471: c = strlen (delimstr);
472: fdelim = (unsigned char *)malloc((unsigned)c + 3);
473: *fdelim++ = '\0';
474: *fdelim = '\n';
475: msg_delim = (char *)fdelim+1;
476: edelim = (unsigned char *)msg_delim+1;
477: fdelimlen = c + 1;
478: edelimlen = c - 1;
479: (void)strcpy(msg_delim, delimstr);
480: delimend = (unsigned char *)msg_delim + edelimlen;
481: if (edelimlen <= 1)
482: adios (NULLCP, "maildrop delimiter must be at least 2 bytes");
483: /*
484: * build a Boyer-Moore end-position map for the matcher in m_getfld.
485: * N.B. - we don't match just the first char (since it's the newline
486: * separator) or the last char (since the matchc would have found it
487: * if it was a real delim).
488: */
489: pat_map = (unsigned char **) calloc (256, sizeof (unsigned char *));
490:
491: for (cp = (char *)fdelim + 1; cp < (char *)delimend; cp++ )
492: pat_map[*cp] = (unsigned char *)cp;
493:
494: if (msg_style == MS_MMDF) {
495: /* flush extra msg hdrs */
496: while ((c = Getc(iob)) >= 0 && eom (c, iob))
497: ;
498: if (c >= 0)
499: (void) ungetc(c, iob);
500: }
501: }
502:
503: void
504: m_eomsbr(action)
505: int (*action)();
506: {
507: if (eom_action = action) {
508: msg_style = MS_MSH;
509: *msg_delim = 0;
510: fdelimlen = 1;
511: delimend = fdelim;
512: } else {
513: msg_style = MS_MMDF;
514: msg_delim = (char *)fdelim + 1;
515: fdelimlen = strlen(fdelim);
516: delimend = (unsigned char *)(msg_delim + edelimlen);
517: }
518: }
519:
520: /* */
521:
522: /* test for msg delimiter string */
523:
524: int
525: m_Eom(c, iob)
526: register int c;
527: register FILE *iob;
528: {
529: register long pos = 0L;
530: register int i;
531: char text[10];
532: #ifdef RPATHS
533: register char *cp;
534: #endif RPATHS
535:
536: pos = ftell (iob);
537: if ((i = fread (text, sizeof *text, edelimlen, iob)) != edelimlen
538: || strncmp (text, edelim, edelimlen)) {
539: if (i == 0 && msg_style == MS_UUCP)
540: /* the final newline in the (brain damaged) unix-format
541: * maildrop is part of the delimitter - delete it.
542: */
543: return 1;
544:
545: (void) fseek (iob, pos, 0);
546: return 0;
547: }
548:
549: if (msg_style == MS_UUCP) {
550: #ifndef RPATHS
551: while ((c = getc (iob)) != '\n')
552: if (c < 0)
553: break;
554: #else RPATHS
555: cp = unixbuf;
556: while ((c = getc (iob)) != '\n' && c >= 0)
557: *cp++ = c;
558: *cp = NULL;
559: #endif RPATHS
560: }
561:
562: return 1;
563: }
564:
565: /* */
566:
#ifdef RPATHS
/*
 * unixline - turn the saved UUCP "From " line (unixbuf) into a one-line
 * string and return it in a static buffer that the next call overwrites.
 *
 * If the line contains a blank and, after it, the text "remote from ",
 * what follows that text is emitted first with a '!' appended and the
 * line is cut at "remote from ".  The last 25 characters of what remains
 * are then dropped, provided the cut does not fall before the first blank
 * (presumably this removes the 24-character date and the blank in front of
 * it - confirm).  The result is that prefix, the remaining text and a
 * newline.  unixbuf is emptied before returning.
 *
 * Changes relative to the historical code: '\0' instead of NULL for
 * character cells, no pointer is formed 25 bytes before a short line, and
 * the final copy is limited to the space left in the result buffer.
 */
char *
unixline()
{
    register char *cp,
                  *dp,
                  *pp;
    static char unixfrom[BUFSIZ];
    int room;

    pp = unixfrom;
    if (cp = dp = index (unixbuf, ' ')) {
        while (cp = index (cp + 1, 'r'))
            if (strncmp (cp, "remote from ", 12) == 0) {
                *cp = '\0';
                (void) sprintf (pp, "%s!", cp + 12);
                pp += strlen (pp);
                break;
            }
        if (cp == NULL)
            cp = unixbuf + strlen (unixbuf);
        /* same test as "(cp -= 25) >= dp", without stepping outside
         * the array first */
        if (cp - dp >= 25)
            cp[-25] = '\0';
    }

    /* leave room for the newline and the terminating NUL */
    room = (int) sizeof unixfrom - (int) (pp - unixfrom) - 2;
    (void) sprintf (pp, "%.*s\n", room, unixbuf);
    unixbuf[0] = '\0';
    return unixfrom;
}
#endif /* RPATHS */
596:
597: /* */
598:
/*
 * matchc - find the first occurrence of the patln-byte pattern pat in the
 * strln bytes starting at str.  Returns the address of the start of the
 * match, or 0 if the pattern does not occur entirely within those strln
 * bytes (or if patln is not positive).
 *
 * On a VAX (outside lint) the routine is hand coded around the "matchc"
 * instruction; everywhere else the C version below is used.
 *
 * The C version differs from the historical one in two ways:
 *   - a candidate whose bytes all match except the last one is no longer
 *     reported as a match;
 *   - no byte beyond str + strln is ever examined, so a pattern that only
 *     partly fits at the end of the buffer is not found.
 */
#if (vax && !lint)
asm(".align 1");
asm("_matchc: .word 0");
asm(" movq 4(ap),r0");
asm(" movq 12(ap),r2");
asm(" matchc r0,(r1),r2,(r3)");
asm(" beql 1f");
asm(" movl 4(ap),r3");
asm("1: subl3 4(ap),r3,r0");
asm(" ret");
#else
static unsigned char *
matchc( patln, pat, strln, str )
int patln;
char *pat;
int strln;
register char *str;
{
    register char *last;        /* last address a full match can start at */
    register int k;

    if (patln <= 0 || strln < patln)
        return 0;

    last = str + (strln - patln);
    for (; str <= last; str++) {
        if (*str != *pat)
            continue;
        /* first byte agrees: check the remaining patln-1 bytes */
        for (k = 1; k < patln && str[k] == pat[k]; k++)
            ;
        if (k == patln)
            return ((unsigned char *) str);
    }
    return 0;
}
#endif
636:
637: /* */
638:
/*
 * Locate character "term" in the next "cnt" characters of "src".
 * If found, return its address, otherwise return 0.
 *
 * On a VAX (outside lint) the routine is hand coded around the "locc"
 * instruction; everywhere else the C version below is used.  Unlike the
 * historical C version it does not look at *src when cnt is zero or
 * negative.
 */
#if (vax && !lint)
asm(".align 1");
asm("_locc: .word 0");
asm(" movq 4(ap),r0");
asm(" locc 12(ap),r0,(r1)");
asm(" beql 1f");
asm(" movl r1,r0");
asm("1: ret");
#else
static unsigned char *
locc( cnt, src, term )
register int cnt;
register unsigned char *src;
register unsigned char term;
{
    for (; cnt > 0; --cnt, ++src)
        if (*src == term)
            return src;

    return (unsigned char *)0;
}
#endif
663:
664: /* */
665:
#if !defined (BSD42) && !defined (bcopy)
/*
 * Stand-ins for the BSD byte-string routines, compiled only when BSD42 is
 * not defined and bcopy is not already a macro.
 */

/*
 * bcmp - compare length bytes at b1 and b2.
 * Returns 0 if they are all equal (or length <= 0), 1 otherwise.
 */
int
bcmp(b1, b2, length)
register char *b1, *b2;
register int length;
{
    for (; length > 0; length--)
        if (*b1++ != *b2++)
            return 1;

    return 0;
}


/*
 * bcopy - copy length bytes from b1 to b2.  The areas may overlap: when
 * the destination starts inside the source the copy runs backwards so
 * that no byte is overwritten before it has been read (the historical
 * version always copied forwards and smeared such copies).
 * Always returns 0.
 */
int
bcopy(b1, b2, length)
register char *b1, *b2;
register int length;
{
    if (length > 0 && b2 > b1 && b2 < b1 + length) {
        b1 += length;
        b2 += length;
        while (length-- > 0)
            *--b2 = *--b1;
    } else {
        while (length-- > 0)
            *b2++ = *b1++;
    }
    return 0;
}


/*
 * bzero - set length bytes starting at b to zero.  Always returns 0.
 * (The fill value is the character '\0'; the historical code stored the
 * pointer constant NULL.)
 */
int
bzero(b, length)
register char *b;
register int length;
{
    while (length-- > 0)
        *b++ = '\0';
    return 0;
}
#endif /* not BSD42 or SYS5 */
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.