|
|
1.1 root 1: /* Prepare Tex index dribble output into an actual index.
2: Copyright (C) Richard M. Stallman 1984
3:
4: Permission is granted to anyone to make or distribute
5: verbatim copies of this program
6: provided that the copyright notice and this permission notice are preserved;
7: and provided that the recipient is not asked to waive or limit his right to
8: redistribute copies as permitted by this permission notice;
9: and provided that anyone possessing a machine-executable copy
10: is granted access to copy the source code, in machine-readable form,
11: in some reasonable manner.
12:
13: Permission is granted to distribute derived works or enhanced versions of
14: this program under the above conditions with the additional condition
15: that the entire derivative or enhanced work
16: must be covered by a permission notice identical to this one.
17:
18: Anything distributed as part of a package containing portions derived
19: from this program, which cannot in current practice perform its function
20: usefully in the absence of what was derived directly from this program,
21: is to be considered as forming, together with the latter,
22: a single work derived from this program,
23: which must be entirely covered by a permission notice identical to this one
24: in order for distribution of the package to be permitted.
25:
26: In other words, you are welcome to use, share and improve this program.
27: You are forbidden to forbid anyone else to use, share and improve
28: what you give them. Help stamp out software-hoarding! */
29:
30: #include <stdio.h>
31: #include <sys/file.h>
32:
33: #ifndef L_XTND
34: #define L_XTND 2
35: #endif
36:
37: /* When sorting in core, this structure describes one line
38: and the position and length of its first keyfield. */
39:
40: struct lineinfo
41: {
42: char *text; /* The actual text of the line */
43: union
44: { /* The start of the key (for textual comparison) */
45: char *text;
46: long number; /* or the numeric value (for numeric comparison) */
47: } key;
48: long keylen; /* Length of key field */
49: };
50:
51: /* This structure describes a field to use as a sort key */
52:
53: struct keyfield
54: {
55: int startwords; /* # words to skip */
56: int startchars; /* and # additional chars to skip, to start of field */
57: int endwords; /* similar, from beg (or end) of line, to find end of field */
58: int endchars;
59: char ignore_blanks; /* Ignore spaces and tabs within the field */
60: char fold_case; /* Convert upper case to lower before comparing */
61: char reverse; /* Compare in reverse order */
62: char numeric; /* Parse text as an integer and compare the integers */
63: char positional; /* Sort according to position within the file */
64: char braced; /* Count balanced-braced groupings as fields */
65: };
66:
67: /* Vector of keyfields to use */
68:
69: struct keyfield keyfields[3];
70:
71: /* Number of keyfields stored in that vector. */
72:
73: int num_keyfields = 3;
74:
75: /* Vector of input file names, terminated with a zero (null pointer) */
76:
77: char **infiles;
78:
79: /* Vector of corresponding output file names, or zero meaning default it */
80:
81: char **outfiles;
82:
83: /* Length of `infiles' */
84:
85: int num_infiles;
86:
87: /* Pointer to the array of pointers to lines being sorted */
88:
89: char **linearray;
90:
91: /* The allocated length of `linearray'. */
92:
93: long lines;
94:
95: /* Directory to use for temporary files */
96:
97: char *tempdir;
98:
99: /* Start of filename to use for temporary files. It starts with a slash. */
100:
101: char *tempbase;
102:
103: /* Number of last temporary file. */
104:
105: int tempcount;
106:
107: /* Number of last temporary file already deleted.
108: Temporary files are deleted by `flush_tempfiles' in order of creation. */
109:
110: int last_deleted_tempcount;
111:
112: /* During in-core sort, this points to the base of the data block
113: which contains all the lines of data. */
114:
115: char *text_base;
116:
117: /* Additional command switches */
118:
119: int keep_tempfiles; /* Nonzero means do not delete tempfiles -- for debugging */
120:
121: /* Forward declarations of functions in this file */
122:
123: void decode_command ();
124: void sort_in_core ();
125: void sort_offline ();
126: char **parsefile ();
127: char *find_field ();
128: char *find_pos ();
129: char *find_braced_pos ();
130: char *find_braced_end ();
131: void writelines ();
132: int compare_full ();
133: long readline ();
134: int merge_files ();
135: int merge_direct ();
136: char *concat ();
137: char *maketempname ();
138: void flush_tempfiles ();
139: char *tempcopy ();
140:
141: extern char *mktemp ();
142:
143: #define MAX_IN_CORE_SORT 500000
144:
145: int
146: main (argc, argv)
147: int argc;
148: char **argv;
149: {
150: int i;
151:
152: tempcount = 0;
153: last_deleted_tempcount = 0;
154:
155: /* Describe the kind of sorting to do. */
156: /* The first keyfield uses the first braced field and folds case */
157: keyfields[0].braced = 1;
158: keyfields[0].fold_case = 1;
159: keyfields[0].endwords = -1;
160: keyfields[0].endchars = -1;
161: /* The second keyfield uses the second braced field, numerically */
162: keyfields[1].braced = 1;
163: keyfields[1].numeric = 1;
164: keyfields[1].startwords = 1;
165: keyfields[1].endwords = -1;
166: keyfields[1].endchars = -1;
167: /* The third keyfield (which is ignored while discarding duplicates)
168: compares the whole line */
169: keyfields[2].endwords = -1;
170: keyfields[2].endchars = -1;
171:
172: decode_command (argc, argv);
173:
174: tempbase = mktemp (concat ("/txiXXXXXX", "", ""));
175:
176: /* Process input files completely, one by one. */
177:
178: for (i = 0; i < num_infiles; i++)
179: {
180: int desc;
181: long ptr;
182: char *outfile;
183: char *p;
184:
185: desc = open (infiles[i], 0, 0);
186: if (desc < 0) pfatal_with_name (infiles[i]);
187: lseek (desc, 0, L_XTND);
188: ptr = tell (desc);
189: close (desc);
190:
191: outfile = outfiles[i];
192: if (!outfile)
193: {
194: outfile = concat (infiles[i], "s", "");
195: }
196:
197: if (ptr < MAX_IN_CORE_SORT)
198: /* Sort a small amount of data */
199: sort_in_core (infiles[i], ptr, outfile);
200: else
201: sort_offline (infiles[i], ptr, outfile);
202: }
203:
204: flush_tempfiles (tempcount);
205: exit (0);
206: }
207:
208: /* This page decodes the command line arguments to set the parameter variables
209: and set up the vector of keyfields and the vector of input files */
210:
211: void
212: decode_command (argc, argv)
213: int argc;
214: char **argv;
215: {
216: int i;
217: char **ip;
218: char **op;
219:
220: /* Store default values into parameter variables */
221:
222: tempdir = "/tmp";
223: keep_tempfiles = 0;
224:
225: /* Allocate argc input files, which must be enough. */
226:
227: infiles = (char **) xmalloc (argc * sizeof (char *));
228: outfiles = (char **) xmalloc (argc * sizeof (char *));
229: ip = infiles;
230: op = outfiles;
231:
232: /* First find all switches that control the default kind-of-sort */
233:
234: for (i = 1; i < argc; i++)
235: {
236: int tem = classify_arg (argv[i]);
237: char c;
238: char *p;
239:
240: if (tem <= 0)
241: {
242: *ip++ = argv[i];
243: *op++ = 0;
244: continue;
245: }
246: if (tem > 1)
247: {
248: if (i + 1 == argc)
249: fatal ("switch %s given with no argument following it", argv[i]);
250: else if (!strcmp (argv[i], "-T"))
251: tempdir = argv[i + 1];
252: else if (!strcmp (argv[i], "-o"))
253: *(op - 1) = argv[i + 1];
254: i += tem - 1;
255: continue;
256: }
257:
258: p = &argv[i][1];
259: while (c = *p++)
260: switch (c)
261: {
262: case 'k':
263: keep_tempfiles = 1;
264: break;
265:
266: default:
267: fatal ("invalid command switch %c", c);
268: }
269: switchdone: ;
270: }
271:
272: /* Record number of keyfields, terminate list of filenames */
273:
274: num_infiles = ip - infiles;
275: *ip = 0;
276: }
277:
278: /* Return 0 for an argument that is not a switch;
279: for a switch, return 1 plus the number of following arguments that the switch swallows.
280: */
281:
282: int
283: classify_arg (arg)
284: char *arg;
285: {
286: if (!strcmp (arg, "-T") || !strcmp (arg, "-o"))
287: return 2;
288: if (arg[0] == '-')
289: return 1;
290: return 0;
291: }
292:
293: /* Create a name for a temporary file */
294:
295: char *
296: maketempname (count)
297: int count;
298: {
299: char tempsuffix[10];
300: sprintf (tempsuffix, "%d", count);
301: return concat (tempdir, tempbase, tempsuffix);
302: }
303:
304: /* Delete all temporary files up to the specified count */
305:
306: void
307: flush_tempfiles (to_count)
308: int to_count;
309: {
310: if (keep_tempfiles) return;
311: while (last_deleted_tempcount < to_count)
312: unlink (maketempname (++last_deleted_tempcount));
313: }
314:
315: /* Copy an input file into a temporary file, and return the temporary file name */
316:
317: #define BUFSIZE 1024
318:
319: char *
320: tempcopy (idesc)
321: int idesc;
322: {
323: char *outfile = maketempname (++tempcount);
324: int odesc;
325: char buffer[BUFSIZE];
326:
327: odesc = open (outfile, O_WRONLY | O_CREAT, 0666);
328:
329: if (odesc < 0) pfatal_with_name (outfile);
330:
331: while (1)
332: {
333: int nread = read (idesc, buffer, BUFSIZE);
334: write (odesc, buffer, nread);
335: if (!nread) break;
336: }
337:
338: close (odesc);
339:
340: return outfile;
341: }
342:
343: /* Compare two lines, provided as pointers to pointers to text,
344: according to the specified set of keyfields */
345:
346: int
347: compare_full (line1, line2)
348: char **line1, **line2;
349: {
350: int i;
351:
352: /* Compare using the first keyfield;
353: if that does not distinguish the lines, try the second keyfield; and so on. */
354:
355: for (i = 0; i < num_keyfields; i++)
356: {
357: long length1, length2;
358: char *start1 = find_field (&keyfields[i], *line1, &length1);
359: char *start2 = find_field (&keyfields[i], *line2, &length2);
360: int tem = compare_field (&keyfields[i], start1, length1, *line1 - text_base,
361: start2, length2, *line2 - text_base);
362: if (tem)
363: {
364: if (keyfields[i].reverse)
365: return - tem;
366: return tem;
367: }
368: }
369:
370: return 0; /* Lines match exactly */
371: }
372:
373: /* Compare two lines described by structures
374: in which the first keyfield is identified in advance.
375: For positional sorting, assumes that the order of the lines in core
376: reflects their nominal order. */
377:
378: int
379: compare_prepared (line1, line2)
380: struct lineinfo *line1, *line2;
381: {
382: int i;
383: int tem;
384: char *text1, *text2;
385:
386: /* Compare using the first keyfield, which has been found for us already */
387: if (keyfields->positional)
388: {
389: if (line1->text - text_base > line2->text - text_base)
390: tem = 1;
391: else
392: tem = -1;
393: }
394: else if (keyfields->numeric)
395: tem = line1->key.number - line2->key.number;
396: else
397: tem = compare_field (keyfields, line1->key.text, line1->keylen, 0, line2->key, line2->keylen, 0);
398: if (tem)
399: {
400: if (keyfields->reverse)
401: return - tem;
402: return tem;
403: }
404:
405: text1 = line1->text;
406: text2 = line2->text;
407:
408: /* Compare using the second keyfield;
409: if that does not distinguish the lines, try the third keyfield; and so on. */
410:
411: for (i = 1; i < num_keyfields; i++)
412: {
413: long length1, length2;
414: char *start1 = find_field (&keyfields[i], text1, &length1);
415: char *start2 = find_field (&keyfields[i], text2, &length2);
416: int tem = compare_field (&keyfields[i], start1, length1, text1 - text_base,
417: start2, length2, text2 - text_base);
418: if (tem)
419: {
420: if (keyfields[i].reverse)
421: return - tem;
422: return tem;
423: }
424: }
425:
426: return 0; /* Lines match exactly */
427: }
428:
429: /* Like compare_full but more general.
430: You can pass any strings, and you can say how many keyfields to use.
431: `pos1' and `pos2' should indicate the nominal positional ordering of
432: the two lines in the input. */
433:
434: int
435: compare_general (str1, str2, pos1, pos2, use_keyfields)
436: char *str1, *str2;
437: long pos1, pos2;
438: int use_keyfields;
439: {
440: int i;
441:
442: /* Compare using the first keyfield;
443: if that does not distinguish the lines, try the second keyfield; and so on. */
444:
445: for (i = 0; i < use_keyfields; i++)
446: {
447: long length1, length2;
448: char *start1 = find_field (&keyfields[i], str1, &length1);
449: char *start2 = find_field (&keyfields[i], str2, &length2);
450: int tem = compare_field (&keyfields[i], start1, length1, pos1, start2, length2, pos2);
451: if (tem)
452: {
453: if (keyfields[i].reverse)
454: return - tem;
455: return tem;
456: }
457: }
458:
459: return 0; /* Lines match exactly */
460: }
461:
462: /* Find the start and length of a field in `str' according to `keyfield'.
463: A pointer to the starting character is returned, and the length
464: is stored into the int that `lengthptr' points to. */
465:
466: char *
467: find_field (keyfield, str, lengthptr)
468: struct keyfield *keyfield;
469: char *str;
470: long *lengthptr;
471: {
472: char *start;
473: char *end;
474: char *(*fun) ();
475:
476: if (keyfield->braced) fun = find_braced_pos;
477: else fun = find_pos;
478:
479: start = ( *fun )(str, keyfield->startwords, keyfield->startchars,
480: keyfield->ignore_blanks);
481: if (keyfield->endwords < 0)
482: {
483: if (keyfield->braced)
484: end = find_braced_end (start);
485: else
486: {
487: end = start;
488: while (*end && *end != '\n') end++;
489: }
490: }
491: else
492: {
493: end = ( *fun )(str, keyfield->endwords, keyfield->endchars, 0);
494: if (end - str < start - str) end = start;
495: }
496: *lengthptr = end - start;
497: return start;
498: }
499:
500: /* Find a pointer to a specified place within `str',
501: skipping (from the beginning) `words' words and then `chars' chars.
502: If `ignore_blanks' is nonzero, we skip all blanks
503: after finding the specified word. */
504:
505: char *
506: find_pos (str, words, chars, ignore_blanks)
507: char *str;
508: int words, chars;
509: int ignore_blanks;
510: {
511: int i;
512: char *p = str;
513:
514: for (i = 0; i < words; i++)
515: {
516: char c;
517: /* Find next bunch of nonblanks and skip them. */
518: while ((c = *p) == ' ' || c == '\t') p++;
519: while ((c = *p) && c != '\n' && !(c == ' ' || c == '\t')) p++;
520: if (!*p || *p == '\n') return p;
521: }
522:
523: while (*p == ' ' || *p == '\t') p++;
524:
525: for (i = 0; i < chars; i++)
526: {
527: if (!*p || *p == '\n') break;
528: p++;
529: }
530: return p;
531: }
532:
533: /* Like find_pos but assumes that each field is surrounded by braces
534: and that braces within fields are balanced. */
535:
536: char *
537: find_braced_pos (str, words, chars, ignore_blanks)
538: char *str;
539: int words, chars;
540: int ignore_blanks;
541: {
542: int i;
543: int bracelevel;
544: char *p = str;
545: char c;
546:
547: for (i = 0; i < words; i++)
548: {
549: bracelevel = 1;
550: while ((c = *p++) != '{' && c != '\n' && c);
551: if (c != '{')
552: return p - 1;
553: while (bracelevel)
554: {
555: c = *p++;
556: if (c == '{') bracelevel++;
557: if (c == '}') bracelevel--;
558: if (c == '\\') c = *p++; /* \ quotes braces and \ */
559: if (c == 0 || c == '\n') return p-1;
560: }
561: }
562:
563: while ((c = *p++) != '{' && c != '\n' && c);
564:
565: if (c != '{')
566: return p-1;
567:
568: if (ignore_blanks)
569: while ((c = *p) == ' ' || c == '\t') p++;
570:
571: for (i = 0; i < chars; i++)
572: {
573: if (!*p || *p == '\n') break;
574: p++;
575: }
576: return p;
577: }
578:
579: /* Find the end of the balanced-brace field which starts at `str'.
580: The position returned is just before the closing brace. */
581:
582: char *
583: find_braced_end (str)
584: char *str;
585: {
586: int bracelevel;
587: char *p = str;
588: char c;
589:
590: bracelevel = 1;
591: while (bracelevel)
592: {
593: c = *p++;
594: if (c == '{') bracelevel++;
595: if (c == '}') bracelevel--;
596: if (c == '\\') c = *p++;
597: if (c == 0 || c == '\n') return p-1;
598: }
599: return p - 1;
600: }
601:
602: /* Compare two fields (each specified as a start pointer and a character count)
603: according to `keyfield'. The sign of the value reports the relation between the fields */
604:
605: int
606: compare_field (keyfield, start1, length1, pos1, start2, length2, pos2)
607: struct keyfield *keyfield;
608: char *start1;
609: long length1;
610: long pos1;
611: char *start2;
612: long length2;
613: long pos2;
614: {
615: if (keyfields->positional)
616: {
617: if (pos1 > pos2)
618: return 1;
619: else
620: return -1;
621: }
622: if (keyfield->numeric)
623: {
624: long value = atol (start1) - atol (start2);
625: if (value > 0) return 1;
626: if (value < 0) return -1;
627: return 0;
628: }
629: else
630: {
631: char *p1 = start1;
632: char *p2 = start2;
633: char *e1 = start1 + length1;
634: char *e2 = start2 + length2;
635:
636: int fold_case = keyfield->fold_case;
637:
638: while (1)
639: {
640: char c1, c2;
641:
642: if (p1 == e1) c1 = 0;
643: else c1 = *p1++;
644: if (p2 == e2) c2 = 0;
645: else c2 = *p2++;
646:
647: /* Ignore case of both if desired */
648:
649: if (fold_case)
650: {
651: if (c1 >= 'A' && c1 <= 'Z') c1 = c1 + 040;
652: if (c2 >= 'A' && c2 <= 'Z') c2 = c2 + 040;
653: }
654:
655: /* Actually compare */
656:
657: if (c1 != c2) return c1 - c2;
658: if (!c1) break;
659: }
660: return 0;
661: }
662: }
663:
664: /* A `struct linebuffer' is a structure which holds a line of text.
665: `readline' reads a line from a stream into a linebuffer
666: and works regardless of the length of the line. */
667:
668: struct linebuffer
669: {
670: long size;
671: char *buffer;
672: };
673:
674: /* Initialize a linebuffer for use */
675:
676: void
677: initbuffer (linebuffer)
678: struct linebuffer *linebuffer;
679: {
680: linebuffer->size = 200;
681: linebuffer->buffer = (char *) xmalloc (200);
682: }
683:
684: /* Read a line of text from `stream' into `linebuffer'.
685: Return the length of the line. */
686:
687: long
688: readline (linebuffer, stream)
689: struct linebuffer *linebuffer;
690: FILE *stream;
691: {
692: char *buffer = linebuffer->buffer;
693: char *p = linebuffer->buffer;
694: char *end = p + linebuffer->size;
695:
696: while (1)
697: {
698: int c = getc (stream);
699: if (p == end)
700: {
701: buffer = (char *) xrealloc (buffer, linebuffer->size *= 2);
702: p += buffer - linebuffer->buffer;
703: end += buffer - linebuffer->buffer;
704: linebuffer->buffer = buffer;
705: }
706: if (c < 0 || c == '\n')
707: {
708: *p = 0;
709: break;
710: }
711: *p++ = c;
712: }
713:
714: return p - buffer;
715: }
716:
717: /* Sort the input files together when they are too big to sort in core */
718:
719: void
720: sort_offline (infile, nfiles, total, outfile)
721: char *infile;
722: long total;
723: char *outfile;
724: {
725: int ntemps = 2 * (total + MAX_IN_CORE_SORT - 1) / MAX_IN_CORE_SORT; /* More than enough */
726: char **tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
727: FILE *istream = fopen (infile, "r");
728: int i;
729: struct linebuffer lb;
730: long linelength;
731:
732: initbuffer (&lb);
733:
734: /* Read in one line of input data. */
735:
736: linelength = readline (&lb, istream);
737:
738: /* Split up the input into `ntemps' temporary files, or maybe fewer,
739: and put the new files' names into `tempfiles' */
740:
741: for (i = 0; i < ntemps; i++)
742: {
743: char *outname = maketempname (++tempcount);
744: FILE *ostream = fopen (outname, "w");
745: long tempsize = 0;
746:
747: if (!ostream) pfatal_with_name (outname);
748: tempfiles[i] = outname;
749:
750: /* Copy lines into this temp file as long as it does not make file "too big"
751: or until there are no more lines. */
752:
753: while (tempsize + linelength + 1 <= MAX_IN_CORE_SORT)
754: {
755: tempsize += linelength + 1;
756: fputs (lb.buffer, ostream);
757: putc ('\n', ostream);
758:
759: /* Read another line of input data. */
760:
761: linelength = readline (&lb, istream);
762: if (!linelength && feof (istream)) break;
763: }
764: fclose (ostream);
765: if (feof (istream)) break;
766: }
767:
768: free (lb.buffer);
769:
770: /* Record number of temp files we actually needed. */
771:
772: ntemps = i;
773:
774: /* Sort each tempfile into another tempfile.
775: Delete the first set of tempfiles and put the names of the second into `tempfiles' */
776:
777: for (i = 0; i < ntemps; i++)
778: {
779: char *newtemp = maketempname (++tempcount);
780: sort_in_core (&tempfiles[i], 1, MAX_IN_CORE_SORT, newtemp);
781: if (!keep_tempfiles)
782: unlink (tempfiles[i]);
783: tempfiles[i] = newtemp;
784: }
785:
786: /* Merge the tempfiles together and indexify */
787:
788: merge_files (tempfiles, ntemps, outfile);
789: }
790:
791: /* Sort `infile', whose size is `total',
792: assuming that is small enough to be done in-core,
793: then indexify it and send the output to `outfile' (or to stdout). */
794:
795: void
796: sort_in_core (infile, total, outfile)
797: char *infile;
798: long total;
799: char *outfile;
800: {
801: char **nextline;
802: char *data = (char *) xmalloc (total + 1);
803: char *file_data;
804: long file_size;
805: int i;
806: FILE *ostream = stdout;
807: struct lineinfo *lineinfo;
808:
809: /* Read the contents of the file into the moby array `data' */
810:
811: int desc = open (infile, 0, 0);
812:
813: if (desc < 0)
814: fatal ("failure reopening %s", infile);
815: file_size = read (desc, data, total);
816: file_data = data;
817: data[file_size] = 0;
818:
819: close (desc);
820:
821: /* Sort routines want to know this address */
822:
823: text_base = data;
824:
825: /* Create the array of pointers to lines, with a default size frequently enough. */
826:
827: lines = total / 50;
828: if (!lines) lines = 2;
829: linearray = (char **) xmalloc (lines * sizeof (char *));
830:
831: /* `nextline' points to the next free slot in this array.
832: `lines' is the allocated size. */
833:
834: nextline = linearray;
835:
836: /* Parse the input file's data, and make entries for the lines. */
837:
838: nextline = parsefile (infile, nextline, file_data, file_size);
839:
840: /* Sort the lines */
841:
842: /* If we have enough space, find the first keyfield of each line in advance.
843: Make a `struct lineinfo' for each line, which records the keyfield
844: as well as the line, and sort them. */
845:
846: lineinfo = (struct lineinfo *) malloc ((nextline - linearray) * sizeof (struct lineinfo));
847:
848: if (lineinfo)
849: {
850: struct lineinfo *lp;
851: char **p;
852:
853: for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
854: {
855: lp->text = *p;
856: lp->key.text = find_field (keyfields, *p, &lp->keylen);
857: if (keyfields->numeric)
858: lp->key.number = atol (lp->key.text);
859: }
860:
861: qsort (lineinfo, nextline - linearray, sizeof (struct lineinfo), compare_prepared);
862:
863: for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
864: *p = lp->text;
865:
866: free (lineinfo);
867: }
868: else
869: qsort (linearray, nextline - linearray, sizeof (char *), compare_full);
870:
871: /* Open the output file */
872:
873: if (outfile)
874: {
875: ostream = fopen (outfile, "w");
876: if (!ostream)
877: pfatal_with_name (outfile);
878: }
879:
880: writelines (linearray, nextline - linearray, ostream);
881: if (outfile) fclose (ostream);
882:
883: free (linearray);
884: free (data);
885: }
886:
887: /* Parse an input string in core into lines.
888: `data' is the input string, and `size' is its length.
889: Data goes in `linearray' starting at `nextline'.
890: The value returned is the first entry in `linearray' still unused. */
891:
892: char **
893: parsefile (filename, nextline, data, size)
894: char *filename;
895: char **nextline;
896: char *data;
897: long size;
898: {
899: char *p, *end;
900: char **line = nextline;
901:
902: p = data;
903: end = p + size;
904: *end = 0;
905:
906: while (p != end)
907: {
908: *line = p;
909: while (*p && *p != '\n') p++;
910: if (p != end) p++;
911:
912: /* This feature will be installed later. */
913: /* if (discard_empty_lines && p == *line + 1) continue; */
914:
915: line++;
916: if (line == linearray + lines)
917: {
918: char **old = linearray;
919: linearray = (char **) xrealloc (linearray, sizeof (char *) * (lines *= 4));
920: line += linearray - old;
921: }
922: }
923:
924: return line;
925: }
926:
927: /* Indexification is a filter applied to the sorted lines
928: as they are being written to the output file.
929: Multiple entries for the same name, with different page numbers,
930: get combined into a single entry with multiple page numbers.
931: The first braced field, which is used for sorting, is discarded.
932: However, its first character is examined, folded to lower case,
933: and if it is different from that in the previous line fed to us
934: a \initial line is written with one argument, the new initial.
935:
936: If an entry has four braced fields, then the second and third
937: constitute primary and secondary names.
938: In this case, each change of primary name
939: generates a \primary line which contains only the primary name,
940: and in between these are \secondary lines which contain
941: just a secondary name and page numbers.
942: */
943:
944: /* The last primary name we wrote a \primary entry for.
945: If only one level of indexing is being done, this is the last name seen */
946: char *lastprimary;
947: int lastprimarylength; /* Length of storage allocated for lastprimary */
948:
949: /* Similar, for the secondary name. */
950: char *lastsecondary;
951: int lastsecondarylength;
952:
953: /* Zero if we are not in the middle of writing an entry.
954: One if we have written the beginning of an entry but have not
955: yet written any page numbers into it.
956: Greater than one if we have written the beginning of an entry
957: plus at least one page number. */
958: int pending;
959:
960: /* The initial (for sorting purposes) of the last primary entry written.
961: When this changes, a \initial {c} line is written */
962:
963: char * lastinitial;
964:
965: int lastinitiallength;
966:
967: /* When we need a string of length 1 for the value of lastinitial,
968: store it here. */
969:
970: char lastinitial1[2];
971:
972: /* Initialize static storage for writing an index */
973:
974: void
975: init_index ()
976: {
977: pending = 0;
978: lastinitial = lastinitial1;
979: lastinitial1[0] = 0;
980: lastinitial1[1] = 0;
981: lastinitiallength = 0;
982: lastprimarylength = 100;
983: lastprimary = (char *) xmalloc (lastprimarylength + 1);
984: bzero (lastprimary, lastprimarylength + 1);
985: lastsecondarylength = 100;
986: lastsecondary = (char *) xmalloc (lastsecondarylength + 1);
987: bzero (lastsecondary, lastsecondarylength + 1);
988: }
989:
990: /* Indexify. Merge entries for the same name,
991: insert headers for each initial character, etc. */
992:
993: indexify (line, ostream)
994: char *line;
995: FILE *ostream;
996: {
997: char *primary, *secondary, *pagenumber;
998: int primarylength, secondarylength, pagelength;
999: int len = strlen (line);
1000: int nosecondary;
1001: int initiallength;
1002: char *initial;
1003: char initial1[2];
1004: register char *p;
1005:
1006: /* First, analyze the parts of the entry fed to us this time */
1007:
1008: p = find_braced_pos (line, 0, 0, 0);
1009: if (*p == '{')
1010: {
1011: initial = p;
1012: /* Get length of inner pair of braces starting at p,
1013: including that inner pair of braces. */
1014: initiallength = find_braced_end (p + 1) + 1 - p;
1015: }
1016: else
1017: {
1018: initial = initial1;
1019: initial1[0] = *p;
1020: initial1[1] = 0;
1021: initiallength = 1;
1022:
1023: if (initial1[0] >= 'a' && initial1[0] <= 'z')
1024: initial1[0] -= 040;
1025: }
1026:
1027: pagenumber = find_braced_pos (line, 1, 0, 0);
1028: pagelength = find_braced_end (pagenumber) - pagenumber;
1029: if (pagelength == 0)
1030: abort ();
1031:
1032: primary = find_braced_pos (line, 2, 0, 0);
1033: primarylength = find_braced_end (primary) - primary;
1034:
1035: secondary = find_braced_pos (line, 3, 0, 0);
1036: nosecondary = !*secondary;
1037: if (!nosecondary)
1038: secondarylength = find_braced_end (secondary) - secondary;
1039:
1040: /* If the primary is different from before, make a new primary entry */
1041: if (strncmp (primary, lastprimary, primarylength))
1042: {
1043: /* Close off current secondary entry first, if one is open */
1044: if (pending)
1045: {
1046: fputs ("}\n", ostream);
1047: pending = 0;
1048: }
1049:
1050: /* If this primary has a different initial, include an entry for the initial */
1051: if (initiallength != lastinitiallength || strcmp (initial, lastinitial))
1052: {
1053: fprintf (ostream, "\\initial {");
1054: fwrite (initial, 1, initiallength, ostream);
1055: fprintf (ostream, "}\n", initial);
1056: if (initial == initial1)
1057: {
1058: lastinitial = lastinitial1;
1059: *lastinitial1 = *initial1;
1060: }
1061: else
1062: {
1063: lastinitial = initial;
1064: }
1065: lastinitiallength = initiallength;
1066: }
1067:
1068: /* Make the entry for the primary. */
1069: if (nosecondary)
1070: fputs ("\\entry {", ostream);
1071: else
1072: fputs ("\\primary {", ostream);
1073: fwrite (primary, primarylength, 1, ostream);
1074: if (nosecondary)
1075: {
1076: fputs ("}{", ostream);
1077: pending = 1;
1078: }
1079: else
1080: fputs ("}\n", ostream);
1081:
1082: /* Record name of most recent primary */
1083: if (lastprimarylength < primarylength)
1084: {
1085: lastprimarylength = primarylength + 100;
1086: lastprimary = (char *) xrealloc (lastprimary,
1087: 1 + lastprimarylength);
1088: }
1089: strncpy (lastprimary, primary, primarylength);
1090: lastprimary[primarylength] = 0;
1091:
1092: /* There is no current secondary within this primary, now */
1093: lastsecondary[0] = 0;
1094: }
1095:
1096: /* Should not have an entry with no subtopic following one with a subtopic */
1097:
1098: if (nosecondary && *lastsecondary)
1099: error ("entry %s follows an entry with a secondary name", line);
1100:
1101: /* Start a new secondary entry if necessary */
1102: if (!nosecondary && strncmp (secondary, lastsecondary, secondarylength))
1103: {
1104: if (pending)
1105: {
1106: fputs ("}\n", ostream);
1107: pending = 0;
1108: }
1109:
1110: /* Write the entry for the secondary. */
1111: fputs ("\\secondary {", ostream);
1112: fwrite (secondary, secondarylength, 1, ostream);
1113: fputs ("}{", ostream);
1114: pending = 1;
1115:
1116: /* Record name of most recent secondary */
1117: if (lastsecondarylength < secondarylength)
1118: {
1119: lastsecondarylength = secondarylength + 100;
1120: lastsecondary = (char *) xrealloc (lastsecondary,
1121: 1 + lastsecondarylength);
1122: }
1123: strncpy (lastsecondary, secondary, secondarylength);
1124: lastsecondary[secondarylength] = 0;
1125: }
1126:
1127: /* Here to add one more page number to the current entry */
1128: if (pending++ != 1)
1129: fputs (", ", ostream); /* Punctuate first, if this is not the first */
1130: fwrite (pagenumber, pagelength, 1, ostream);
1131: }
1132:
1133: /* Close out any unfinished output entry */
1134:
1135: void
1136: finish_index (ostream)
1137: FILE *ostream;
1138: {
1139: if (pending)
1140: fputs ("}\n", ostream);
1141: free (lastprimary);
1142: free (lastsecondary);
1143: }
1144:
1145: /* Copy the lines in the sorted order.
1146: Each line is copied out of the input file it was found in. */
1147:
1148: void
1149: writelines (linearray, nlines, ostream)
1150: char **linearray;
1151: int nlines;
1152: FILE *ostream;
1153: {
1154: char **stop_line = linearray + nlines;
1155: char **next_line;
1156:
1157: init_index ();
1158:
1159: /* Output the text of the lines, and free the buffer space */
1160:
1161: for (next_line = linearray; next_line != stop_line; next_line++)
1162: {
1163: /* If -u was specified, output the line only if distinct from previous one. */
1164: if (next_line == linearray
1165: /* Compare previous line with this one, using only the explicitly specd keyfields */
1166: || compare_general (*(next_line - 1), *next_line, 0L, 0L, num_keyfields - 1))
1167: {
1168: char *p = *next_line;
1169: char c;
1170: while ((c = *p++) && c != '\n');
1171: *(p-1) = 0;
1172: indexify (*next_line, ostream);
1173: }
1174: }
1175:
1176: finish_index (ostream);
1177: }
1178:
1179: /* Assume (and optionally verify) that each input file is sorted;
1180: merge them and output the result.
1181: Returns nonzero if any input file fails to be sorted.
1182:
1183: This is the high-level interface that can handle an unlimited number of files. */
1184:
1185: #define MAX_DIRECT_MERGE 10
1186:
1187: int
1188: merge_files (infiles, nfiles, outfile)
1189: char **infiles;
1190: int nfiles;
1191: char *outfile;
1192: {
1193: char **tempfiles;
1194: int ntemps;
1195: int i;
1196: int value = 0;
1197: int start_tempcount = tempcount;
1198:
1199: if (nfiles <= MAX_DIRECT_MERGE)
1200: return merge_direct (infiles, nfiles, outfile);
1201:
1202: /* Merge groups of MAX_DIRECT_MERGE input files at a time,
1203: making a temporary file to hold each group's result. */
1204:
1205: ntemps = (nfiles + MAX_DIRECT_MERGE - 1) / MAX_DIRECT_MERGE;
1206: tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
1207: for (i = 0; i < ntemps; i++)
1208: {
1209: int nf = MAX_DIRECT_MERGE;
1210: if (i + 1 == ntemps)
1211: nf = nfiles - i * MAX_DIRECT_MERGE;
1212: tempfiles[i] = maketempname (++tempcount);
1213: value |= merge_direct (&infiles[i * MAX_DIRECT_MERGE], nf, tempfiles[i]);
1214: }
1215:
1216: /* All temporary files that existed before are no longer needed
1217: since their contents have been merged into our new tempfiles.
1218: So delete them. */
1219: flush_tempfiles (start_tempcount);
1220:
1221: /* Now merge the temporary files we created. */
1222:
1223: merge_files (tempfiles, ntemps, outfile);
1224:
1225: free (tempfiles);
1226:
1227: return value;
1228: }
1229:
1230: /* Assume (and optionally verify) that each input file is sorted;
1231: merge them and output the result.
1232: Returns nonzero if any input file fails to be sorted.
1233:
1234: This version of merging will not work if the number of
1235: input files gets too high. Higher level functions
1236: use it only with a bounded number of input files. */
1237:
1238: int
1239: merge_direct (infiles, nfiles, outfile)
1240: char **infiles;
1241: int nfiles;
1242: char *outfile;
1243: {
1244: char **ip = infiles;
1245: struct linebuffer *lb1, *lb2;
1246: struct linebuffer **thisline, **prevline;
1247: FILE **streams;
1248: int i;
1249: int nleft;
1250: int lossage = 0;
1251: int *file_lossage;
1252: struct linebuffer *prev_out = 0;
1253: FILE *ostream = stdout;
1254:
1255: if (outfile)
1256: ostream = fopen (outfile, "w");
1257: if (!ostream) pfatal_with_name (outfile);
1258:
1259: init_index ();
1260:
1261: if (nfiles == 0)
1262: {
1263: if (outfile)
1264: fclose (ostream);
1265: return 0;
1266: }
1267:
1268: /* For each file, make two line buffers.
1269: Also, for each file, there is an element of `thisline'
1270: which points at any time to one of the file's two buffers,
1271: and an element of `prevline' which points to the other buffer.
1272: `thisline' is supposed to point to the next available line from the file,
1273: while `prevline' holds the last file line used,
1274: which is remembered so that we can verify that the file is properly sorted. */
1275:
1276: /* lb1 and lb2 contain one buffer each per file */
1277: lb1 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1278: lb2 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1279:
1280: /* thisline[i] points to the linebuffer holding the next available line in file i,
1281: or is zero if there are no lines left in that file. */
1282: thisline = (struct linebuffer **) xmalloc (nfiles * sizeof (struct linebuffer *));
1283: /* prevline[i] points to the linebuffer holding the last used line from file i.
1284: This is just for verifying that file i is properly sorted. */
1285: prevline = (struct linebuffer **) xmalloc (nfiles * sizeof (struct linebuffer *));
1286: /* streams[i] holds the input stream for file i. */
1287: streams = (FILE **) xmalloc (nfiles * sizeof (FILE *));
1288: /* file_lossage[i] is nonzero if we already know file i is not properly sorted. */
1289: file_lossage = (int *) xmalloc (nfiles * sizeof (int));
1290:
1291: /* Allocate and initialize all that storage */
1292:
1293: for (i = 0; i < nfiles; i++)
1294: {
1295: initbuffer (&lb1[i]);
1296: initbuffer (&lb2[i]);
1297: thisline[i] = &lb1[i];
1298: prevline[i] = &lb2[i];
1299: file_lossage[i] = 0;
1300: streams[i] = fopen (infiles[i], "r");
1301: if (!streams[i])
1302: pfatal_with_name (infiles[i]);
1303:
1304: readline (thisline[i], streams[i]);
1305: }
1306:
1307: /* Keep count of number of files not at eof */
1308: nleft = nfiles;
1309:
1310: while (nleft)
1311: {
1312: struct linebuffer *best = 0;
1313: struct linebuffer *exch;
1314: int bestfile = -1;
1315: int i;
1316:
1317: /* Look at the next avail line of each file; choose the least one. */
1318:
1319: for (i = 0; i < nfiles; i++)
1320: {
1321: if (thisline[i] &&
1322: (!best ||
1323: 0 < compare_general (best->buffer, thisline[i]->buffer,
1324: (long) bestfile, (long) i, num_keyfields)))
1325: {
1326: best = thisline[i];
1327: bestfile = i;
1328: }
1329: }
1330:
1331: /* Output that line, unless it matches the previous one and we don't want duplicates */
1332:
1333: if (!(prev_out &&
1334: !compare_general (prev_out->buffer, best->buffer, 0L, 1L, num_keyfields - 1)))
1335: indexify (best->buffer, ostream);
1336: prev_out = best;
1337:
1338: /* Now make the line the previous of its file, and fetch a new line from that file */
1339:
1340: exch = prevline[bestfile];
1341: prevline[bestfile] = thisline[bestfile];
1342: thisline[bestfile] = exch;
1343:
1344: while (1)
1345: {
1346: /* If the file has no more, mark it empty */
1347:
1348: if (feof (streams[bestfile]))
1349: {
1350: thisline[bestfile] = 0;
1351: nleft--; /* Update the number of files still not empty */
1352: break;
1353: }
1354: readline (thisline[bestfile], streams[bestfile]);
1355: if (thisline[bestfile]->buffer[0] || !feof (streams[bestfile])) break;
1356: }
1357: }
1358:
1359: finish_index (ostream);
1360:
1361: /* Free all storage and close all input streams */
1362:
1363: for (i = 0; i < nfiles; i++)
1364: {
1365: fclose (streams[i]);
1366: free (lb1[i].buffer);
1367: free (lb2[i].buffer);
1368: }
1369: free (file_lossage);
1370: free (lb1);
1371: free (lb2);
1372: free (thisline);
1373: free (prevline);
1374: free (streams);
1375:
1376: if (outfile)
1377: fclose (ostream);
1378:
1379: return lossage;
1380: }
1381:
1382: /* Print error message and exit. */
1383:
1384: fatal (s1, s2)
1385: char *s1, *s2;
1386: {
1387: error (s1, s2);
1388: exit (1);
1389: }
1390:
1391: /* Print error message. `s1' is printf control string, `s2' is arg for it. */
1392:
1393: error (s1, s2)
1394: char *s1, *s2;
1395: {
1396: printf ("texi: ");
1397: printf (s1, s2);
1398: printf ("\n");
1399: }
1400:
1401: perror_with_name (name)
1402: char *name;
1403: {
1404: extern int errno, sys_nerr;
1405: extern char *sys_errlist[];
1406: char *s;
1407:
1408: if (errno < sys_nerr)
1409: s = concat ("", sys_errlist[errno], " for %s");
1410: else
1411: s = "cannot open %s";
1412: error (s, name);
1413: }
1414:
1415: pfatal_with_name (name)
1416: char *name;
1417: {
1418: extern int errno, sys_nerr;
1419: extern char *sys_errlist[];
1420: char *s;
1421:
1422: if (errno < sys_nerr)
1423: s = concat ("", sys_errlist[errno], " for %s");
1424: else
1425: s = "cannot open %s";
1426: fatal (s, name);
1427: }
1428:
1429: /* Return a newly-allocated string whose contents concatenate those of s1, s2, s3. */
1430:
1431: char *
1432: concat (s1, s2, s3)
1433: char *s1, *s2, *s3;
1434: {
1435: int len1 = strlen (s1), len2 = strlen (s2), len3 = strlen (s3);
1436: char *result = (char *) xmalloc (len1 + len2 + len3 + 1);
1437:
1438: strcpy (result, s1);
1439: strcpy (result + len1, s2);
1440: strcpy (result + len1 + len2, s3);
1441: *(result + len1 + len2 + len3) = 0;
1442:
1443: return result;
1444: }
1445:
1446: /* Like malloc but get fatal error if memory is exhausted. */
1447:
1448: int
1449: xmalloc (size)
1450: int size;
1451: {
1452: int result = malloc (size);
1453: if (!result)
1454: fatal ("virtual memory exhausted", 0);
1455: return result;
1456: }
1457:
1458:
1459: int
1460: xrealloc (ptr, size)
1461: char *ptr;
1462: int size;
1463: {
1464: int result = realloc (ptr, size);
1465: if (!result)
1466: fatal ("virtual memory exhausted");
1467: return result;
1468: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.