|
|
1.1 root 1: /* Prepare Tex index dribble output into an actual index.
2: Copyright (C) 1987 Free Software Foundation, Inc.
3:
4: This program is free software; you can redistribute it and/or modify
5: it under the terms of the GNU General Public License as published by
6: the Free Software Foundation; either version 1, or (at your option)
7: any later version.
8:
9: This program is distributed in the hope that it will be useful,
10: but WITHOUT ANY WARRANTY; without even the implied warranty of
11: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12: GNU General Public License for more details.
13:
14: You should have received a copy of the GNU General Public License
15: along with this program; if not, write to the Free Software
16: Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17:
18: In other words, you are welcome to use, share and improve this program.
19: You are forbidden to forbid anyone else to use, share and improve
20: what you give them. Help stamp out software-hoarding! */
21:
22:
23: #include <stdio.h>
24: #include <ctype.h>
25:
26: #ifdef VMS
27: #include <file.h>
28:
29: #define EXIT_SUCCESS ((1 << 28) | 1)
30: #define EXIT_FATAL ((1 << 28) | 4)
31: #define unlink delete
32: #define tell(fd) lseek(fd, 0L, 1)
33: #else
34: #include <sys/types.h>
35: #include <sys/fcntl.h>
36: #include <sys/file.h>
37:
38: #define EXIT_SUCCESS 0
39: #define EXIT_FATAL 1
40: #endif
41:
42:
43: #ifndef L_XTND
44: #define L_XTND 2
45: #endif
46:
47: /* When sorting in core, this structure describes one line
48: and the position and length of its first keyfield. */
49:
50: struct lineinfo
51: {
52: char *text; /* The actual text of the line */
53: union
54: { /* The start of the key (for textual comparison) */
55: char *text;
56: long number; /* or the numeric value (for numeric comparison) */
57: } key;
58: long keylen; /* Length of key field */
59: };
60:
61: /* This structure describes a field to use as a sort key */
62:
63: struct keyfield
64: {
65: int startwords; /* # words to skip */
66: int startchars; /* and # additional chars to skip, to start of field */
67: int endwords; /* similar, from beg (or end) of line, to find end of field */
68: int endchars;
69: char ignore_blanks; /* Ignore spaces and tabs within the field */
70: char fold_case; /* Convert upper case to lower before comparing */
71: char reverse; /* Compare in reverse order */
72: char numeric; /* Parse text as an integer and compare the integers */
73: char positional; /* Sort according to position within the file */
74: char braced; /* Count balanced-braced groupings as fields */
75: };
76:
77: /* Vector of keyfields to use */
78:
79: struct keyfield keyfields[3];
80:
81: /* Number of keyfields stored in that vector. */
82:
83: int num_keyfields = 3;
84:
85: /* Vector of input file names, terminated with a zero (null pointer) */
86:
87: char **infiles;
88:
89: /* Vector of corresponding output file names, or zero meaning default it */
90:
91: char **outfiles;
92:
93: /* Length of `infiles' */
94:
95: int num_infiles;
96:
97: /* Pointer to the array of pointers to lines being sorted */
98:
99: char **linearray;
100:
101: /* The allocated length of `linearray'. */
102:
103: long lines;
104:
105: /* Directory to use for temporary files. On Unix, it ends with a slash. */
106:
107: char *tempdir;
108:
109: /* Start of filename to use for temporary files. */
110:
111: char *tempbase;
112:
113: /* Number of last temporary file. */
114:
115: int tempcount;
116:
117: /* Number of last temporary file already deleted.
118: Temporary files are deleted by `flush_tempfiles' in order of creation. */
119:
120: int last_deleted_tempcount;
121:
122: /* During in-core sort, this points to the base of the data block
123: which contains all the lines of data. */
124:
125: char *text_base;
126:
127: /* Additional command switches */
128:
129: int keep_tempfiles; /* Nonzero means do not delete tempfiles -- for debugging */
130:
131: /* Forward declarations of functions in this file */
132:
133: void decode_command ();
134: void sort_in_core ();
135: void sort_offline ();
136: char **parsefile ();
137: char *find_field ();
138: char *find_pos ();
139: long find_value ();
140: char *find_braced_pos ();
141: char *find_braced_end ();
142: void writelines ();
143: int compare_full ();
144: long readline ();
145: int merge_files ();
146: int merge_direct ();
147: char *concat ();
148: char *maketempname ();
149: void flush_tempfiles ();
150: char *tempcopy ();
151:
152: extern char *mktemp ();
153:
154: #define MAX_IN_CORE_SORT 500000
155:
156: int
157: main (argc, argv)
158: int argc;
159: char **argv;
160: {
161: int i;
162:
163: tempcount = 0;
164: last_deleted_tempcount = 0;
165:
166: /* Describe the kind of sorting to do. */
167: /* The first keyfield uses the first braced field and folds case */
168: keyfields[0].braced = 1;
169: keyfields[0].fold_case = 1;
170: keyfields[0].endwords = -1;
171: keyfields[0].endchars = -1;
172: /* The second keyfield uses the second braced field, numerically */
173: keyfields[1].braced = 1;
174: keyfields[1].numeric = 1;
175: keyfields[1].startwords = 1;
176: keyfields[1].endwords = -1;
177: keyfields[1].endchars = -1;
178: /* The third keyfield (which is ignored while discarding duplicates)
179: compares the whole line */
180: keyfields[2].endwords = -1;
181: keyfields[2].endchars = -1;
182:
183: decode_command (argc, argv);
184:
185: tempbase = mktemp (concat ("txiXXXXXX", "", ""));
186:
187: /* Process input files completely, one by one. */
188:
189: for (i = 0; i < num_infiles; i++)
190: {
191: int desc;
192: long ptr;
193: char *outfile;
194: char *p;
195:
196: desc = open (infiles[i], 0, 0);
197: if (desc < 0) pfatal_with_name (infiles[i]);
198: lseek (desc, 0, L_XTND);
199: ptr = tell (desc);
200: close (desc);
201:
202: outfile = outfiles[i];
203: if (!outfile)
204: {
205: outfile = concat (infiles[i], "s", "");
206: }
207:
208: if (ptr < MAX_IN_CORE_SORT)
209: /* Sort a small amount of data */
210: sort_in_core (infiles[i], ptr, outfile);
211: else
212: sort_offline (infiles[i], ptr, outfile);
213: }
214:
215: flush_tempfiles (tempcount);
216: exit (EXIT_SUCCESS);
217: }
218:
219: /* This page decodes the command line arguments to set the parameter variables
220: and set up the vector of keyfields and the vector of input files */
221:
222: void
223: decode_command (argc, argv)
224: int argc;
225: char **argv;
226: {
227: int i;
228: char **ip;
229: char **op;
230:
231: /* Store default values into parameter variables */
232:
233: #ifdef VMS
234: tempdir = "sys$scratch:";
235: #else
236: tempdir = "/tmp/";
237: #endif
238:
239: keep_tempfiles = 0;
240:
241: /* Allocate argc input files, which must be enough. */
242:
243: infiles = (char **) xmalloc (argc * sizeof (char *));
244: outfiles = (char **) xmalloc (argc * sizeof (char *));
245: ip = infiles;
246: op = outfiles;
247:
248: /* First find all switches that control the default kind-of-sort */
249:
250: for (i = 1; i < argc; i++)
251: {
252: int tem = classify_arg (argv[i]);
253: char c;
254: char *p;
255:
256: if (tem <= 0)
257: {
258: *ip++ = argv[i];
259: *op++ = 0;
260: continue;
261: }
262: if (tem > 1)
263: {
264: if (i + 1 == argc)
265: fatal ("switch %s given with no argument following it", argv[i]);
266: else if (!strcmp (argv[i], "-T"))
267: tempdir = argv[i + 1];
268: else if (!strcmp (argv[i], "-o"))
269: *(op - 1) = argv[i + 1];
270: i += tem - 1;
271: continue;
272: }
273:
274: p = &argv[i][1];
275: while (c = *p++)
276: switch (c)
277: {
278: case 'k':
279: keep_tempfiles = 1;
280: break;
281:
282: default:
283: fatal ("invalid command switch %c", c);
284: }
285: switchdone: ;
286: }
287:
288: /* Record number of keyfields, terminate list of filenames */
289:
290: num_infiles = ip - infiles;
291: *ip = 0;
292: }
293:
294: /* Return 0 for an argument that is not a switch;
295: for a switch, return 1 plus the number of following arguments that the switch swallows.
296: */
297:
298: int
299: classify_arg (arg)
300: char *arg;
301: {
302: if (!strcmp (arg, "-T") || !strcmp (arg, "-o"))
303: return 2;
304: if (arg[0] == '-')
305: return 1;
306: return 0;
307: }
308:
309: /* Create a name for a temporary file */
310:
311: char *
312: maketempname (count)
313: int count;
314: {
315: char tempsuffix[10];
316: sprintf (tempsuffix, "%d", count);
317: return concat (tempdir, tempbase, tempsuffix);
318: }
319:
320: /* Delete all temporary files up to the specified count */
321:
322: void
323: flush_tempfiles (to_count)
324: int to_count;
325: {
326: if (keep_tempfiles) return;
327: while (last_deleted_tempcount < to_count)
328: unlink (maketempname (++last_deleted_tempcount));
329: }
330:
331: /* Copy an input file into a temporary file, and return the temporary file name */
332:
333: #define BUFSIZE 1024
334:
335: char *
336: tempcopy (idesc)
337: int idesc;
338: {
339: char *outfile = maketempname (++tempcount);
340: int odesc;
341: char buffer[BUFSIZE];
342:
343: odesc = open (outfile, O_WRONLY | O_CREAT, 0666);
344:
345: if (odesc < 0) pfatal_with_name (outfile);
346:
347: while (1)
348: {
349: int nread = read (idesc, buffer, BUFSIZE);
350: write (odesc, buffer, nread);
351: if (!nread) break;
352: }
353:
354: close (odesc);
355:
356: return outfile;
357: }
358:
359: /* Compare two lines, provided as pointers to pointers to text,
360: according to the specified set of keyfields */
361:
362: int
363: compare_full (line1, line2)
364: char **line1, **line2;
365: {
366: int i;
367:
368: /* Compare using the first keyfield;
369: if that does not distinguish the lines, try the second keyfield; and so on. */
370:
371: for (i = 0; i < num_keyfields; i++)
372: {
373: long length1, length2;
374: char *start1 = find_field (&keyfields[i], *line1, &length1);
375: char *start2 = find_field (&keyfields[i], *line2, &length2);
376: int tem = compare_field (&keyfields[i], start1, length1, *line1 - text_base,
377: start2, length2, *line2 - text_base);
378: if (tem)
379: {
380: if (keyfields[i].reverse)
381: return - tem;
382: return tem;
383: }
384: }
385:
386: return 0; /* Lines match exactly */
387: }
388:
389: /* Compare two lines described by structures
390: in which the first keyfield is identified in advance.
391: For positional sorting, assumes that the order of the lines in core
392: reflects their nominal order. */
393:
394: int
395: compare_prepared (line1, line2)
396: struct lineinfo *line1, *line2;
397: {
398: int i;
399: int tem;
400: char *text1, *text2;
401:
402: /* Compare using the first keyfield, which has been found for us already */
403: if (keyfields->positional)
404: {
405: if (line1->text - text_base > line2->text - text_base)
406: tem = 1;
407: else
408: tem = -1;
409: }
410: else if (keyfields->numeric)
411: tem = line1->key.number - line2->key.number;
412: else
413: tem = compare_field (keyfields, line1->key.text, line1->keylen, 0, line2->key.text, line2->keylen, 0);
414: if (tem)
415: {
416: if (keyfields->reverse)
417: return - tem;
418: return tem;
419: }
420:
421: text1 = line1->text;
422: text2 = line2->text;
423:
424: /* Compare using the second keyfield;
425: if that does not distinguish the lines, try the third keyfield; and so on. */
426:
427: for (i = 1; i < num_keyfields; i++)
428: {
429: long length1, length2;
430: char *start1 = find_field (&keyfields[i], text1, &length1);
431: char *start2 = find_field (&keyfields[i], text2, &length2);
432: int tem = compare_field (&keyfields[i], start1, length1, text1 - text_base,
433: start2, length2, text2 - text_base);
434: if (tem)
435: {
436: if (keyfields[i].reverse)
437: return - tem;
438: return tem;
439: }
440: }
441:
442: return 0; /* Lines match exactly */
443: }
444:
445: /* Like compare_full but more general.
446: You can pass any strings, and you can say how many keyfields to use.
447: `pos1' and `pos2' should indicate the nominal positional ordering of
448: the two lines in the input. */
449:
450: int
451: compare_general (str1, str2, pos1, pos2, use_keyfields)
452: char *str1, *str2;
453: long pos1, pos2;
454: int use_keyfields;
455: {
456: int i;
457:
458: /* Compare using the first keyfield;
459: if that does not distinguish the lines, try the second keyfield; and so on. */
460:
461: for (i = 0; i < use_keyfields; i++)
462: {
463: long length1, length2;
464: char *start1 = find_field (&keyfields[i], str1, &length1);
465: char *start2 = find_field (&keyfields[i], str2, &length2);
466: int tem = compare_field (&keyfields[i], start1, length1, pos1, start2, length2, pos2);
467: if (tem)
468: {
469: if (keyfields[i].reverse)
470: return - tem;
471: return tem;
472: }
473: }
474:
475: return 0; /* Lines match exactly */
476: }
477:
478: /* Find the start and length of a field in `str' according to `keyfield'.
479: A pointer to the starting character is returned, and the length
480: is stored into the int that `lengthptr' points to. */
481:
482: char *
483: find_field (keyfield, str, lengthptr)
484: struct keyfield *keyfield;
485: char *str;
486: long *lengthptr;
487: {
488: char *start;
489: char *end;
490: char *(*fun) ();
491:
492: if (keyfield->braced) fun = find_braced_pos;
493: else fun = find_pos;
494:
495: start = ( *fun )(str, keyfield->startwords, keyfield->startchars,
496: keyfield->ignore_blanks);
497: if (keyfield->endwords < 0)
498: {
499: if (keyfield->braced)
500: end = find_braced_end (start);
501: else
502: {
503: end = start;
504: while (*end && *end != '\n') end++;
505: }
506: }
507: else
508: {
509: end = ( *fun )(str, keyfield->endwords, keyfield->endchars, 0);
510: if (end - str < start - str) end = start;
511: }
512: *lengthptr = end - start;
513: return start;
514: }
515:
516: /* Find a pointer to a specified place within `str',
517: skipping (from the beginning) `words' words and then `chars' chars.
518: If `ignore_blanks' is nonzero, we skip all blanks
519: after finding the specified word. */
520:
521: char *
522: find_pos (str, words, chars, ignore_blanks)
523: char *str;
524: int words, chars;
525: int ignore_blanks;
526: {
527: int i;
528: char *p = str;
529:
530: for (i = 0; i < words; i++)
531: {
532: char c;
533: /* Find next bunch of nonblanks and skip them. */
534: while ((c = *p) == ' ' || c == '\t') p++;
535: while ((c = *p) && c != '\n' && !(c == ' ' || c == '\t')) p++;
536: if (!*p || *p == '\n') return p;
537: }
538:
539: while (*p == ' ' || *p == '\t') p++;
540:
541: for (i = 0; i < chars; i++)
542: {
543: if (!*p || *p == '\n') break;
544: p++;
545: }
546: return p;
547: }
548:
549: /* Like find_pos but assumes that each field is surrounded by braces
550: and that braces within fields are balanced. */
551:
552: char *
553: find_braced_pos (str, words, chars, ignore_blanks)
554: char *str;
555: int words, chars;
556: int ignore_blanks;
557: {
558: int i;
559: int bracelevel;
560: char *p = str;
561: char c;
562:
563: for (i = 0; i < words; i++)
564: {
565: bracelevel = 1;
566: while ((c = *p++) != '{' && c != '\n' && c);
567: if (c != '{')
568: return p - 1;
569: while (bracelevel)
570: {
571: c = *p++;
572: if (c == '{') bracelevel++;
573: if (c == '}') bracelevel--;
574: if (c == '\\') c = *p++; /* \ quotes braces and \ */
575: if (c == 0 || c == '\n') return p-1;
576: }
577: }
578:
579: while ((c = *p++) != '{' && c != '\n' && c);
580:
581: if (c != '{')
582: return p-1;
583:
584: if (ignore_blanks)
585: while ((c = *p) == ' ' || c == '\t') p++;
586:
587: for (i = 0; i < chars; i++)
588: {
589: if (!*p || *p == '\n') break;
590: p++;
591: }
592: return p;
593: }
594:
595: /* Find the end of the balanced-brace field which starts at `str'.
596: The position returned is just before the closing brace. */
597:
598: char *
599: find_braced_end (str)
600: char *str;
601: {
602: int bracelevel;
603: char *p = str;
604: char c;
605:
606: bracelevel = 1;
607: while (bracelevel)
608: {
609: c = *p++;
610: if (c == '{') bracelevel++;
611: if (c == '}') bracelevel--;
612: if (c == '\\') c = *p++;
613: if (c == 0 || c == '\n') return p-1;
614: }
615: return p - 1;
616: }
617:
618: long
619: find_value (start, length)
620: char *start;
621: long length;
622: {
623: while (length != 0L) {
624: if (isdigit(*start))
625: return atol(start);
626: length--;
627: start++;
628: }
629: return 0l;
630: }
631:
632: /* Vector used to translate characters for comparison.
633: This is how we make all alphanumerics follow all else,
634: and ignore case in the first sorting. */
635: int char_order[256];
636:
637: init_char_order ()
638: {
639: int i;
640: for (i = 1; i < 256; i++)
641: char_order[i] = i;
642:
643: for (i = '0'; i <= '9'; i++)
644: char_order[i] += 512;
645:
646: for (i = 'a'; i <= 'z'; i++) {
647: char_order[i] = 512 + i;
648: char_order[i + 'A' - 'a'] = 512 + i;
649: }
650: }
651:
652: /* Compare two fields (each specified as a start pointer and a character count)
653: according to `keyfield'. The sign of the value reports the relation between the fields */
654:
655: int
656: compare_field (keyfield, start1, length1, pos1, start2, length2, pos2)
657: struct keyfield *keyfield;
658: char *start1;
659: long length1;
660: long pos1;
661: char *start2;
662: long length2;
663: long pos2;
664: {
665: if (keyfields->positional)
666: {
667: if (pos1 > pos2)
668: return 1;
669: else
670: return -1;
671: }
672: if (keyfield->numeric)
673: {
674: long value = find_value (start1, length1) - find_value (start2, length2);
675: if (value > 0) return 1;
676: if (value < 0) return -1;
677: return 0;
678: }
679: else
680: {
681: char *p1 = start1;
682: char *p2 = start2;
683: char *e1 = start1 + length1;
684: char *e2 = start2 + length2;
685:
686: int fold_case = keyfield->fold_case;
687:
688: while (1)
689: {
690: int c1, c2;
691:
692: if (p1 == e1) c1 = 0;
693: else c1 = *p1++;
694: if (p2 == e2) c2 = 0;
695: else c2 = *p2++;
696:
697: if (char_order[c1] != char_order[c2])
698: return char_order[c1] - char_order[c2];
699: if (!c1) break;
700: }
701:
702: /* Strings are equal except possibly for case. */
703: p1 = start1;
704: p2 = start2;
705: while (1)
706: {
707: int c1, c2;
708:
709: if (p1 == e1) c1 = 0;
710: else c1 = *p1++;
711: if (p2 == e2) c2 = 0;
712: else c2 = *p2++;
713:
714: if (c1 != c2)
715: /* Reverse sign here so upper case comes out last. */
716: return c2 - c1;
717: if (!c1) break;
718: }
719:
720: return 0;
721: }
722: }
723:
724: /* A `struct linebuffer' is a structure which holds a line of text.
725: `readline' reads a line from a stream into a linebuffer
726: and works regardless of the length of the line. */
727:
728: struct linebuffer
729: {
730: long size;
731: char *buffer;
732: };
733:
734: /* Initialize a linebuffer for use */
735:
736: void
737: initbuffer (linebuffer)
738: struct linebuffer *linebuffer;
739: {
740: linebuffer->size = 200;
741: linebuffer->buffer = (char *) xmalloc (200);
742: }
743:
744: /* Read a line of text from `stream' into `linebuffer'.
745: Return the length of the line. */
746:
747: long
748: readline (linebuffer, stream)
749: struct linebuffer *linebuffer;
750: FILE *stream;
751: {
752: char *buffer = linebuffer->buffer;
753: char *p = linebuffer->buffer;
754: char *end = p + linebuffer->size;
755:
756: while (1)
757: {
758: int c = getc (stream);
759: if (p == end)
760: {
761: buffer = (char *) xrealloc (buffer, linebuffer->size *= 2);
762: p += buffer - linebuffer->buffer;
763: end += buffer - linebuffer->buffer;
764: linebuffer->buffer = buffer;
765: }
766: if (c < 0 || c == '\n')
767: {
768: *p = 0;
769: break;
770: }
771: *p++ = c;
772: }
773:
774: return p - buffer;
775: }
776:
777: /* Sort an input file too big to sort in core. */
778:
779: void
780: sort_offline (infile, nfiles, total, outfile)
781: char *infile;
782: long total;
783: char *outfile;
784: {
785: int ntemps = 2 * (total + MAX_IN_CORE_SORT - 1) / MAX_IN_CORE_SORT; /* More than enough */
786: char **tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
787: FILE *istream = fopen (infile, "r");
788: int i;
789: struct linebuffer lb;
790: long linelength;
791: int failure = 0;
792:
793: initbuffer (&lb);
794:
795: /* Read in one line of input data. */
796:
797: linelength = readline (&lb, istream);
798:
799: if (lb.buffer[0] != '\\')
800: {
801: error ("%s: not a texinfo index file", infile);
802: return;
803: }
804:
805: /* Split up the input into `ntemps' temporary files, or maybe fewer,
806: and put the new files' names into `tempfiles' */
807:
808: for (i = 0; i < ntemps; i++)
809: {
810: char *outname = maketempname (++tempcount);
811: FILE *ostream = fopen (outname, "w");
812: long tempsize = 0;
813:
814: if (!ostream) pfatal_with_name (outname);
815: tempfiles[i] = outname;
816:
817: /* Copy lines into this temp file as long as it does not make file "too big"
818: or until there are no more lines. */
819:
820: while (tempsize + linelength + 1 <= MAX_IN_CORE_SORT)
821: {
822: tempsize += linelength + 1;
823: fputs (lb.buffer, ostream);
824: putc ('\n', ostream);
825:
826: /* Read another line of input data. */
827:
828: linelength = readline (&lb, istream);
829: if (!linelength && feof (istream)) break;
830:
831: if (lb.buffer[0] != '\\')
832: {
833: error ("%s: not a texinfo index file", infile);
834: failure = 1;
835: goto fail;
836: }
837: }
838: fclose (ostream);
839: if (feof (istream)) break;
840: }
841:
842: free (lb.buffer);
843:
844: fail:
845: /* Record number of temp files we actually needed. */
846:
847: ntemps = i;
848:
849: /* Sort each tempfile into another tempfile.
850: Delete the first set of tempfiles and put the names of the second into `tempfiles' */
851:
852: for (i = 0; i < ntemps; i++)
853: {
854: char *newtemp = maketempname (++tempcount);
855: sort_in_core (&tempfiles[i], MAX_IN_CORE_SORT, newtemp);
856: if (!keep_tempfiles)
857: unlink (tempfiles[i]);
858: tempfiles[i] = newtemp;
859: }
860:
861: if (failure)
862: return;
863:
864: /* Merge the tempfiles together and indexify */
865:
866: merge_files (tempfiles, ntemps, outfile);
867: }
868:
869: /* Sort `infile', whose size is `total',
870: assuming that is small enough to be done in-core,
871: then indexify it and send the output to `outfile' (or to stdout). */
872:
873: void
874: sort_in_core (infile, total, outfile)
875: char *infile;
876: long total;
877: char *outfile;
878: {
879: char **nextline;
880: char *data = (char *) xmalloc (total + 1);
881: char *file_data;
882: long file_size;
883: int i;
884: FILE *ostream = stdout;
885: struct lineinfo *lineinfo;
886:
887: /* Read the contents of the file into the moby array `data' */
888:
889: int desc = open (infile, 0, 0);
890:
891: if (desc < 0)
892: fatal ("failure reopening %s", infile);
893: for (file_size = 0; ; )
894: {
895: if ((i = read (desc, data + file_size, total - file_size)) <= 0)
896: break;
897: file_size += i;
898: }
899: file_data = data;
900: data[file_size] = 0;
901:
902: close (desc);
903:
904: if (file_size > 0 && data[0] != '\\')
905: {
906: error ("%s: not a texinfo index file", infile);
907: return;
908: }
909:
910: init_char_order ();
911:
912: /* Sort routines want to know this address */
913:
914: text_base = data;
915:
916: /* Create the array of pointers to lines, with a default size frequently enough. */
917:
918: lines = total / 50;
919: if (!lines) lines = 2;
920: linearray = (char **) xmalloc (lines * sizeof (char *));
921:
922: /* `nextline' points to the next free slot in this array.
923: `lines' is the allocated size. */
924:
925: nextline = linearray;
926:
927: /* Parse the input file's data, and make entries for the lines. */
928:
929: nextline = parsefile (infile, nextline, file_data, file_size);
930: if (nextline == 0)
931: {
932: error ("%s: not a texinfo index file", infile);
933: return;
934: }
935:
936: /* Sort the lines */
937:
938: /* If we have enough space, find the first keyfield of each line in advance.
939: Make a `struct lineinfo' for each line, which records the keyfield
940: as well as the line, and sort them. */
941:
942: lineinfo = (struct lineinfo *) malloc ((nextline - linearray) * sizeof (struct lineinfo));
943:
944: if (lineinfo)
945: {
946: struct lineinfo *lp;
947: char **p;
948:
949: for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
950: {
951: lp->text = *p;
952: lp->key.text = find_field (keyfields, *p, &lp->keylen);
953: if (keyfields->numeric)
954: lp->key.number = find_value (lp->key.text, lp->keylen);
955: }
956:
957: qsort (lineinfo, nextline - linearray, sizeof (struct lineinfo), compare_prepared);
958:
959: for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
960: *p = lp->text;
961:
962: free (lineinfo);
963: }
964: else
965: qsort (linearray, nextline - linearray, sizeof (char *), compare_full);
966:
967: /* Open the output file */
968:
969: if (outfile)
970: {
971: ostream = fopen (outfile, "w");
972: if (!ostream)
973: pfatal_with_name (outfile);
974: }
975:
976: writelines (linearray, nextline - linearray, ostream);
977: if (outfile) fclose (ostream);
978:
979: free (linearray);
980: free (data);
981: }
982:
983: /* Parse an input string in core into lines.
984: DATA is the input string, and SIZE is its length.
985: Data goes in LINEARRAY starting at NEXTLINE.
986: The value returned is the first entry in LINEARRAY still unused.
987: Value 0 means input file contents are invalid. */
988:
989: char **
990: parsefile (filename, nextline, data, size)
991: char *filename;
992: char **nextline;
993: char *data;
994: long size;
995: {
996: char *p, *end;
997: char **line = nextline;
998:
999: p = data;
1000: end = p + size;
1001: *end = 0;
1002:
1003: while (p != end)
1004: {
1005: if (p[0] != '\\')
1006: return 0;
1007:
1008: *line = p;
1009: while (*p && *p != '\n') p++;
1010: if (p != end) p++;
1011:
1012: line++;
1013: if (line == linearray + lines)
1014: {
1015: char **old = linearray;
1016: linearray = (char **) xrealloc (linearray, sizeof (char *) * (lines *= 4));
1017: line += linearray - old;
1018: }
1019: }
1020:
1021: return line;
1022: }
1023:
1024: /* Indexification is a filter applied to the sorted lines
1025: as they are being written to the output file.
1026: Multiple entries for the same name, with different page numbers,
1027: get combined into a single entry with multiple page numbers.
1028: The first braced field, which is used for sorting, is discarded.
1029: However, its first character is examined, folded to lower case,
1030: and if it is different from that in the previous line fed to us
1031: a \initial line is written with one argument, the new initial.
1032:
1033: If an entry has four braced fields, then the second and third
1034: constitute primary and secondary names.
1035: In this case, each change of primary name
1036: generates a \primary line which contains only the primary name,
1037: and in between these are \secondary lines which contain
1038: just a secondary name and page numbers.
1039: */
1040:
1041: /* The last primary name we wrote a \primary entry for.
1042: If only one level of indexing is being done, this is the last name seen */
1043: char *lastprimary;
1044: int lastprimarylength; /* Length of storage allocated for lastprimary */
1045:
1046: /* Similar, for the secondary name. */
1047: char *lastsecondary;
1048: int lastsecondarylength;
1049:
1050: /* Zero if we are not in the middle of writing an entry.
1051: One if we have written the beginning of an entry but have not
1052: yet written any page numbers into it.
1053: Greater than one if we have written the beginning of an entry
1054: plus at least one page number. */
1055: int pending;
1056:
1057: /* The initial (for sorting purposes) of the last primary entry written.
1058: When this changes, a \initial {c} line is written */
1059:
1060: char * lastinitial;
1061:
1062: int lastinitiallength;
1063:
1064: /* When we need a string of length 1 for the value of lastinitial,
1065: store it here. */
1066:
1067: char lastinitial1[2];
1068:
1069: /* Initialize static storage for writing an index */
1070:
1071: void
1072: init_index ()
1073: {
1074: pending = 0;
1075: lastinitial = lastinitial1;
1076: lastinitial1[0] = 0;
1077: lastinitial1[1] = 0;
1078: lastinitiallength = 0;
1079: lastprimarylength = 100;
1080: lastprimary = (char *) xmalloc (lastprimarylength + 1);
1081: bzero (lastprimary, lastprimarylength + 1);
1082: lastsecondarylength = 100;
1083: lastsecondary = (char *) xmalloc (lastsecondarylength + 1);
1084: bzero (lastsecondary, lastsecondarylength + 1);
1085: }
1086:
1087: /* Indexify. Merge entries for the same name,
1088: insert headers for each initial character, etc. */
1089:
1090: indexify (line, ostream)
1091: char *line;
1092: FILE *ostream;
1093: {
1094: char *primary, *secondary, *pagenumber;
1095: int primarylength, secondarylength, pagelength;
1096: int len = strlen (line);
1097: int nosecondary;
1098: int initiallength;
1099: char *initial;
1100: char initial1[2];
1101: register char *p;
1102:
1103: /* First, analyze the parts of the entry fed to us this time */
1104:
1105: p = find_braced_pos (line, 0, 0, 0);
1106: if (*p == '{')
1107: {
1108: initial = p;
1109: /* Get length of inner pair of braces starting at p,
1110: including that inner pair of braces. */
1111: initiallength = find_braced_end (p + 1) + 1 - p;
1112: }
1113: else
1114: {
1115: initial = initial1;
1116: initial1[0] = *p;
1117: initial1[1] = 0;
1118: initiallength = 1;
1119:
1120: if (initial1[0] >= 'a' && initial1[0] <= 'z')
1121: initial1[0] -= 040;
1122: }
1123:
1124: pagenumber = find_braced_pos (line, 1, 0, 0);
1125: pagelength = find_braced_end (pagenumber) - pagenumber;
1126: if (pagelength == 0)
1127: abort ();
1128:
1129: primary = find_braced_pos (line, 2, 0, 0);
1130: primarylength = find_braced_end (primary) - primary;
1131:
1132: secondary = find_braced_pos (line, 3, 0, 0);
1133: nosecondary = !*secondary;
1134: if (!nosecondary)
1135: secondarylength = find_braced_end (secondary) - secondary;
1136:
1137: /* If the primary is different from before, make a new primary entry */
1138: if (strncmp (primary, lastprimary, primarylength))
1139: {
1140: /* Close off current secondary entry first, if one is open */
1141: if (pending)
1142: {
1143: fputs ("}\n", ostream);
1144: pending = 0;
1145: }
1146:
1147: /* If this primary has a different initial, include an entry for the initial */
1148: if (initiallength != lastinitiallength ||
1149: strncmp (initial, lastinitial, initiallength))
1150: {
1151: fprintf (ostream, "\\initial {");
1152: fwrite (initial, 1, initiallength, ostream);
1153: fprintf (ostream, "}\n", initial);
1154: if (initial == initial1)
1155: {
1156: lastinitial = lastinitial1;
1157: *lastinitial1 = *initial1;
1158: }
1159: else
1160: {
1161: lastinitial = initial;
1162: }
1163: lastinitiallength = initiallength;
1164: }
1165:
1166: /* Make the entry for the primary. */
1167: if (nosecondary)
1168: fputs ("\\entry {", ostream);
1169: else
1170: fputs ("\\primary {", ostream);
1171: fwrite (primary, primarylength, 1, ostream);
1172: if (nosecondary)
1173: {
1174: fputs ("}{", ostream);
1175: pending = 1;
1176: }
1177: else
1178: fputs ("}\n", ostream);
1179:
1180: /* Record name of most recent primary */
1181: if (lastprimarylength < primarylength)
1182: {
1183: lastprimarylength = primarylength + 100;
1184: lastprimary = (char *) xrealloc (lastprimary,
1185: 1 + lastprimarylength);
1186: }
1187: strncpy (lastprimary, primary, primarylength);
1188: lastprimary[primarylength] = 0;
1189:
1190: /* There is no current secondary within this primary, now */
1191: lastsecondary[0] = 0;
1192: }
1193:
1194: /* Should not have an entry with no subtopic following one with a subtopic */
1195:
1196: if (nosecondary && *lastsecondary)
1197: error ("entry %s follows an entry with a secondary name", line);
1198:
1199: /* Start a new secondary entry if necessary */
1200: if (!nosecondary && strncmp (secondary, lastsecondary, secondarylength))
1201: {
1202: if (pending)
1203: {
1204: fputs ("}\n", ostream);
1205: pending = 0;
1206: }
1207:
1208: /* Write the entry for the secondary. */
1209: fputs ("\\secondary {", ostream);
1210: fwrite (secondary, secondarylength, 1, ostream);
1211: fputs ("}{", ostream);
1212: pending = 1;
1213:
1214: /* Record name of most recent secondary */
1215: if (lastsecondarylength < secondarylength)
1216: {
1217: lastsecondarylength = secondarylength + 100;
1218: lastsecondary = (char *) xrealloc (lastsecondary,
1219: 1 + lastsecondarylength);
1220: }
1221: strncpy (lastsecondary, secondary, secondarylength);
1222: lastsecondary[secondarylength] = 0;
1223: }
1224:
1225: /* Here to add one more page number to the current entry */
1226: if (pending++ != 1)
1227: fputs (", ", ostream); /* Punctuate first, if this is not the first */
1228: fwrite (pagenumber, pagelength, 1, ostream);
1229: }
1230:
1231: /* Close out any unfinished output entry */
1232:
1233: void
1234: finish_index (ostream)
1235: FILE *ostream;
1236: {
1237: if (pending)
1238: fputs ("}\n", ostream);
1239: free (lastprimary);
1240: free (lastsecondary);
1241: }
1242:
1243: /* Copy the lines in the sorted order.
1244: Each line is copied out of the input file it was found in. */
1245:
1246: void
1247: writelines (linearray, nlines, ostream)
1248: char **linearray;
1249: int nlines;
1250: FILE *ostream;
1251: {
1252: char **stop_line = linearray + nlines;
1253: char **next_line;
1254:
1255: init_index ();
1256:
1257: /* Output the text of the lines, and free the buffer space */
1258:
1259: for (next_line = linearray; next_line != stop_line; next_line++)
1260: {
1261: /* If -u was specified, output the line only if distinct from previous one. */
1262: if (next_line == linearray
1263: /* Compare previous line with this one, using only the explicitly specd keyfields */
1264: || compare_general (*(next_line - 1), *next_line, 0L, 0L, num_keyfields - 1))
1265: {
1266: char *p = *next_line;
1267: char c;
1268: while ((c = *p++) && c != '\n');
1269: *(p-1) = 0;
1270: indexify (*next_line, ostream);
1271: }
1272: }
1273:
1274: finish_index (ostream);
1275: }
1276:
1277: /* Assume (and optionally verify) that each input file is sorted;
1278: merge them and output the result.
1279: Returns nonzero if any input file fails to be sorted.
1280:
1281: This is the high-level interface that can handle an unlimited number of files. */
1282:
1283: #define MAX_DIRECT_MERGE 10
1284:
1285: int
1286: merge_files (infiles, nfiles, outfile)
1287: char **infiles;
1288: int nfiles;
1289: char *outfile;
1290: {
1291: char **tempfiles;
1292: int ntemps;
1293: int i;
1294: int value = 0;
1295: int start_tempcount = tempcount;
1296:
1297: if (nfiles <= MAX_DIRECT_MERGE)
1298: return merge_direct (infiles, nfiles, outfile);
1299:
1300: /* Merge groups of MAX_DIRECT_MERGE input files at a time,
1301: making a temporary file to hold each group's result. */
1302:
1303: ntemps = (nfiles + MAX_DIRECT_MERGE - 1) / MAX_DIRECT_MERGE;
1304: tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
1305: for (i = 0; i < ntemps; i++)
1306: {
1307: int nf = MAX_DIRECT_MERGE;
1308: if (i + 1 == ntemps)
1309: nf = nfiles - i * MAX_DIRECT_MERGE;
1310: tempfiles[i] = maketempname (++tempcount);
1311: value |= merge_direct (&infiles[i * MAX_DIRECT_MERGE], nf, tempfiles[i]);
1312: }
1313:
1314: /* All temporary files that existed before are no longer needed
1315: since their contents have been merged into our new tempfiles.
1316: So delete them. */
1317: flush_tempfiles (start_tempcount);
1318:
1319: /* Now merge the temporary files we created. */
1320:
1321: merge_files (tempfiles, ntemps, outfile);
1322:
1323: free (tempfiles);
1324:
1325: return value;
1326: }
1327:
1328: /* Assume (and optionally verify) that each input file is sorted;
1329: merge them and output the result.
1330: Returns nonzero if any input file fails to be sorted.
1331:
1332: This version of merging will not work if the number of
1333: input files gets too high. Higher level functions
1334: use it only with a bounded number of input files. */
1335:
1336: int
1337: merge_direct (infiles, nfiles, outfile)
1338: char **infiles;
1339: int nfiles;
1340: char *outfile;
1341: {
1342: char **ip = infiles;
1343: struct linebuffer *lb1, *lb2;
1344: struct linebuffer **thisline, **prevline;
1345: FILE **streams;
1346: int i;
1347: int nleft;
1348: int lossage = 0;
1349: int *file_lossage;
1350: struct linebuffer *prev_out = 0;
1351: FILE *ostream = stdout;
1352:
1353: if (outfile)
1354: {
1355: ostream = fopen (outfile, "w");
1356: }
1357: if (!ostream) pfatal_with_name (outfile);
1358:
1359: init_index ();
1360:
1361: if (nfiles == 0)
1362: {
1363: if (outfile)
1364: fclose (ostream);
1365: return 0;
1366: }
1367:
1368: /* For each file, make two line buffers.
1369: Also, for each file, there is an element of `thisline'
1370: which points at any time to one of the file's two buffers,
1371: and an element of `prevline' which points to the other buffer.
1372: `thisline' is supposed to point to the next available line from the file,
1373: while `prevline' holds the last file line used,
1374: which is remembered so that we can verify that the file is properly sorted. */
1375:
1376: /* lb1 and lb2 contain one buffer each per file */
1377: lb1 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1378: lb2 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1379:
1380: /* thisline[i] points to the linebuffer holding the next available line in file i,
1381: or is zero if there are no lines left in that file. */
1382: thisline = (struct linebuffer **) xmalloc (nfiles * sizeof (struct linebuffer *));
1383: /* prevline[i] points to the linebuffer holding the last used line from file i.
1384: This is just for verifying that file i is properly sorted. */
1385: prevline = (struct linebuffer **) xmalloc (nfiles * sizeof (struct linebuffer *));
1386: /* streams[i] holds the input stream for file i. */
1387: streams = (FILE **) xmalloc (nfiles * sizeof (FILE *));
1388: /* file_lossage[i] is nonzero if we already know file i is not properly sorted. */
1389: file_lossage = (int *) xmalloc (nfiles * sizeof (int));
1390:
1391: /* Allocate and initialize all that storage */
1392:
1393: for (i = 0; i < nfiles; i++)
1394: {
1395: initbuffer (&lb1[i]);
1396: initbuffer (&lb2[i]);
1397: thisline[i] = &lb1[i];
1398: prevline[i] = &lb2[i];
1399: file_lossage[i] = 0;
1400: streams[i] = fopen (infiles[i], "r");
1401: if (!streams[i])
1402: pfatal_with_name (infiles[i]);
1403:
1404: readline (thisline[i], streams[i]);
1405: }
1406:
1407: /* Keep count of number of files not at eof */
1408: nleft = nfiles;
1409:
1410: while (nleft)
1411: {
1412: struct linebuffer *best = 0;
1413: struct linebuffer *exch;
1414: int bestfile = -1;
1415: int i;
1416:
1417: /* Look at the next avail line of each file; choose the least one. */
1418:
1419: for (i = 0; i < nfiles; i++)
1420: {
1421: if (thisline[i] &&
1422: (!best ||
1423: 0 < compare_general (best->buffer, thisline[i]->buffer,
1424: (long) bestfile, (long) i, num_keyfields)))
1425: {
1426: best = thisline[i];
1427: bestfile = i;
1428: }
1429: }
1430:
1431: /* Output that line, unless it matches the previous one and we don't want duplicates */
1432:
1433: if (!(prev_out &&
1434: !compare_general (prev_out->buffer, best->buffer, 0L, 1L, num_keyfields - 1)))
1435: indexify (best->buffer, ostream);
1436: prev_out = best;
1437:
1438: /* Now make the line the previous of its file, and fetch a new line from that file */
1439:
1440: exch = prevline[bestfile];
1441: prevline[bestfile] = thisline[bestfile];
1442: thisline[bestfile] = exch;
1443:
1444: while (1)
1445: {
1446: /* If the file has no more, mark it empty */
1447:
1448: if (feof (streams[bestfile]))
1449: {
1450: thisline[bestfile] = 0;
1451: nleft--; /* Update the number of files still not empty */
1452: break;
1453: }
1454: readline (thisline[bestfile], streams[bestfile]);
1455: if (thisline[bestfile]->buffer[0] || !feof (streams[bestfile])) break;
1456: }
1457: }
1458:
1459: finish_index (ostream);
1460:
1461: /* Free all storage and close all input streams */
1462:
1463: for (i = 0; i < nfiles; i++)
1464: {
1465: fclose (streams[i]);
1466: free (lb1[i].buffer);
1467: free (lb2[i].buffer);
1468: }
1469: free (file_lossage);
1470: free (lb1);
1471: free (lb2);
1472: free (thisline);
1473: free (prevline);
1474: free (streams);
1475:
1476: if (outfile)
1477: fclose (ostream);
1478:
1479: return lossage;
1480: }
1481:
1482: /* Print error message and exit. */
1483:
1484: fatal (s1, s2)
1485: char *s1, *s2;
1486: {
1487: error (s1, s2);
1488: exit (EXIT_FATAL);
1489: }
1490:
1491: /* Print error message. `s1' is printf control string, `s2' is arg for it. */
1492:
1493: error (s1, s2)
1494: char *s1, *s2;
1495: {
1496: printf ("texindex: ");
1497: printf (s1, s2);
1498: printf ("\n");
1499: }
1500:
1501: perror_with_name (name)
1502: char *name;
1503: {
1504: #ifdef VMS
1505: #include <errno.h>
1506: extern noshare int sys_nerr;
1507: extern noshare char *sys_errlist[];
1508: #else
1509: extern int errno, sys_nerr;
1510: extern char *sys_errlist[];
1511: #endif
1512: char *s;
1513:
1514: if (errno < sys_nerr)
1515: s = concat ("", sys_errlist[errno], " for %s");
1516: else
1517: s = "cannot open %s";
1518: error (s, name);
1519: }
1520:
1521: pfatal_with_name (name)
1522: char *name;
1523: {
1524: extern int errno, sys_nerr;
1525: extern char *sys_errlist[];
1526: char *s;
1527:
1528: if (errno < sys_nerr)
1529: s = concat ("", sys_errlist[errno], " for %s");
1530: else
1531: s = "cannot open %s";
1532: fatal (s, name);
1533: }
1534:
1535: /* Return a newly-allocated string whose contents concatenate those of s1, s2, s3. */
1536:
1537: char *
1538: concat (s1, s2, s3)
1539: char *s1, *s2, *s3;
1540: {
1541: int len1 = strlen (s1), len2 = strlen (s2), len3 = strlen (s3);
1542: char *result = (char *) xmalloc (len1 + len2 + len3 + 1);
1543:
1544: strcpy (result, s1);
1545: strcpy (result + len1, s2);
1546: strcpy (result + len1 + len2, s3);
1547: *(result + len1 + len2 + len3) = 0;
1548:
1549: return result;
1550: }
1551:
1552: /* Like malloc but get fatal error if memory is exhausted. */
1553:
1554: int
1555: xmalloc (size)
1556: int size;
1557: {
1558: int result = malloc (size);
1559: if (!result)
1560: fatal ("virtual memory exhausted", 0);
1561: return result;
1562: }
1563:
1564:
1565: int
1566: xrealloc (ptr, size)
1567: char *ptr;
1568: int size;
1569: {
1570: int result = realloc (ptr, size);
1571: if (!result)
1572: fatal ("virtual memory exhausted");
1573: return result;
1574: }
1575:
1576: bzero (b, length)
1577: register char *b;
1578: register int length;
1579: {
1580: #ifdef VMS
1581: short zero = 0;
1582: long max_str = 65535;
1583:
1584: while (length > max_str) {
1585: (void) LIB$MOVC5 (&zero, &zero, &zero, &max_str, b);
1586: length -= max_str;
1587: b += max_str;
1588: }
1589: (void) LIB$MOVC5 (&zero, &zero, &zero, &length, b);
1590: #else
1591: while (length-- > 0)
1592: *b++ = 0;
1593: #endif /* not VMS */
1594: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.