|
|
1.1 root 1: /*
2: * Copyright (c) 1989 The Regents of the University of California.
3: * All rights reserved.
4: *
5: * This code is derived from software contributed to Berkeley by
6: * Ken Arnold.
7: *
8: * Redistribution and use in source and binary forms are permitted
9: * provided that the above copyright notice and this paragraph are
10: * duplicated in all such forms and that any documentation,
11: * advertising materials, and other materials related to such
12: * distribution and use acknowledge that the software was developed
13: * by the University of California, Berkeley. The name of the
14: * University may not be used to endorse or promote products derived
15: * from this software without specific prior written permission.
16: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
17: * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
18: * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
19: */
20:
21: #ifndef lint
22: char copyright[] =
23: "@(#) Copyright (c) 1989 The Regents of the University of California.\n\
24: All rights reserved.\n";
25: #endif /* not lint */
26:
27: #ifndef lint
28: static char sccsid[] = "@(#)strfile.c 5.11 (Berkeley) 12/15/89";
29: #endif /* not lint */
30:
31: # include <machine/endian.h>
32: # include <sys/param.h>
33: # include <stdio.h>
34: # include <ctype.h>
35: # include "strfile.h"
36:
37: # ifndef MAXPATHLEN
38: # define MAXPATHLEN 1024
39: # endif /* MAXPATHLEN */
40:
41: /*
42: * This program takes a file composed of strings separated by lines
43: * consisting of a single delimiting character (default character is
44: * '%') and creates another file which consists of a table describing
45: * the file (structure from "strfile.h") followed by a table of seek
46: * pointers to the start of each string in the source file.
47: * Usage:
48: *
49: * % strfile [-iorsx] [ -cC ] sourcefile [ datafile ]
50: *
51: * c - Change delimiting character from '%' to 'C'
52: * s - Silent. Give no summary of data processed at the end of
53: * the run.
54: * o - order the strings in alphabetic order
55: * i - if ordering, ignore case
56: * r - randomize the order of the strings
57: * x - set rotated bit
58: *
59: * Ken Arnold Sept. 7, 1978 --
60: *
61: * Added ordering options.
62: */
63:
# define	TRUE	1
# define	FALSE	0

/* true when the seek pointers are kept in memory instead of streamed out */
# define	STORING_PTRS	(Oflag || Rflag)
/* allocation granularity, in elements, used by ALLOC() */
# define	CHUNKSIZE	512

/*
 * ALWAYS is a true value that lint does not see as a constant.
 */
#ifdef lint
# define	ALWAYS	atoi("1")
#else
# define	ALWAYS	1
#endif

/*
 * ALLOC:
 *	Make room in the array "ptr" for an entry count of "sz".  The
 *	first call (ptr == NULL) allocates CHUNKSIZE elements; after
 *	that the array is grown to (sz) + CHUNKSIZE elements whenever
 *	(sz) + 1 is a multiple of CHUNKSIZE, so it must be invoked for
 *	consecutive values of "sz".  Running out of memory is fatal.
 *	The do/while wrapper makes the macro a single statement that is
 *	safe in an unbraced if/else and needs its trailing semicolon.
 */
# define	ALLOC(ptr,sz)	do { \
			if ((ptr) == NULL) \
				(ptr) = malloc((size_t) CHUNKSIZE * sizeof *(ptr)); \
			else if (((sz) + 1) % CHUNKSIZE == 0) \
				(ptr) = realloc((void *) (ptr), \
				    ((size_t) (sz) + CHUNKSIZE) * sizeof *(ptr)); \
			if ((ptr) == NULL) { \
				fprintf(stderr, "out of space\n"); \
				exit(1); \
			} \
		} while (!ALWAYS)

/* pre-ANSI compilers without "void" */
#ifdef NO_VOID
# define	void	char
#endif
89:
/*
 * Sort key kept for each string when ordering is requested.
 */
typedef struct {
	char	first;		/* leading alphanumeric character of the string */
	off_t	pos;		/* seek pointer recorded for the string */
} STR;
94:
95: char *Infile = NULL, /* input file name */
96: Outfile[MAXPATHLEN] = "", /* output file name */
97: Delimch = '%'; /* delimiting character */
98:
99: int Sflag = FALSE; /* silent run flag */
100: int Oflag = FALSE; /* ordering flag */
101: int Iflag = FALSE; /* ignore case flag */
102: int Rflag = FALSE; /* randomize order flag */
103: int Xflag = FALSE; /* set rotated bit */
104: long Num_pts = 0; /* number of pointers/strings */
105:
106: off_t *Seekpts;
107:
108: FILE *Sort_1, *Sort_2; /* pointers for sorting */
109:
110: STRFILE Tbl; /* statistics table */
111:
112: STR *Firstch; /* first chars of each string */
113:
114: char *fgets(), *strcpy(), *strcat();
115:
116: void *malloc(), *realloc();
117:
118: /*
119: * main:
120: * Drive the sucker. There are two main modes -- either we store
121: * the seek pointers, if the table is to be sorted or randomized,
122: * or we write the pointer directly to the file, if we are to stay
123: * in file order. If the former, we allocate and re-allocate in
124: * CHUNKSIZE blocks; if the latter, we just write each pointer,
125: * and then seek back to the beginning to write in the table.
126: */
127: main(ac, av)
128: int ac;
129: char **av;
130: {
131: register char *sp, dc;
132: register FILE *inf, *outf;
133: register off_t last_off, length, pos, *p;
134: register int first, cnt;
135: register char *nsp;
136: register STR *fp;
137: static char string[257];
138:
139: getargs(ac, av); /* evalute arguments */
140: dc = Delimch;
141: if ((inf = fopen(Infile, "r")) == NULL) {
142: perror(Infile);
143: exit(1);
144: }
145:
146: if ((outf = fopen(Outfile, "w")) == NULL) {
147: perror(Outfile);
148: exit(1);
149: }
150: if (!STORING_PTRS)
151: (void) fseek(outf, sizeof Tbl, 0);
152:
153: /*
154: * Write the strings onto the file
155: */
156:
157: Tbl.str_longlen = 0;
158: Tbl.str_shortlen = (unsigned int) 0xffffffff;
159: Tbl.str_delim = dc;
160: Tbl.str_version = VERSION;
161: first = Oflag;
162: add_offset(outf, ftell(inf));
163: last_off = 0;
164: do {
165: sp = fgets(string, 256, inf);
166: if (sp == NULL || sp[0] == dc && sp[1] == '\n') {
167: pos = ftell(inf);
168: length = pos - last_off - (sp ? strlen(sp) : 0);
169: last_off = pos;
170: if (!length)
171: continue;
172: add_offset(outf, pos);
173: if (Tbl.str_longlen < length)
174: Tbl.str_longlen = length;
175: if (Tbl.str_shortlen > length)
176: Tbl.str_shortlen = length;
177: first = Oflag;
178: }
179: else if (first) {
180: for (nsp = sp; !isalnum(*nsp); nsp++)
181: continue;
182: ALLOC(Firstch, Num_pts);
183: fp = &Firstch[Num_pts - 1];
184: if (Iflag && isupper(*nsp))
185: fp->first = tolower(*nsp);
186: else
187: fp->first = *nsp;
188: fp->pos = Seekpts[Num_pts - 1];
189: first = FALSE;
190: }
191: } while (sp != NULL);
192:
193: /*
194: * write the tables in
195: */
196:
197: (void) fclose(inf);
198:
199: if (Oflag)
200: do_order();
201: else if (Rflag)
202: randomize();
203:
204: if (Xflag)
205: Tbl.str_flags |= STR_ROTATED;
206:
207: if (!Sflag) {
208: printf("\"%s\" created\n", Outfile);
209: if (Num_pts == 2)
210: puts("There was 1 string");
211: else
212: printf("There were %d strings\n", Num_pts - 1);
213: printf("Longest string: %lu byte%s\n", Tbl.str_longlen,
214: Tbl.str_longlen == 1 ? "" : "s");
215: printf("Shortest string: %lu byte%s\n", Tbl.str_shortlen,
216: Tbl.str_shortlen == 1 ? "" : "s");
217: }
218:
219: (void) fseek(outf, (off_t) 0, 0);
220: Tbl.str_version = htonl(Tbl.str_version);
221: Tbl.str_numstr = htonl(Num_pts - 1);
222: Tbl.str_longlen = htonl(Tbl.str_longlen);
223: Tbl.str_shortlen = htonl(Tbl.str_shortlen);
224: Tbl.str_flags = htonl(Tbl.str_flags);
225: (void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf);
226: if (STORING_PTRS) {
227: for (p = Seekpts, cnt = Num_pts; cnt--; ++p)
228: *p = htonl(*p);
229: (void) fwrite((char *) Seekpts, sizeof *Seekpts, (int) Num_pts, outf);
230: }
231: (void) fclose(outf);
232: exit(0);
233: }
234:
235: /*
236: * This routine evaluates arguments from the command line
237: */
238: getargs(argc, argv)
239: int argc;
240: char **argv;
241: {
242: extern char *optarg;
243: extern int optind;
244: int ch;
245:
246: while ((ch = getopt(argc, argv, "c:iorsx")) != EOF)
247: switch(ch) {
248: case 'c': /* new delimiting char */
249: Delimch = *optarg;
250: if (!isascii(Delimch)) {
251: printf("bad delimiting character: '\\%o\n'",
252: Delimch);
253: }
254: break;
255: case 'i': /* ignore case in ordering */
256: Iflag++;
257: break;
258: case 'o': /* order strings */
259: Oflag++;
260: break;
261: case 'r': /* randomize pointers */
262: Rflag++;
263: break;
264: case 's': /* silent */
265: Sflag++;
266: break;
267: case 'x': /* set the rotated bit */
268: Xflag++;
269: break;
270: case '?':
271: default:
272: usage();
273: }
274: argv += optind;
275:
276: if (*argv) {
277: Infile = *argv;
278: if (*++argv)
279: (void) strcpy(Outfile, *argv);
280: }
281: if (!Infile) {
282: puts("No input file name");
283: usage();
284: }
285: if (*Outfile == '\0') {
286: (void) strcpy(Outfile, Infile);
287: (void) strcat(Outfile, ".dat");
288: }
289: }
290:
/*
 * usage:
 *	Print the command synopsis on the standard error output and
 *	exit with status 1.
 */
int
usage()
{
	static char	synopsis[] =
	    "strfile [-iorsx] [-c char] sourcefile [datafile]\n";

	(void) fputs(synopsis, stderr);
	exit(1);
}
297:
298: /*
299: * add_offset:
300: * Add an offset to the list, or write it out, as appropriate.
301: */
302: add_offset(fp, off)
303: FILE *fp;
304: off_t off;
305: {
306: off_t net;
307:
308: if (!STORING_PTRS) {
309: net = htonl(off);
310: fwrite(&net, 1, sizeof net, fp);
311: } else {
312: ALLOC(Seekpts, Num_pts + 1);
313: Seekpts[Num_pts] = off;
314: }
315: Num_pts++;
316: }
317:
318: /*
319: * do_order:
320: * Order the strings alphabetically (possibly ignoring case).
321: */
322: do_order()
323: {
324: register int i;
325: register off_t *lp;
326: register STR *fp;
327: extern int cmp_str();
328:
329: Sort_1 = fopen(Infile, "r");
330: Sort_2 = fopen(Infile, "r");
331: qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str);
332: i = Tbl.str_numstr;
333: lp = Seekpts;
334: fp = Firstch;
335: while (i--)
336: *lp++ = fp++->pos;
337: (void) fclose(Sort_1);
338: (void) fclose(Sort_2);
339: Tbl.str_flags |= STR_ORDERED;
340: }
341:
/*
 * unctrl:
 *	Return a printable form of the character "c": the character
 *	itself if it is printable, "^?" for DEL (0177), and '^' followed
 *	by c + 'A' - 1 for anything else (so 001 becomes "^A").  The
 *	result is held in a static buffer that the next call overwrites.
 */
char *
unctrl(c)
char	c;
{
	static char	buf[3];

	/* ctype routines need an unsigned char value, not a plain char */
	if (isprint((unsigned char) c)) {
		buf[0] = c;
		buf[1] = '\0';
	}
	else {
		buf[0] = '^';
		buf[1] = (c == 0177) ? '?' : c + 'A' - 1;
		buf[2] = '\0';
	}
	return buf;
}
366:
367: cmp_str(p1, p2)
368: STR *p1, *p2;
369: {
370: register int c1, c2;
371: register int n1, n2;
372:
373: # define SET_N(nf,ch) (nf = (ch == '\n'))
374: # define IS_END(ch,nf) (ch == Delimch && nf)
375:
376: c1 = p1->first;
377: c2 = p2->first;
378: if (c1 != c2)
379: return c1 - c2;
380:
381: (void) fseek(Sort_1, p1->pos, 0);
382: (void) fseek(Sort_2, p2->pos, 0);
383:
384: n1 = FALSE;
385: n2 = FALSE;
386: while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0')
387: SET_N(n1, c1);
388: while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0')
389: SET_N(n2, c2);
390:
391: while (!IS_END(c1, n1) && !IS_END(c2, n2)) {
392: if (Iflag) {
393: if (isupper(c1))
394: c1 = tolower(c1);
395: if (isupper(c2))
396: c2 = tolower(c2);
397: }
398: if (c1 != c2)
399: return c1 - c2;
400: SET_N(n1, c1);
401: SET_N(n2, c2);
402: c1 = getc(Sort_1);
403: c2 = getc(Sort_2);
404: }
405: if (IS_END(c1, n1))
406: c1 = 0;
407: if (IS_END(c2, n2))
408: c2 = 0;
409: return c1 - c2;
410: }
411:
412: /*
413: * randomize:
414: * Randomize the order of the string table. We must be careful
415: * not to randomize across delimiter boundaries. All
416: * randomization is done within each block.
417: */
418: randomize()
419: {
420: register int cnt, i;
421: register off_t tmp;
422: register off_t *sp;
423: extern time_t time();
424:
425: srandom((int)(time((time_t *) NULL) + getpid()));
426:
427: Tbl.str_flags |= STR_RANDOM;
428: cnt = Tbl.str_numstr;
429:
430: /*
431: * move things around randomly
432: */
433:
434: for (sp = Seekpts; cnt > 0; cnt--, sp++) {
435: i = random() % cnt;
436: tmp = sp[0];
437: sp[0] = sp[i];
438: sp[i] = tmp;
439: }
440: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.