|
|
#ifndef lint
/* SCCS identification string, kept in the object file for what(1). */
static char sccsid[] = "@(#)invert.c 2.6 3/5/87";
#endif /* not lint */
4: #
5: /* input: records of lines, separated by blank lines
6: output: key:file1 start/length ... start/length:file2 start/length ...
7: */
8:
9: # include "stdio.h"
10: # include "streams.h"
11: # include "bib.h"
/* isnull(x): true when the string x is empty (its first char is the NUL
   terminator).  Compares against '\0' rather than the pointer constant NULL. */
# define isnull(x) (*(x) == '\0')
/* makelow(c): map an upper-case ASCII letter to lower case; any other
   value is returned unchanged.  Note that c is evaluated more than once. */
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
14:
15: int max_kcnt = 100; /* max number of keys */
16: int max_klen = 6; /* max length of keys */
17: char *ignore = /* string of line starts to ignore */
18: "CNOPVX";
19: char *common = /* name of file of common words */
20: COMFILE;
21: char *INDEX= /* name of output file */
22: INDXFILE;
23:
24: char *bibtmpfile = /* name of temporary file */
25: INVTEMPFILE;
26:
27: int silent = 0; /* 0 => statistics printed */
28: /* 1 => no statisitics printed */
29:
30: char *sort_it =
31: "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
32: char sortcmd[maxstr];
33:
34: int argc;
35: char **argv;
36:
37: main(argcount,arglist)
38: int argcount;
39: char **arglist;
40: { char *filename;
41: FILE *input, *output;
42: long int start,length;
43: char word[maxstr];
44: int kcnt;
45: char tag_line[maxstr];
46:
47: long int records = 0; /* number of records read */
48: long int keys = 0; /* number of keys read (occurences) */
49: long int distinct; /* number of distinct keys */
50: long int shorten();
51:
52: strcpy(COMFILE, N_COMFILE);
53: strcpy(BMACLIB, N_BMACLIB);
54:
55: argc= argcount-1;
56: argv= arglist+1;
57: mktemp(bibtmpfile);
58: output= fopen(bibtmpfile,"w");
59:
60: for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
61: { /* open input file */
62: filename= *argv;
63: input= fopen(filename,"r");
64: if (input==NULL)
65: { fprintf(stderr, "invert: error in open of %s\n", filename);
66: continue;
67: }
68: start= 0L;
69: length= 0L;
70:
71: for(;;) /* each record */
72: { /* find start of next record (exit if none) */
73: start= nextrecord(input,start+length);
74: if (start==EOF) break;
75: records++;
76: kcnt= 0;
77: length= recsize(input,start);
78: sprintf(tag_line, " %s %d %d\n", filename, start, length);
79:
80: while (ftell(input) < start+length && kcnt < max_kcnt)
81: { getword(input,word,ignore);
82: makekey(word,max_klen,common);
83: if (!isnull(word))
84: { fputs(word,output); fputs(tag_line,output);
85: kcnt++; keys++;
86: }
87: }
88: }
89: fclose(input);
90: }
91: fclose(output);
92:
93: sprintf(sortcmd, sort_it, bibtmpfile, bibtmpfile);
94: system(sortcmd);
95:
96: distinct = shorten(bibtmpfile,INDEX);
97: if( silent == 0 )
98: fprintf(stderr,
99: "%d documents %d distinct keys %d key occurrences\n",
100: records, distinct, keys);
101: exit(0);
102: }
103:
104:
105:
106: /* Flag Meaning Default
107: -ki Keys per record 100
108: -li max Length of keys 6
109: -%str ignore lines that begin with %x CNOPVX
110: where x is in str
111: str is a seq of chars
112: -cfile file contains Common words /usr/new/lib/bib/common
113: do not use common words as keys
114: -pfile name of output file INDEX
115: -s do not print statistics statistics printed
116: */
117:
118: # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
119:
120: flags()
121: {
122: char *p;
123: for (; argc>0 && *argv[0]=='-'; argc--,argv++)
124: { switch ((*argv)[1])
125: { case 'k': max_kcnt= atoi(operand);
126: break;
127: case 'l': max_klen= atoi(operand);
128: break;
129: case 'c': common= operand;
130: break;
131: case '%': ignore= *argv+2;
132: break;
133: case 'p': INDEX= operand;
134: break;
135: case 's': silent= 1;
136: break;
137: case 'd':
138: p = &argv[0][2];
139: if (!p) {
140: argv++;
141: p = &argv[0][0];
142: }
143: strreplace(COMFILE, BMACLIB, p);
144: strcpy(BMACLIB, p);
145: break;
146: default: fprintf(stderr, "unknown flag '%s'\n", *argv);
147: }
148: }
149: }
150:
151:
152: /* shorten(inf,outf): file "inf" consists of lines of the form:
153: key file start length
154: sorted by key and file. replace lines with the same key
155: with one line of the form:
156: key:file1 start/length ... start/length:file2 start/length ...
157: rename as file "outf"
158: returns number of lines in output
159: */
160: long shorten(inf,outf)
161: char *inf, *outf;
162: { FILE *in, *out;
163: char line[maxstr];
164: char key[maxstr], newkey[maxstr],
165: file[maxstr], newfile[maxstr];
166: long int start, length;
167: long int lines = 0;
168:
169: in= fopen(inf, "r");
170: out= fopen(outf, "w");
171: if (in==NULL || out==NULL)
172: { fprintf(stderr, "invert: error in opening file for compression\n");
173: return(0);
174: }
175:
176: getline(in,line);
177: sscanf(line,"%s%s%d%d", key, file, &start, &length);
178: fprintf(out, "%s :%s %d/%d", key, file, start, length);
179: for ( getline(in, line) ; !feof(in); getline(in, line))
180: { sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length);
181: if (strcmp(key,newkey)!=0)
182: { strcpy(key, newkey);
183: strcpy(file, newfile);
184: fprintf(out, "\n%s :%s %d/%d", key, file, start, length);
185: lines++;
186: }
187: else if (strcmp(file,newfile)!=0)
188: { strcpy(file,newfile);
189: fprintf(out, ":%s %d/%d", file, start, length);
190: }
191: else
192: fprintf(out, " %d/%d", start, length);
193: }
194: fprintf(out, "\n");
195: lines++;
196:
197: fclose(in); fclose(out);
198: unlink(inf);
199: return (lines);
200: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.