|
|
1.1 root 1: /*
2: * pinvert - create inverted index to a bibliographic database
3: * input: records of lines, separated by blank lines
4: * output: key:file1 start/length ... start/length:file2 start/length ...
5: */
6:
7: #include <signal.h>
8: #ifdef BSD
9: #include <strings.h>
10: #else
11: #include <string.h>
12: #endif
13: #include "stdio.h"
14: #include "streams.h"
15: #include "bib.h"
16:
/* isnull(x): true when the string x is empty.  Compares against the
   character '\0' rather than the pointer constant NULL. */
#define isnull(x) (*(x) == '\0')
/* makelow(c): c folded to lower case when it is an ASCII upper-case
   letter, otherwise c unchanged.  c is evaluated more than once. */
#define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
#define INVTEMPFILE "/tmp/invertXXXXXX" /* tmp index */
#define HEADTEMPFILE "/tmp/headXXXXXX" /* tmp header */
21:
22: char *strrchr(), *mktemp();
23:
24: int max_kcnt = 100; /* max number of keys */
25: int max_klen = MAXKLEN; /* max length of keys */
26: char *plib = PLIB; /* library file */
27: char *common; /* name of file of common words */
28: char *ignore; /* name of file of %xxx to ignore */
29: char *tempfile = INVTEMPFILE; /* name of temporary file */
30: char *tmphead = HEADTEMPFILE; /* name of temporary file for header */
31: char *DINPUT = BIBFILE; /* default input file */
32: char INDEX[maxstr]; /* output index file */
33: char HEAD[maxstr]; /* output header file */
34: int silent = 1; /* 0 => statistics printed */
35:
36: char mvcmd[maxstr];
37: char path_ign[maxstr];
38:
39: FILE *input, *output, *head;
40: char *sort_it = "sort -u %s -o %s";
41: char sortcmd[maxstr];
42:
43: char *libpath();
44: int cleanup();
45:
46: int argc;
47: char **argv;
48:
49: #define USAGE "pinvert [-c common -i ignore -k nkeys -l length -p database -v] [file...]\n"
50:
51: main(argcount,arglist)
52: int argcount;
53: char **arglist;
54: {
55: char *filename;
56: long int start,length;
57: char word[maxstr];
58: int kcnt;
59: char tag_line[maxstr];
60: char outstring[maxstr];
61: char lenstring[5];
62: char *f;
63: long int records = 0; /* number of records read */
64: long int keys = 0; /* number of keys read (occurences) */
65: long int distinct; /* number of distinct keys */
66: long int shorten();
67: int first = 1;
68: int stat;
69:
70: /* initialize and open files */
71: argc= argcount-1;
72: argv= arglist+1;
73: mktemp(tempfile);
74: mktemp(tmphead);
75: if(signal(SIGINT, SIG_IGN) != SIG_IGN)
76: signal(SIGINT, cleanup);
77: signal(SIGQUIT, cleanup);
78: output= fopen(tempfile,"w");
79: head = fopen(tmphead,"w");
80:
81: /* make path names */
82: common = COMFILE;
83: strcpy(path_ign,libpath(plib));
84: strcat(path_ign,"/");
85: strcat(path_ign,IGNFILE);
86: ignore = path_ign;
87: INDEX[0] = NULL;
88: HEAD[0] = NULL;
89:
90: flags();
91: if(load_ign(ignore) == -1) {
92: baleout();
93: }
94:
95: /* write out name of common file and max_klen to header */
96: fprintf(head,"%s %d\n",common,max_klen);
97:
98: /* now index input files */
99: for (; argc>=0 ; argc--, argv++) {
100: if(argc == 0) {
101: if(!first)
102: break;
103: else
104: filename = DINPUT;
105: }
106: else {
107: filename= *argv;
108: }
109:
110: first = 0;
111: input = fopen(filename,"r");
112: if (input==NULL) {
113: fprintf(stderr, "pinvert: cannot open %s\n",
114: filename);
115: baleout();
116: }
117: if(INDEX[0] == NULL) {
118: strcpy(INDEX,filename);
119: strcat(INDEX,".i");
120: strcpy(HEAD,filename);
121: strcat(HEAD,".h");
122: }
123: start= 0L;
124: length= 0L;
125:
126: /* write out file name to header */
127: if((f=strrchr(filename,'/')) != NULL)
128: f++;
129: else
130: f = filename;
131: fprintf(head,"%s\n",f);
132:
133: sprintf(lenstring,"%d",max_klen);
134: strcpy(outstring,"%-");
135: strcat(outstring,lenstring);
136: strcat(outstring,"s%s");
137: for(;;) {
138: /* find start of next record (exit if none) */
139: start= nextrecord(input,start+length);
140: if (start==EOF) break;
141: records++;
142: kcnt= 0;
143: length= recsize(input,start);
144: sprintf(tag_line, " %-18s %08ld %08ld\n", f, start, length);
145:
146: while (ftell(input) < start+length && kcnt < max_kcnt) {
147: getword(input,word);
148: makekey(word,max_klen,common);
149: if (!isnull(word)) {
150: fprintf(output,outstring,word,tag_line);
151: kcnt++;
152: keys++;
153: }
154: }
155: }
156: fclose(input);
157: }
158: fclose(output);
159: fclose(head);
160:
161: sprintf(sortcmd, sort_it, tempfile, tempfile);
162: system(sortcmd);
163:
164: distinct = shorten(tempfile,INDEX);
165: sprintf(mvcmd,"cp %s %s\n",tmphead,HEAD);
166: if(stat = system(mvcmd)) {
167: unlink(tmphead);
168: exit(stat);
169: }
170: else {
171: unlink(tmphead);
172: }
173: if( silent == 0 )
174: fprintf(stderr,
175: "%ld documents %ld distinct keys %ld key occurrences\n",
176: records, distinct, keys);
177: exit(0);
178: }
179:
180:
181: baleout()
182: {
183: unlink(tempfile);
184: unlink(tmphead);
185: exit(1);
186: }
187:
188: /* Flag Meaning Default
189: -ki Keys per record 100
190: -li max Length of keys 6
191: -cfile file contains Common words /usr/lib/prefer/common
192: do not use common words as keys
193: -ifile %xxx lines in input file to ignore /usr/lib/prefer/ignore
194: -pname base name of output files (name.i, name.h) first input file name
195: -v print statistics statistics not printed
196: */
197:
198: # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
199:
200: flags()
201: {
202: char *tmp;
203: for (; argc>0 && *argv[0]=='-'; argc--,argv++) {
204: switch ((*argv)[1]) {
205: case 'k':
206: max_kcnt= atoi(operand);
207: break;
208: case 'l':
209: max_klen= atoi(operand);
210: break;
211: case 'c':
212: common= operand;
213: break;
214: case 'i':
215: ignore= operand;
216: break;
217: case 'p':
218: tmp = operand;
219: strcpy(INDEX,tmp);
220: strcat(INDEX,".i");
221: strcpy(HEAD,tmp);
222: strcat(HEAD,".h");
223: break;
224: case 'v':
225: silent= 0;
226: break;
227: default:
228: fprintf(stderr,USAGE);
229: baleout();
230: }
231: }
232: }
233:
234:
235: /* shorten(inf,outf): file "inf" consists of lines of the form:
236: key file start length
237: sorted by key and file. replace lines with the same key
238: with one line of the form:
239: key:file1 start/length ... start/length:file2 start/length ...
240: rename as file "outf"
241: returns number of lines in output
242: */
243: long shorten(inf,outf)
244: char *inf, *outf;
245: {
246: FILE *in, *out;
247: char line[maxstr];
248: char key[maxstr], newkey[maxstr],
249: file[maxstr], newfile[maxstr];
250: long int start, length;
251: long int lines = 0;
252:
253: in= fopen(inf, "r");
254: out= fopen(outf, "w");
255: if (in==NULL || out==NULL) {
256: fprintf(stderr, "pinvert: error in opening file for compression\n");
257: return(0);
258: }
259:
260: getline(in,line);
261: sscanf(line,"%s%s%ld%ld", key, file, &start, &length);
262: fprintf(out, "%s :%s %ld/%ld", key, file, start, length);
263: for ( getline(in, line) ; !feof(in); getline(in, line)) {
264: sscanf(line,"%s%s%ld%ld", newkey, newfile, &start, &length);
265: if (strcmp(key,newkey)!=0) {
266: strcpy(key, newkey);
267: strcpy(file, newfile);
268: fprintf(out, "\n%s :%s %ld/%ld", key, file, start, length);
269: lines++;
270: }
271: else if (strcmp(file,newfile)!=0) {
272: strcpy(file,newfile);
273: fprintf(out, ":%s %ld/%ld", file, start, length);
274: }
275: else
276: fprintf(out, " %ld/%ld", start, length);
277: }
278: fprintf(out, "\n");
279: lines++;
280:
281: fclose(in);
282: fclose(out);
283: unlink(inf);
284: return (lines);
285: }
286:
287: cleanup()
288: {
289: signal(SIGINT, SIG_IGN);
290: signal(SIGQUIT, SIG_IGN);
291: fclose(output);
292: fclose(head);
293: unlink(tempfile);
294: unlink(tmphead);
295: exit(1);
296: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.