|
|
1.1 ! root 1: /* ! 2: * pinvert - create inverted index to a bibliographic database ! 3: * input: records of lines, separated by blank lines ! 4: * output: key:file1 start/length ... start/length:file2 start/length ... ! 5: */ ! 6: ! 7: #include <signal.h> ! 8: #ifdef BSD ! 9: #include <strings.h> ! 10: #else ! 11: #include <string.h> ! 12: #endif ! 13: #include "stdio.h" ! 14: #include "streams.h" ! 15: #include "bib.h" ! 16: ! 17: #define isnull(x) (*(x) == NULL) ! 18: #define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : c) ! 19: #define INVTEMPFILE "/tmp/invertXXXXXX" /* tmp index */ ! 20: #define HEADTEMPFILE "/tmp/headXXXXXX" /* tmp header */ ! 21: ! 22: char *strrchr(), *mktemp(); ! 23: ! 24: int max_kcnt = 100; /* max number of keys */ ! 25: int max_klen = MAXKLEN; /* max length of keys */ ! 26: char *plib = PLIB; /* library file */ ! 27: char *common; /* name of file of common words */ ! 28: char *ignore; /* name of file of %xxx to ignore */ ! 29: char *tempfile = INVTEMPFILE; /* name of temporary file */ ! 30: char *tmphead = HEADTEMPFILE; /* name of temporary file for header */ ! 31: char *DINPUT = BIBFILE; /* default input file */ ! 32: char INDEX[maxstr]; /* output index file */ ! 33: char HEAD[maxstr]; /* output header file */ ! 34: int silent = 1; /* 0 => statistics printed */ ! 35: ! 36: char mvcmd[maxstr]; ! 37: char path_ign[maxstr]; ! 38: ! 39: FILE *input, *output, *head; ! 40: char *sort_it = "sort -u %s -o %s"; ! 41: char sortcmd[maxstr]; ! 42: ! 43: char *libpath(); ! 44: int cleanup(); ! 45: ! 46: int argc; ! 47: char **argv; ! 48: ! 49: #define USAGE "pinvert [-c common -i ignore -k nkeys -l length -p database -v] [file...]\n" ! 50: ! 51: main(argcount,arglist) ! 52: int argcount; ! 53: char **arglist; ! 54: { ! 55: char *filename; ! 56: long int start,length; ! 57: char word[maxstr]; ! 58: int kcnt; ! 59: char tag_line[maxstr]; ! 60: char outstring[maxstr]; ! 61: char lenstring[5]; ! 62: char *f; ! 63: long int records = 0; /* number of records read */ ! 64: long int keys = 0; /* number of keys read (occurences) */ ! 65: long int distinct; /* number of distinct keys */ ! 66: long int shorten(); ! 67: int first = 1; ! 68: int stat; ! 69: ! 70: /* initialize and open files */ ! 71: argc= argcount-1; ! 72: argv= arglist+1; ! 73: mktemp(tempfile); ! 74: mktemp(tmphead); ! 75: if(signal(SIGINT, SIG_IGN) != SIG_IGN) ! 76: signal(SIGINT, cleanup); ! 77: signal(SIGQUIT, cleanup); ! 78: output= fopen(tempfile,"w"); ! 79: head = fopen(tmphead,"w"); ! 80: ! 81: /* make path names */ ! 82: common = COMFILE; ! 83: strcpy(path_ign,libpath(plib)); ! 84: strcat(path_ign,"/"); ! 85: strcat(path_ign,IGNFILE); ! 86: ignore = path_ign; ! 87: INDEX[0] = NULL; ! 88: HEAD[0] = NULL; ! 89: ! 90: flags(); ! 91: if(load_ign(ignore) == -1) { ! 92: baleout(); ! 93: } ! 94: ! 95: /* write out name of common file and max_klen to header */ ! 96: fprintf(head,"%s %d\n",common,max_klen); ! 97: ! 98: /* now index input files */ ! 99: for (; argc>=0 ; argc--, argv++) { ! 100: if(argc == 0) { ! 101: if(!first) ! 102: break; ! 103: else ! 104: filename = DINPUT; ! 105: } ! 106: else { ! 107: filename= *argv; ! 108: } ! 109: ! 110: first = 0; ! 111: input = fopen(filename,"r"); ! 112: if (input==NULL) { ! 113: fprintf(stderr, "pinvert: cannot open %s\n", ! 114: filename); ! 115: baleout(); ! 116: } ! 117: if(INDEX[0] == NULL) { ! 118: strcpy(INDEX,filename); ! 119: strcat(INDEX,".i"); ! 120: strcpy(HEAD,filename); ! 121: strcat(HEAD,".h"); ! 122: } ! 123: start= 0L; ! 124: length= 0L; ! 125: ! 126: /* write out file name to header */ ! 127: if((f=strrchr(filename,'/')) != NULL) ! 128: f++; ! 129: else ! 130: f = filename; ! 131: fprintf(head,"%s\n",f); ! 132: ! 133: sprintf(lenstring,"%d",max_klen); ! 134: strcpy(outstring,"%-"); ! 135: strcat(outstring,lenstring); ! 136: strcat(outstring,"s%s"); ! 137: for(;;) { ! 138: /* find start of next record (exit if none) */ ! 139: start= nextrecord(input,start+length); ! 140: if (start==EOF) break; ! 141: records++; ! 142: kcnt= 0; ! 143: length= recsize(input,start); ! 144: sprintf(tag_line, " %-18s %08ld %08ld\n", f, start, length); ! 145: ! 146: while (ftell(input) < start+length && kcnt < max_kcnt) { ! 147: getword(input,word); ! 148: makekey(word,max_klen,common); ! 149: if (!isnull(word)) { ! 150: fprintf(output,outstring,word,tag_line); ! 151: kcnt++; ! 152: keys++; ! 153: } ! 154: } ! 155: } ! 156: fclose(input); ! 157: } ! 158: fclose(output); ! 159: fclose(head); ! 160: ! 161: sprintf(sortcmd, sort_it, tempfile, tempfile); ! 162: system(sortcmd); ! 163: ! 164: distinct = shorten(tempfile,INDEX); ! 165: sprintf(mvcmd,"cp %s %s\n",tmphead,HEAD); ! 166: if(stat = system(mvcmd)) { ! 167: unlink(tmphead); ! 168: exit(stat); ! 169: } ! 170: else { ! 171: unlink(tmphead); ! 172: } ! 173: if( silent == 0 ) ! 174: fprintf(stderr, ! 175: "%ld documents %ld distinct keys %ld key occurrences\n", ! 176: records, distinct, keys); ! 177: exit(0); ! 178: } ! 179: ! 180: ! 181: baleout() ! 182: { ! 183: unlink(tempfile); ! 184: unlink(tmphead); ! 185: exit(1); ! 186: } ! 187: ! 188: /* Flag Meaning Default ! 189: -ki Keys per record 100 ! 190: -li max Length of keys 6 ! 191: -cfile file contains Common words /usr/lib/prefer/common ! 192: do not use common words as keys ! 193: -ifile %xxx lines in input file to ignore /usr/lib/prefer/ignore ! 194: -pfile name of output file INDEX ! 195: -s do not print statistics statistics printed ! 196: */ ! 197: ! 198: # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2) ! 199: ! 200: flags() ! 201: { ! 202: char *tmp; ! 203: for (; argc>0 && *argv[0]=='-'; argc--,argv++) { ! 204: switch ((*argv)[1]) { ! 205: case 'k': ! 206: max_kcnt= atoi(operand); ! 207: break; ! 208: case 'l': ! 209: max_klen= atoi(operand); ! 210: break; ! 211: case 'c': ! 212: common= operand; ! 213: break; ! 214: case 'i': ! 215: ignore= operand; ! 216: break; ! 217: case 'p': ! 218: tmp = operand; ! 219: strcpy(INDEX,tmp); ! 220: strcat(INDEX,".i"); ! 221: strcpy(HEAD,tmp); ! 222: strcat(HEAD,".h"); ! 223: break; ! 224: case 'v': ! 225: silent= 0; ! 226: break; ! 227: default: ! 228: fprintf(stderr,USAGE); ! 229: baleout(); ! 230: } ! 231: } ! 232: } ! 233: ! 234: ! 235: /* shorten(inf,outf): file "inf" consists of lines of the form: ! 236: key file start length ! 237: sorted by key and file. replace lines with the same key ! 238: with one line of the form: ! 239: key:file1 start/length ... start/length:file2 start/length ... ! 240: rename as file "outf" ! 241: returns number of lines in output ! 242: */ ! 243: long shorten(inf,outf) ! 244: char *inf, *outf; ! 245: { ! 246: FILE *in, *out; ! 247: char line[maxstr]; ! 248: char key[maxstr], newkey[maxstr], ! 249: file[maxstr], newfile[maxstr]; ! 250: long int start, length; ! 251: long int lines = 0; ! 252: ! 253: in= fopen(inf, "r"); ! 254: out= fopen(outf, "w"); ! 255: if (in==NULL || out==NULL) { ! 256: fprintf(stderr, "pinvert: error in opening file for compression\n"); ! 257: return(0); ! 258: } ! 259: ! 260: getline(in,line); ! 261: sscanf(line,"%s%s%ld%ld", key, file, &start, &length); ! 262: fprintf(out, "%s :%s %ld/%ld", key, file, start, length); ! 263: for ( getline(in, line) ; !feof(in); getline(in, line)) { ! 264: sscanf(line,"%s%s%ld%ld", newkey, newfile, &start, &length); ! 265: if (strcmp(key,newkey)!=0) { ! 266: strcpy(key, newkey); ! 267: strcpy(file, newfile); ! 268: fprintf(out, "\n%s :%s %ld/%ld", key, file, start, length); ! 269: lines++; ! 270: } ! 271: else if (strcmp(file,newfile)!=0) { ! 272: strcpy(file,newfile); ! 273: fprintf(out, ":%s %ld/%ld", file, start, length); ! 274: } ! 275: else ! 276: fprintf(out, " %ld/%ld", start, length); ! 277: } ! 278: fprintf(out, "\n"); ! 279: lines++; ! 280: ! 281: fclose(in); ! 282: fclose(out); ! 283: unlink(inf); ! 284: return (lines); ! 285: } ! 286: ! 287: cleanup() ! 288: { ! 289: signal(SIGINT, SIG_IGN); ! 290: signal(SIGQUIT, SIG_IGN); ! 291: fclose(output); ! 292: fclose(head); ! 293: unlink(tempfile); ! 294: unlink(tmphead); ! 295: exit(1); ! 296: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.