|
|
1.1 ! root 1: #ifndef lint ! 2: static char sccsid[] = "@(#)invert.c 2.5 9/10/85"; ! 3: #endif not lint ! 4: # ! 5: /* input: records of lines, separated by blank lines ! 6: output: key:file1 start/length ... start/length:file2 start/length ... ! 7: */ ! 8: ! 9: # include "stdio.h" ! 10: # include "streams.h" ! 11: # include "bib.h" ! 12: # define isnull(x) (*(x) == NULL) ! 13: # define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : c) ! 14: ! 15: int max_kcnt = 100; /* max number of keys */ ! 16: int max_klen = 6; /* max length of keys */ ! 17: char *ignore = /* string of line starts to ignore */ ! 18: "CNOPVX"; ! 19: char *common = /* name of file of common words */ ! 20: COMFILE; ! 21: char *INDEX= /* name of output file */ ! 22: INDXFILE; ! 23: ! 24: char *tmpfile = /* name of temporary file */ ! 25: INVTEMPFILE; ! 26: ! 27: int silent = 0; /* 0 => statistics printed */ ! 28: /* 1 => no statisitics printed */ ! 29: ! 30: char *sort_it = ! 31: "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s"; ! 32: char sortcmd[maxstr]; ! 33: ! 34: int argc; ! 35: char **argv; ! 36: ! 37: main(argcount,arglist) ! 38: int argcount; ! 39: char **arglist; ! 40: { char *filename; ! 41: FILE *input, *output; ! 42: long int start,length; ! 43: char word[maxstr]; ! 44: int kcnt; ! 45: char tag_line[maxstr]; ! 46: ! 47: long int records = 0; /* number of records read */ ! 48: long int keys = 0; /* number of keys read (occurences) */ ! 49: long int distinct; /* number of distinct keys */ ! 50: long int shorten(); ! 51: ! 52: strcpy(COMFILE, N_COMFILE); ! 53: strcpy(BMACLIB, N_BMACLIB); ! 54: ! 55: argc= argcount-1; ! 56: argv= arglist+1; ! 57: mktemp(tmpfile); ! 58: output= fopen(tmpfile,"w"); ! 59: ! 60: for ( flags() ; argc>0 ; argc--, argv++ ,flags() ) ! 61: { /* open input file */ ! 62: filename= *argv; ! 63: input= fopen(filename,"r"); ! 64: if (input==NULL) ! 65: { fprintf(stderr, "invert: error in open of %s\n", filename); ! 66: continue; ! 67: } ! 68: start= 0L; ! 69: length= 0L; ! 70: ! 71: for(;;) /* each record */ ! 72: { /* find start of next record (exit if none) */ ! 73: start= nextrecord(input,start+length); ! 74: if (start==EOF) break; ! 75: records++; ! 76: kcnt= 0; ! 77: length= recsize(input,start); ! 78: sprintf(tag_line, " %s %d %d\n", filename, start, length); ! 79: ! 80: while (ftell(input) < start+length && kcnt < max_kcnt) ! 81: { getword(input,word,ignore); ! 82: makekey(word,max_klen,common); ! 83: if (!isnull(word)) ! 84: { fputs(word,output); fputs(tag_line,output); ! 85: kcnt++; keys++; ! 86: } ! 87: } ! 88: } ! 89: fclose(input); ! 90: } ! 91: fclose(output); ! 92: ! 93: sprintf(sortcmd, sort_it, tmpfile, tmpfile); ! 94: system(sortcmd); ! 95: ! 96: distinct = shorten(tmpfile,INDEX); ! 97: if( silent == 0 ) ! 98: fprintf(stderr, ! 99: "%d documents %d distinct keys %d key occurrences\n", ! 100: records, distinct, keys); ! 101: exit(0); ! 102: } ! 103: ! 104: ! 105: ! 106: /* Flag Meaning Default ! 107: -ki Keys per record 100 ! 108: -li max Length of keys 6 ! 109: -%str ignore lines that begin with %x CNOPVX ! 110: where x is in str ! 111: str is a seq of chars ! 112: -cfile file contains Common words /usr/new/lib/bib/common ! 113: do not use common words as keys ! 114: -pfile name of output file INDEX ! 115: -s do not print statistics statistics printed ! 116: */ ! 117: ! 118: # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2) ! 119: ! 120: flags() ! 121: { ! 122: char *p; ! 123: for (; argc>0 && *argv[0]=='-'; argc--,argv++) ! 124: { switch ((*argv)[1]) ! 125: { case 'k': max_kcnt= atoi(operand); ! 126: break; ! 127: case 'l': max_klen= atoi(operand); ! 128: break; ! 129: case 'c': common= operand; ! 130: break; ! 131: case '%': ignore= *argv+2; ! 132: break; ! 133: case 'p': INDEX= operand; ! 134: break; ! 135: case 's': silent= 1; ! 136: break; ! 137: case 'd': ! 138: p = &argv[0][2]; ! 139: if (!p) { ! 140: argv++; ! 141: p = &argv[0][0]; ! 142: } ! 143: strreplace(COMFILE, BMACLIB, p); ! 144: strcpy(BMACLIB, p); ! 145: break; ! 146: default: fprintf(stderr, "unknown flag '%s'\n", *argv); ! 147: } ! 148: } ! 149: } ! 150: ! 151: ! 152: /* shorten(inf,outf): file "inf" consists of lines of the form: ! 153: key file start length ! 154: sorted by key and file. replace lines with the same key ! 155: with one line of the form: ! 156: key:file1 start/length ... start/length:file2 start/length ... ! 157: rename as file "outf" ! 158: returns number of lines in output ! 159: */ ! 160: long shorten(inf,outf) ! 161: char *inf, *outf; ! 162: { FILE *in, *out; ! 163: char line[maxstr]; ! 164: char key[maxstr], newkey[maxstr], ! 165: file[maxstr], newfile[maxstr]; ! 166: long int start, length; ! 167: long int lines = 0; ! 168: ! 169: in= fopen(inf, "r"); ! 170: out= fopen(outf, "w"); ! 171: if (in==NULL || out==NULL) ! 172: { fprintf(stderr, "invert: error in opening file for compression\n"); ! 173: return(0); ! 174: } ! 175: ! 176: getline(in,line); ! 177: sscanf(line,"%s%s%d%d", key, file, &start, &length); ! 178: fprintf(out, "%s :%s %d/%d", key, file, start, length); ! 179: for ( getline(in, line) ; !feof(in); getline(in, line)) ! 180: { sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length); ! 181: if (strcmp(key,newkey)!=0) ! 182: { strcpy(key, newkey); ! 183: strcpy(file, newfile); ! 184: fprintf(out, "\n%s :%s %d/%d", key, file, start, length); ! 185: lines++; ! 186: } ! 187: else if (strcmp(file,newfile)!=0) ! 188: { strcpy(file,newfile); ! 189: fprintf(out, ":%s %d/%d", file, start, length); ! 190: } ! 191: else ! 192: fprintf(out, " %d/%d", start, length); ! 193: } ! 194: fprintf(out, "\n"); ! 195: lines++; ! 196: ! 197: fclose(in); fclose(out); ! 198: unlink(inf); ! 199: return (lines); ! 200: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.