researchv10no/cmd/prefer/pref/invert.c - view

File: [Research Unix] / researchv10no / cmd / prefer / pref / invert.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs
Tue Apr 24 17:21:35 2018 UTC (8 years, 1 month ago) by root
Branches: belllabs, MAIN
CVS tags: researchv10, HEAD

researchv10 Norman

/* * pinvert - create inverted index to a bibliographic database * input: records of lines, separated by blank lines * output: key:file1 start/length ... start/length:file2 start/length ... */ #include <signal.h> #ifdef BSD #include <strings.h> #else #include <string.h> #endif #include "stdio.h" #include "streams.h" #include "bib.h" #define isnull(x) (*(x) == NULL) #define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : c) #define INVTEMPFILE "/tmp/invertXXXXXX" /* tmp index */ #define HEADTEMPFILE "/tmp/headXXXXXX" /* tmp header */ char *strrchr(), *mktemp(); int max_kcnt = 100; /* max number of keys */ int max_klen = MAXKLEN; /* max length of keys */ char *plib = PLIB; /* library file */ char *common; /* name of file of common words */ char *ignore; /* name of file of %xxx to ignore */ char *tempfile = INVTEMPFILE; /* name of temporary file */ char *tmphead = HEADTEMPFILE; /* name of temporary file for header */ char *DINPUT = BIBFILE; /* default input file */ char INDEX[maxstr]; /* output index file */ char HEAD[maxstr]; /* output header file */ int silent = 1; /* 0 => statistics printed */ char mvcmd[maxstr]; char path_ign[maxstr]; FILE *input, *output, *head; char *sort_it = "sort -u %s -o %s"; char sortcmd[maxstr]; char *libpath(); int cleanup(); int argc; char **argv; #define USAGE "pinvert [-c common -i ignore -k nkeys -l length -p database -v] [file...]\n" main(argcount,arglist) int argcount; char **arglist; { char *filename; long int start,length; char word[maxstr]; int kcnt; char tag_line[maxstr]; char outstring[maxstr]; char lenstring[5]; char *f; long int records = 0; /* number of records read */ long int keys = 0; /* number of keys read (occurences) */ long int distinct; /* number of distinct keys */ long int shorten(); int first = 1; int stat; /* initialize and open files */ argc= argcount-1; argv= arglist+1; mktemp(tempfile); mktemp(tmphead); if(signal(SIGINT, SIG_IGN) != SIG_IGN) signal(SIGINT, cleanup); signal(SIGQUIT, cleanup); output= fopen(tempfile,"w"); head = fopen(tmphead,"w"); /* make path names */ common = COMFILE; strcpy(path_ign,libpath(plib)); strcat(path_ign,"/"); strcat(path_ign,IGNFILE); ignore = path_ign; INDEX[0] = NULL; HEAD[0] = NULL; flags(); if(load_ign(ignore) == -1) { baleout(); } /* write out name of common file and max_klen to header */ fprintf(head,"%s %d\n",common,max_klen); /* now index input files */ for (; argc>=0 ; argc--, argv++) { if(argc == 0) { if(!first) break; else filename = DINPUT; } else { filename= *argv; } first = 0; input = fopen(filename,"r"); if (input==NULL) { fprintf(stderr, "pinvert: cannot open %s\n", filename); baleout(); } if(INDEX[0] == NULL) { strcpy(INDEX,filename); strcat(INDEX,".i"); strcpy(HEAD,filename); strcat(HEAD,".h"); } start= 0L; length= 0L; /* write out file name to header */ if((f=strrchr(filename,'/')) != NULL) f++; else f = filename; fprintf(head,"%s\n",f); sprintf(lenstring,"%d",max_klen); strcpy(outstring,"%-"); strcat(outstring,lenstring); strcat(outstring,"s%s"); for(;;) { /* find start of next record (exit if none) */ start= nextrecord(input,start+length); if (start==EOF) break; records++; kcnt= 0; length= recsize(input,start); sprintf(tag_line, " %-18s %08ld %08ld\n", f, start, length); while (ftell(input) < start+length && kcnt < max_kcnt) { getword(input,word); makekey(word,max_klen,common); if (!isnull(word)) { fprintf(output,outstring,word,tag_line); kcnt++; keys++; } } } fclose(input); } fclose(output); fclose(head); sprintf(sortcmd, sort_it, tempfile, tempfile); system(sortcmd); distinct = shorten(tempfile,INDEX); sprintf(mvcmd,"cp %s %s\n",tmphead,HEAD); if(stat = system(mvcmd)) { unlink(tmphead); exit(stat); } else { unlink(tmphead); } if( silent == 0 ) fprintf(stderr, "%ld documents %ld distinct keys %ld key occurrences\n", records, distinct, keys); exit(0); } baleout() { unlink(tempfile); unlink(tmphead); exit(1); } /* Flag Meaning Default -ki Keys per record 100 -li max Length of keys 6 -cfile file contains Common words /usr/lib/prefer/common do not use common words as keys -ifile %xxx lines in input file to ignore /usr/lib/prefer/ignore -pfile name of output file INDEX -s do not print statistics statistics printed */ # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2) flags() { char *tmp; for (; argc>0 && *argv[0]=='-'; argc--,argv++) { switch ((*argv)[1]) { case 'k': max_kcnt= atoi(operand); break; case 'l': max_klen= atoi(operand); break; case 'c': common= operand; break; case 'i': ignore= operand; break; case 'p': tmp = operand; strcpy(INDEX,tmp); strcat(INDEX,".i"); strcpy(HEAD,tmp); strcat(HEAD,".h"); break; case 'v': silent= 0; break; default: fprintf(stderr,USAGE); baleout(); } } } /* shorten(inf,outf): file "inf" consists of lines of the form: key file start length sorted by key and file. replace lines with the same key with one line of the form: key:file1 start/length ... start/length:file2 start/length ... rename as file "outf" returns number of lines in output */ long shorten(inf,outf) char *inf, *outf; { FILE *in, *out; char line[maxstr]; char key[maxstr], newkey[maxstr], file[maxstr], newfile[maxstr]; long int start, length; long int lines = 0; in= fopen(inf, "r"); out= fopen(outf, "w"); if (in==NULL || out==NULL) { fprintf(stderr, "pinvert: error in opening file for compression\n"); return(0); } getline(in,line); sscanf(line,"%s%s%ld%ld", key, file, &start, &length); fprintf(out, "%s :%s %ld/%ld", key, file, start, length); for ( getline(in, line) ; !feof(in); getline(in, line)) { sscanf(line,"%s%s%ld%ld", newkey, newfile, &start, &length); if (strcmp(key,newkey)!=0) { strcpy(key, newkey); strcpy(file, newfile); fprintf(out, "\n%s :%s %ld/%ld", key, file, start, length); lines++; } else if (strcmp(file,newfile)!=0) { strcpy(file,newfile); fprintf(out, ":%s %ld/%ld", file, start, length); } else fprintf(out, " %ld/%ld", start, length); } fprintf(out, "\n"); lines++; fclose(in); fclose(out); unlink(inf); return (lines); } cleanup() { signal(SIGINT, SIG_IGN); signal(SIGQUIT, SIG_IGN); fclose(output); fclose(head); unlink(tempfile); unlink(tmphead); exit(1); }

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.