File:  [Research Unix] / researchv10no / cmd / prefer / pref / invert.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs
Tue Apr 24 17:21:35 2018 UTC (8 years, 1 month ago) by root
Branches: belllabs, MAIN
CVS tags: researchv10, HEAD
researchv10 Norman

/*
 * pinvert - create inverted index to a bibliographic database
 *	input:	records of lines, separated by blank lines
 *	output:	key:file1 start/length ... start/length:file2 start/length ...
 */

#include	<signal.h>
#ifdef BSD
#include	<strings.h>
#else
#include	<string.h>
#endif
#include "stdio.h"
#include "streams.h"
#include "bib.h"

/* isnull(x): true when the string x is empty (its first char is the NUL byte) */
#define isnull(x)  (*(x) == '\0')
/*
 * makelow(c): map an upper-case ASCII letter to lower case, anything else
 * is returned unchanged.  The argument is evaluated more than once, so it
 * must not have side effects.
 */
#define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
#define INVTEMPFILE "/tmp/invertXXXXXX"		/* tmp index */
#define HEADTEMPFILE "/tmp/headXXXXXX"		/* tmp header */

char *strrchr(), *mktemp();

/*
 * mktemp() rewrites the trailing XXXXXX of its argument in place, so the
 * templates have to live in writable arrays; handing it a string literal
 * is undefined behaviour.  tempfile and tmphead below remain plain
 * char pointers and simply point at these buffers.
 */
static char invtemplate[] = INVTEMPFILE;
static char headtemplate[] = HEADTEMPFILE;

int     max_kcnt = 100;		/* max number of keys taken from one record */
int     max_klen =   MAXKLEN;	/* max length of keys */
char 	*plib =	PLIB;		/* library file */
char    *common;		/* name of file of common words */
char	*ignore;		/* name of file of %xxx to ignore */
char    *tempfile = invtemplate;	/*  name of temporary file */
char    *tmphead = headtemplate;	/*  name of temporary file for header */
char	*DINPUT = BIBFILE;	/* default input file */
char	INDEX[maxstr];		/* output index file */
char	HEAD[maxstr];		/* output header file */
int	silent = 1;		/* 0 => statistics printed */

char	mvcmd[maxstr];		/* shell command that copies the header into place */
char	path_ign[maxstr];	/* default path of the ignore file */

FILE	*input, *output, *head;	/* current input, temporary index, temporary header */
char	*sort_it = "sort -u %s -o %s";	/* format of the sort command */
char	sortcmd[maxstr];	/* sort command built from sort_it */

char	*libpath();
int	cleanup();

/* arguments still to be processed; shared by main() and flags() */
int     argc;
char    **argv;

#define USAGE	"pinvert [-c common -i ignore -k nkeys -l length -p database -v] [file...]\n"

/*
 * main - build the inverted index.
 *
 *	Every key of every record in the input files is written to a
 *	temporary file as "key file start length"; that file is sorted
 *	with sort -u and compressed by shorten() into the index file.
 *	The header (common-word file name, key length, input file names)
 *	is collected in a second temporary file and copied to HEAD.
 *	With no file arguments the default input DINPUT is indexed.
 *	Exits 0 on success, non-zero on any failure.
 */
main(argcount,arglist)
int argcount;
char **arglist;
{
	char            *filename;
	long int        start,length;
	char            word[maxstr];
	int             kcnt;
	char            tag_line[maxstr];
	char		outstring[maxstr];
	char *f;
	long int	records = 0;  /*  number of records read            */
	long int	keys    = 0;  /*  number of keys read (occurrences) */
	long int	distinct;     /*  number of distinct keys           */
	long int	shorten();
	int first = 1;
	int status;

	/* initialize and open files */
	argc= argcount-1;
	argv= arglist+1;
	/* NOTE(review): mktemp names are predictable; consider mkstemp */
	mktemp(tempfile);
	mktemp(tmphead);
	if(signal(SIGINT, SIG_IGN) != SIG_IGN)
		signal(SIGINT, cleanup);
	signal(SIGQUIT, cleanup);
	output= fopen(tempfile,"w");
	head = fopen(tmphead,"w");
	if (output == NULL || head == NULL) {
		/* without this check the first fprintf would use a null stream */
		fprintf(stderr, "pinvert: cannot create temporary files\n");
		baleout();
	}

	/* make path names */
	common = COMFILE;
	strcpy(path_ign,libpath(plib));
	strcat(path_ign,"/");
	strcat(path_ign,IGNFILE);
	ignore = path_ign;
	INDEX[0] = '\0';
	HEAD[0] = '\0';

	flags();
	if(load_ign(ignore) == -1) {
		baleout();
	}

	/* write out name of common file and max_klen to header */
	fprintf(head,"%s %d\n",common,max_klen);

	/*
	 * format of one temporary index line: the key left-justified in
	 * a field of max_klen characters, followed by the tag line.
	 * max_klen is fixed once flags() has run, so build it only once.
	 */
	sprintf(outstring, "%%-%ds%%s", max_klen);

	/* now index input files */
	for (; argc>=0 ; argc--, argv++) {
		if(argc == 0) {
			/* arguments exhausted: use the default only if none was given */
			if(!first)
				break;
			else
				filename = DINPUT;
		}
		else {
			filename=   *argv;
		}

		first = 0;
		input = fopen(filename,"r");
		if (input==NULL) {
			fprintf(stderr, "pinvert: cannot open %s\n",
			    filename);
			baleout();
		}
		if(INDEX[0] == '\0') {
			/* no -p flag: name the output after the first input file */
			if (strlen(filename) + sizeof(".i") > sizeof(INDEX)) {
				fprintf(stderr, "pinvert: file name too long: %s\n",
				    filename);
				baleout();
			}
			strcpy(INDEX,filename);
			strcat(INDEX,".i");
			strcpy(HEAD,filename);
			strcat(HEAD,".h");
		}
		start=      0L;
		length=     0L;

		/* write out file name (without directory) to header */
		if((f=strrchr(filename,'/')) != NULL)
			f++;
		else
			f = filename;
		fprintf(head,"%s\n",f);

		for(;;) {
			/* find start of next record (exit if none)  */
			start= nextrecord(input,start+length);
			if (start==EOF)   break;
			records++;
			kcnt= 0;
			length= recsize(input,start);
			sprintf(tag_line, " %-18s %08ld %08ld\n", f, start, length);

			/* take at most max_kcnt keys from within this record */
			while (ftell(input) < start+length && kcnt < max_kcnt) {
				getword(input,word);
				makekey(word,max_klen,common);
				if (!isnull(word)) {
					fprintf(output,outstring,word,tag_line);
					kcnt++;
					keys++;
				}
			}
		}
		fclose(input);
	}
	fclose(output);
	fclose(head);

	sprintf(sortcmd, sort_it, tempfile, tempfile);
	if (system(sortcmd) != 0) {
		/* shorten() relies on sorted input; do not index garbage */
		fprintf(stderr, "pinvert: sort failed\n");
		baleout();
	}

	distinct = shorten(tempfile,INDEX);
	sprintf(mvcmd,"cp %s %s\n",tmphead,HEAD);
	status = system(mvcmd);
	unlink(tmphead);
	if (status != 0) {
		/*
		 * system() returns a wait status; passing it straight to
		 * exit() keeps only the low byte, which is 0 for a normal
		 * non-zero exit of cp, so report a plain failure instead.
		 */
		exit(1);
	}
	if( silent == 0 )
		fprintf(stderr,
		    "%ld documents   %ld distinct keys  %ld key occurrences\n",
		    records, distinct, keys);
	exit(0);
}


/*
 * baleout - give up: remove both temporary files and terminate
 * the program with exit status 1.  Never returns.
 */
baleout()
{
	/* the two removals are independent; their results are not examined */
	unlink(tmphead);
	unlink(tempfile);

	exit(1);
}

/*  Flag    Meaning                             Default
    -ki     Keys per record                     100
    -li     max Length of keys                  6
    -cfile  file contains Common words          /usr/lib/prefer/common
            do not use common words as keys
    -ifile  %xxx lines in input file to ignore 	/usr/lib/prefer/ignore
    -pfile  base name of output files           first input file
            (index is file.i, header is file.h)
    -v	    print statistics on stderr		statistics not printed
*/

/*
 * flag_operand - return the operand of the flag at *argv.
 *	The operand is either the rest of the flag argument ("-k20") or,
 *	when that is empty, the following argument ("-k 20"), in which
 *	case argv/argc are advanced past it.  A flag given last with no
 *	operand prints the usage message and bales out instead of
 *	stepping onto the terminating null pointer of the argument list.
 */
static char *
flag_operand()
{
	if ((*argv)[2] != '\0')
		return (*argv + 2);
	if (argc < 2) {
		fprintf(stderr,USAGE);
		baleout();
	}
	argv++;
	argc--;
	return (*argv);
}

# define    operand     (flag_operand())

/*
 * flags - process the leading "-x" arguments held in the globals
 *	argc/argv, setting max_kcnt, max_klen, common, ignore,
 *	INDEX/HEAD and silent.  Stops at the first argument that does
 *	not begin with '-'; an unknown flag prints the usage message
 *	and bales out.
 */
flags()
{
	char *tmp;

	for (; argc>0 && *argv[0]=='-';  argc--,argv++) {
		switch ((*argv)[1]) {
		case 'k':
			max_kcnt= atoi(operand);
			break;
		case 'l':
			max_klen= atoi(operand);
			break;
		case 'c':
			common=  operand;
			break;
		case 'i':
			ignore=  operand;
			break;
		case 'p':
			tmp = operand;
			/* room is needed for the two-character suffix and the NUL */
			if (strlen(tmp) + sizeof(".i") > sizeof(INDEX)) {
				fprintf(stderr, "pinvert: file name too long: %s\n",
				    tmp);
				baleout();
			}
			strcpy(INDEX,tmp);
			strcat(INDEX,".i");
			strcpy(HEAD,tmp);
			strcat(HEAD,".h");
			break;
		case 'v':
			silent= 0;
			break;
		default:
			fprintf(stderr,USAGE);
			baleout();
			break;
		}
	}
}


/*  shorten(inf,outf): file "inf" consists of lines of the form:
        key file start length
    sorted by key and file.  replace lines with the same key
    with one line of the form:
        key :file1 start/length ... start/length:file2 start/length ...
    written to file "outf"; "inf" is removed afterwards.
    lines that do not have all four fields are skipped, and an
    empty "inf" gives an empty "outf".
    returns number of lines in output (0 if a file cannot be opened;
    "inf" is then left in place)
*/
long shorten(inf,outf)
char *inf, *outf;
{
	FILE *in, *out;
	char line[maxstr];
	char key[maxstr],  newkey[maxstr],
	file[maxstr], newfile[maxstr];
	long int start, length;
	long int lines = 0;
	int have_key = 0;	/* set once the first key has been written */

	in=  fopen(inf, "r");
	out= fopen(outf, "w");
	if (in==NULL || out==NULL) {
		fprintf(stderr, "pinvert: error in opening file for compression\n");
		/* do not leak whichever stream did open */
		if (in != NULL)
			fclose(in);
		if (out != NULL)
			fclose(out);
		return(0);
	}

	/* start from an empty line in case nothing is stored at end of file */
	line[0] = '\0';
	getline(in,line);
	for (;;) {
		if (sscanf(line,"%s%s%ld%ld", newkey, newfile, &start, &length) == 4) {
			if (!have_key || strcmp(key,newkey)!=0) {
				/* new key: finish the previous output line, start another */
				if (have_key)
					fprintf(out, "\n");
				strcpy(key, newkey);
				strcpy(file, newfile);
				fprintf(out, "%s :%s %ld/%ld", key, file, start, length);
				have_key = 1;
				lines++;
			}
			else if (strcmp(file,newfile)!=0) {
				/* same key, next file */
				strcpy(file,newfile);
				fprintf(out, ":%s %ld/%ld", file, start, length);
			}
			else
				fprintf(out, " %ld/%ld", start, length);
		}
		getline(in, line);
		if (feof(in))
			break;
	}
	if (have_key)
		fprintf(out, "\n");

	fclose(in);
	fclose(out);
	unlink(inf);
	return (lines);
}

/*
 * cleanup - handler for SIGINT and SIGQUIT: remove the temporary
 *	files and exit with status 1.
 *
 *	The streams are deliberately not closed here.  The handler is
 *	installed before output and head are opened and stays installed
 *	after main has closed them, so an explicit fclose could be handed
 *	a null or already-closed stream.  exit() closes whatever streams
 *	are still open.
 */
cleanup()
{
	/* ignore further interrupts while cleaning up */
	signal(SIGINT, SIG_IGN);
	signal(SIGQUIT, SIG_IGN);
	unlink(tempfile);
	unlink(tmphead);
	exit(1);
}

unix.superglobalmegacorp.com

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.