Annotation of 43BSDReno/usr.bin/locate/code/locate.code.c, revision 1.1.1.1

1.1       root        1: /*
                      2:  * Copyright (c) 1989 The Regents of the University of California.
                      3:  * All rights reserved.
                      4:  *
                      5:  * This code is derived from software contributed to Berkeley by
                      6:  * James A. Woods.
                      7:  *
                      8:  * Redistribution and use in source and binary forms are permitted
                      9:  * provided that: (1) source distributions retain this entire copyright
                     10:  * notice and comment, and (2) distributions including binaries display
                     11:  * the following acknowledgement:  ``This product includes software
                     12:  * developed by the University of California, Berkeley and its contributors''
                     13:  * in the documentation or other materials provided with the distribution
                     14:  * and in all advertising materials mentioning features or use of this
                     15:  * software. Neither the name of the University nor the names of its
                     16:  * contributors may be used to endorse or promote products derived
                     17:  * from this software without specific prior written permission.
                     18:  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
                     19:  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
                     20:  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
                     21:  */
                     22: 
                     23: #ifndef lint
                     24: char copyright[] =
                     25: "@(#) Copyright (c) 1989 The Regents of the University of California.\n\
                     26:  All rights reserved.\n";
                     27: #endif /* not lint */
                     28: 
                     29: #ifndef lint
                     30: static char sccsid[] = "@(#)locate.code.c      4.8 (Berkeley) 6/1/90";
                     31: #endif /* not lint */
                     32: 
                     33: /*
                     34:  * PURPOSE:    sorted list compressor (works with a modified 'find'
                     35:  *             to encode/decode a filename database)
                     36:  *
                     37:  * USAGE:      bigram < list > bigrams
                     38:  *             process bigrams (see updatedb) > common_bigrams
                     39:  *             code common_bigrams < list > squozen_list
                     40:  *
                     41:  * METHOD:     Uses 'front compression' (see ";login:", Volume 8, Number 1
                     42:  *             February/March 1983, p. 8 ).  Output format is, per line, an
                     43:  *             offset differential count byte followed by a partially bigram-
                     44:  *             encoded ascii residue.  A bigram is a two-character sequence;
                     45:  *             the 128 most common bigrams are each encoded in one byte.
                     46:  *
                     47:  * EXAMPLE:    For simple front compression with no bigram encoding,
                     48:  *             if the input is...              then the output is...
                     49:  *
                     50:  *             /usr/src                         0 /usr/src
                     51:  *             /usr/src/cmd/aardvark.c          8 /cmd/aardvark.c
                     52:  *             /usr/src/cmd/armadillo.c        14 armadillo.c
                     53:  *             /usr/tmp/zoo                     5 tmp/zoo
                     54:  *
                     55:  *     The codes are:
                     56:  *
                     57:  *     0-28    likeliest differential counts + offset to make nonnegative 
                     58:  *     30      switch code for out-of-range count to follow in next word
                     59:  *     128-255 bigram codes (128 most common, as determined by 'updatedb')
                     60:  *     32-127  single character (printable) ascii residue (i.e., literal)
                     61:  *
                     62:  * SEE ALSO:   updatedb.csh, bigram.c, find.c
                     63:  * 
                     64:  * AUTHOR:     James A. Woods, Informatics General Corp.,
                     65:  *             NASA Ames Research Center, 10/82
                     66:  */
                     67: 
                     #include <sys/param.h>
                     #include <stdio.h>
                     #include <stdlib.h>
                     #include <string.h>
                     #include "locate.h"
                     71: 
                     72: #define BGBUFSIZE      (NBG * 2)       /* size of bigram buffer */
                     73: 
                     74: char buf1[MAXPATHLEN] = " ";   
                     75: char buf2[MAXPATHLEN];
                     76: char bigrams[BGBUFSIZE + 1] = { 0 };
                     77: 
                     78: main ( argc, argv )
                     79:        int argc; char *argv[];
                     80: {
                     81:        register char *cp, *oldpath = buf1, *path = buf2;
                     82:        int code, count, diffcount, oldcount = 0;
                     83:        FILE *fp;
                     84: 
                     85:        if ((fp = fopen(argv[1], "r")) == NULL) {
                     86:                printf("Usage: code common_bigrams < list > squozen_list\n");
                     87:                exit(1);
                     88:        }
                     89:        /* first copy bigram array to stdout */
                     90:        fgets ( bigrams, BGBUFSIZE + 1, fp );
                     91:        fwrite ( bigrams, 1, BGBUFSIZE, stdout );
                     92:        fclose( fp );
                     93: 
                     94:        while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) {
                     95:                /* truncate newline */
                     96:                cp = path + strlen(path) - 1;
                     97:                if (cp > path && *cp == '\n')
                     98:                        *cp = '\0';
                     99:                /* squelch characters that would botch the decoding */
                    100:                for ( cp = path; *cp != NULL; cp++ ) {
                    101:                        if ( (unsigned char)*cp >= PARITY )
                    102:                                *cp &= PARITY-1;
                    103:                        else if ( *cp <= SWITCH )
                    104:                                *cp = '?';
                    105:                }
                    106:                /* skip longest common prefix */
                    107:                for ( cp = path; *cp == *oldpath; cp++, oldpath++ )
                    108:                        if ( *oldpath == NULL )
                    109:                                break;
                    110:                count = cp - path;
                    111:                diffcount = count - oldcount + OFFSET;
                    112:                oldcount = count;
                    113:                if ( diffcount < 0 || diffcount > 2*OFFSET ) {
                    114:                        putc ( SWITCH, stdout );
                    115:                        putw ( diffcount, stdout );
                    116:                }
                    117:                else
                    118:                        putc ( diffcount, stdout );     
                    119: 
                    120:                while ( *cp != NULL ) {
                    121:                        if ( *(cp + 1) == NULL ) {
                    122:                                putchar ( *cp );
                    123:                                break;
                    124:                        }
                    125:                        if ( (code = bgindex ( cp )) < 0 ) {
                    126:                                putchar ( *cp++ );
                    127:                                putchar ( *cp++ );
                    128:                        }
                    129:                        else {  /* found, so mark byte with parity bit */
                    130:                                putchar ( (code / 2) | PARITY );
                    131:                                cp += 2;
                    132:                        }
                    133:                }
                    134:                if ( path == buf1 )             /* swap pointers */
                    135:                        path = buf2, oldpath = buf1;
                    136:                else
                    137:                        path = buf1, oldpath = buf2;
                    138:        }
                    139: }
                    140: 
                    /* The bigram table is a file-scope array defined earlier in this file. */
                    extern char bigrams[];

                    /*
                     * bgindex --
                     *     Look up the character pair bg[0], bg[1] in bigrams[], which is
                     *     scanned as consecutive two-byte entries up to its terminating
                     *     NUL.
                     *
                     *     Returns the byte offset of the matching entry within bigrams[]
                     *     (always even), or -1 when the pair is not in the table.
                     */
                    int
                    bgindex(char *bg)
                    {
                        char bg0 = bg[0], bg1 = bg[1];
                        char *p;

                        /*
                         * Both bytes of an entry are tested before it is compared, so a
                         * table holding an odd number of bytes never makes the scan step
                         * over its terminating NUL.
                         */
                        for (p = bigrams; p[0] != '\0' && p[1] != '\0'; p += 2) {
                            if (p[0] == bg0 && p[1] == bg1)
                                return (int)(p - bigrams);
                        }
                        return -1;
                    }

unix.superglobalmegacorp.com

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.