|
|
1.1 ! root 1: /* ! 2: * Copyright (c) 1989 The Regents of the University of California. ! 3: * All rights reserved. ! 4: * ! 5: * This code is derived from software contributed to Berkeley by ! 6: * James A. Woods. ! 7: * ! 8: * Redistribution and use in source and binary forms are permitted ! 9: * provided that: (1) source distributions retain this entire copyright ! 10: * notice and comment, and (2) distributions including binaries display ! 11: * the following acknowledgement: ``This product includes software ! 12: * developed by the University of California, Berkeley and its contributors'' ! 13: * in the documentation or other materials provided with the distribution ! 14: * and in all advertising materials mentioning features or use of this ! 15: * software. Neither the name of the University nor the names of its ! 16: * contributors may be used to endorse or promote products derived ! 17: * from this software without specific prior written permission. ! 18: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR ! 19: * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED ! 20: * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. ! 21: */ ! 22: ! 23: #ifndef lint ! 24: char copyright[] = ! 25: "@(#) Copyright (c) 1989 The Regents of the University of California.\n\ ! 26: All rights reserved.\n"; ! 27: #endif /* not lint */ ! 28: ! 29: #ifndef lint ! 30: static char sccsid[] = "@(#)locate.code.c 4.8 (Berkeley) 6/1/90"; ! 31: #endif /* not lint */ ! 32: ! 33: /* ! 34: * PURPOSE: sorted list compressor (works with a modified 'find' ! 35: * to encode/decode a filename database) ! 36: * ! 37: * USAGE: bigram < list > bigrams ! 38: * process bigrams (see updatedb) > common_bigrams ! 39: * code common_bigrams < list > squozen_list ! 40: * ! 41: * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 ! 42: * February/March 1983, p. 8 ). Output format is, per line, an ! 43: * offset differential count byte followed by a partially bigram- ! 44: * encoded ascii residue. A bigram is a two-character sequence, ! 45: * the first 128 most common of which are encoded in one byte. ! 46: * ! 47: * EXAMPLE: For simple front compression with no bigram encoding, ! 48: * if the input is... then the output is... ! 49: * ! 50: * /usr/src 0 /usr/src ! 51: * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c ! 52: * /usr/src/cmd/armadillo.c 14 armadillo.c ! 53: * /usr/tmp/zoo 5 tmp/zoo ! 54: * ! 55: * The codes are: ! 56: * ! 57: * 0-28 likeliest differential counts + offset to make nonnegative ! 58: * 30 switch code for out-of-range count to follow in next word ! 59: * 128-255 bigram codes (128 most common, as determined by 'updatedb') ! 60: * 32-127 single character (printable) ascii residue (ie, literal) ! 61: * ! 62: * SEE ALSO: updatedb.csh, bigram.c, find.c ! 63: * ! 64: * AUTHOR: James A. Woods, Informatics General Corp., ! 65: * NASA Ames Research Center, 10/82 ! 66: */ ! 67: ! 68: #include <sys/param.h> ! 69: #include <stdio.h> ! 70: #include "locate.h" ! 71: ! 72: #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ ! 73: ! 74: char buf1[MAXPATHLEN] = " "; ! 75: char buf2[MAXPATHLEN]; ! 76: char bigrams[BGBUFSIZE + 1] = { 0 }; ! 77: ! 78: main ( argc, argv ) ! 79: int argc; char *argv[]; ! 80: { ! 81: register char *cp, *oldpath = buf1, *path = buf2; ! 82: int code, count, diffcount, oldcount = 0; ! 83: FILE *fp; ! 84: ! 85: if ((fp = fopen(argv[1], "r")) == NULL) { ! 86: printf("Usage: code common_bigrams < list > squozen_list\n"); ! 87: exit(1); ! 88: } ! 89: /* first copy bigram array to stdout */ ! 90: fgets ( bigrams, BGBUFSIZE + 1, fp ); ! 91: fwrite ( bigrams, 1, BGBUFSIZE, stdout ); ! 92: fclose( fp ); ! 93: ! 94: while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) { ! 95: /* truncate newline */ ! 96: cp = path + strlen(path) - 1; ! 97: if (cp > path && *cp == '\n') ! 98: *cp = '\0'; ! 99: /* squelch characters that would botch the decoding */ ! 100: for ( cp = path; *cp != NULL; cp++ ) { ! 101: if ( (unsigned char)*cp >= PARITY ) ! 102: *cp &= PARITY-1; ! 103: else if ( *cp <= SWITCH ) ! 104: *cp = '?'; ! 105: } ! 106: /* skip longest common prefix */ ! 107: for ( cp = path; *cp == *oldpath; cp++, oldpath++ ) ! 108: if ( *oldpath == NULL ) ! 109: break; ! 110: count = cp - path; ! 111: diffcount = count - oldcount + OFFSET; ! 112: oldcount = count; ! 113: if ( diffcount < 0 || diffcount > 2*OFFSET ) { ! 114: putc ( SWITCH, stdout ); ! 115: putw ( diffcount, stdout ); ! 116: } ! 117: else ! 118: putc ( diffcount, stdout ); ! 119: ! 120: while ( *cp != NULL ) { ! 121: if ( *(cp + 1) == NULL ) { ! 122: putchar ( *cp ); ! 123: break; ! 124: } ! 125: if ( (code = bgindex ( cp )) < 0 ) { ! 126: putchar ( *cp++ ); ! 127: putchar ( *cp++ ); ! 128: } ! 129: else { /* found, so mark byte with parity bit */ ! 130: putchar ( (code / 2) | PARITY ); ! 131: cp += 2; ! 132: } ! 133: } ! 134: if ( path == buf1 ) /* swap pointers */ ! 135: path = buf2, oldpath = buf1; ! 136: else ! 137: path = buf1, oldpath = buf2; ! 138: } ! 139: } ! 140: ! 141: bgindex ( bg ) /* return location of bg in bigrams or -1 */ ! 142: char *bg; ! 143: { ! 144: register char *p; ! 145: register char bg0 = bg[0], bg1 = bg[1]; ! 146: ! 147: for ( p = bigrams; *p != NULL; p++ ) ! 148: if ( *p++ == bg0 && *p == bg1 ) ! 149: break; ! 150: return ( *p == NULL ? -1 : --p - bigrams ); ! 151: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.