|
|
1.1 root 1: /*
2: * Copyright (c) 1989 The Regents of the University of California.
3: * All rights reserved.
4: *
5: * This code is derived from software contributed to Berkeley by
6: * James A. Woods.
7: *
8: * Redistribution and use in source and binary forms are permitted
9: * provided that: (1) source distributions retain this entire copyright
10: * notice and comment, and (2) distributions including binaries display
11: * the following acknowledgement: ``This product includes software
12: * developed by the University of California, Berkeley and its contributors''
13: * in the documentation or other materials provided with the distribution
14: * and in all advertising materials mentioning features or use of this
15: * software. Neither the name of the University nor the names of its
16: * contributors may be used to endorse or promote products derived
17: * from this software without specific prior written permission.
18: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
19: * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
20: * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
21: */
22:
23: #ifndef lint
24: char copyright[] =
25: "@(#) Copyright (c) 1989 The Regents of the University of California.\n\
26: All rights reserved.\n";
27: #endif /* not lint */
28:
29: #ifndef lint
30: static char sccsid[] = "@(#)locate.code.c 4.8 (Berkeley) 6/1/90";
31: #endif /* not lint */
32:
33: /*
34: * PURPOSE: sorted list compressor (works with a modified 'find'
35: * to encode/decode a filename database)
36: *
37: * USAGE: bigram < list > bigrams
38: * process bigrams (see updatedb) > common_bigrams
39: * code common_bigrams < list > squozen_list
40: *
41: * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1
42: * February/March 1983, p. 8 ). Output format is, per line, an
43: * offset differential count byte followed by a partially bigram-
44: * encoded ascii residue. A bigram is a two-character sequence,
45: * the first 128 most common of which are encoded in one byte.
46: *
47: * EXAMPLE: For simple front compression with no bigram encoding,
48: * if the input is... then the output is...
49: *
50: * /usr/src 0 /usr/src
51: * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c
52: * /usr/src/cmd/armadillo.c 14 armadillo.c
53: * /usr/tmp/zoo 5 tmp/zoo
54: *
55: * The codes are:
56: *
57: * 0-28 likeliest differential counts + offset to make nonnegative
58: * 30 switch code for out-of-range count to follow in next word
59: * 128-255 bigram codes (128 most common, as determined by 'updatedb')
60: * 32-127 single character (printable) ascii residue (ie, literal)
61: *
62: * SEE ALSO: updatedb.csh, bigram.c, find.c
63: *
64: * AUTHOR: James A. Woods, Informatics General Corp.,
65: * NASA Ames Research Center, 10/82
66: */
67:
68: #include <sys/param.h>
69: #include <stdio.h>
70: #include "locate.h"
71:
72: #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */
73:
74: char buf1[MAXPATHLEN] = " ";
75: char buf2[MAXPATHLEN];
76: char bigrams[BGBUFSIZE + 1] = { 0 };
77:
78: main ( argc, argv )
79: int argc; char *argv[];
80: {
81: register char *cp, *oldpath = buf1, *path = buf2;
82: int code, count, diffcount, oldcount = 0;
83: FILE *fp;
84:
85: if ((fp = fopen(argv[1], "r")) == NULL) {
86: printf("Usage: code common_bigrams < list > squozen_list\n");
87: exit(1);
88: }
89: /* first copy bigram array to stdout */
90: fgets ( bigrams, BGBUFSIZE + 1, fp );
91: fwrite ( bigrams, 1, BGBUFSIZE, stdout );
92: fclose( fp );
93:
94: while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) {
95: /* truncate newline */
96: cp = path + strlen(path) - 1;
97: if (cp > path && *cp == '\n')
98: *cp = '\0';
99: /* squelch characters that would botch the decoding */
100: for ( cp = path; *cp != NULL; cp++ ) {
101: if ( (unsigned char)*cp >= PARITY )
102: *cp &= PARITY-1;
103: else if ( *cp <= SWITCH )
104: *cp = '?';
105: }
106: /* skip longest common prefix */
107: for ( cp = path; *cp == *oldpath; cp++, oldpath++ )
108: if ( *oldpath == NULL )
109: break;
110: count = cp - path;
111: diffcount = count - oldcount + OFFSET;
112: oldcount = count;
113: if ( diffcount < 0 || diffcount > 2*OFFSET ) {
114: putc ( SWITCH, stdout );
115: putw ( diffcount, stdout );
116: }
117: else
118: putc ( diffcount, stdout );
119:
120: while ( *cp != NULL ) {
121: if ( *(cp + 1) == NULL ) {
122: putchar ( *cp );
123: break;
124: }
125: if ( (code = bgindex ( cp )) < 0 ) {
126: putchar ( *cp++ );
127: putchar ( *cp++ );
128: }
129: else { /* found, so mark byte with parity bit */
130: putchar ( (code / 2) | PARITY );
131: cp += 2;
132: }
133: }
134: if ( path == buf1 ) /* swap pointers */
135: path = buf2, oldpath = buf1;
136: else
137: path = buf1, oldpath = buf2;
138: }
139: }
140:
141: bgindex ( bg ) /* return location of bg in bigrams or -1 */
142: char *bg;
143: {
144: register char *p;
145: register char bg0 = bg[0], bg1 = bg[1];
146:
147: for ( p = bigrams; *p != NULL; p++ )
148: if ( *p++ == bg0 && *p == bg1 )
149: break;
150: return ( *p == NULL ? -1 : --p - bigrams );
151: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.