|
|
1.1 root 1:
2: #include <stdio.h>
3: #include <ctype.h>
4: #include <string.h>
5: #include "code.h"
6:
7: #ifndef __cplusplus
8:
9: void exit(int);
10: void qsort(void*, unsigned, int, int(*)(void*, void*));
11:
12: #else
13:
14: #include <memory.h>
15: extern "C" {
16: void exit(int);
17: void qsort(void*, unsigned, int, int(*)(void*, void*));
18: }
19:
20: #endif
21:
/* read an annotated spelling list in the form
 *	word <tab> affixcode [ , affixcode ] ...
 * and write a re-encoded version to standard output
 * as a binary dictionary (the layout is described in
 * the comment above pdict)
 */
27:
/* Bit mask of affix classes; typecode() ORs Class.bits values into one. */
typedef long Bits;

/*
 * One dictionary entry.
 *	word	NUL-terminated text stored inside space[]
 *	encode	value returned by typecode() for the entry,
 *		i.e. an index into encodes[]
 */
typedef struct Dict
{
	char	*word;
	Bits	encode;
} Dict;

Dict	words[200000];		/* every entry read so far */
char	space[500000];		/* backing store for the word text */
Bits	encodes[4094];		/* distinct affix masks, in order of first use */
long	nspace, nwords;		/* bytes used in space[], entries used in words[] */
int	ncodes;			/* entries used in encodes[] */
42:
/* forward declarations for the functions defined below */
void	readinput(FILE *f);		/* load one word list into words[] */
long	typecode(char *str);		/* affix code list -> index into encodes[] */
int	wcmp(void *a, void *b);		/* qsort comparison on Dict.word */
void	pdict(void);			/* write the encoded dictionary */
void	sput(int s);			/* write 16 bits, high byte first */
48:
49: main(int argc, char *argv[])
50: {
51: FILE* f;
52:
53: nwords = 0;
54: nspace = 0;
55: ncodes = 0;
56: if(argc <= 1)
57: readinput(stdin);
58: while(argc > 1) {
59: f = fopen(argv[1], "r");
60: if(f == 0) {
61: fprintf(stderr, "Cannot open %s\n", argv[1]);
62: exit(1);
63: }
64: readinput(f);
65: fclose(f);
66: argc--;
67: argv++;
68: }
69: fprintf(stderr, "words = %ld; space = %ld; codes = %d\n",
70: nwords, nspace, ncodes);
71: qsort(words, nwords, sizeof(words[0]), wcmp);
72: pdict();
73: return 0;
74: }
75:
76: wcmp(void *a, void *b)
77: {
78:
79: return strcmp(((Dict*)a)->word, ((Dict*)b)->word);
80: }
81:
82: void
83: readinput(FILE* f)
84: {
85: long i;
86: char *code, *bword;
87: char line[200];
88: long lineno = 0;
89:
90: while(fgets(line, sizeof(line), f)) {
91: line[strlen(line)-1] = 0;
92: lineno++;
93: code = line;
94: while(isspace(*code))
95: code++;
96: bword = code;
97: while(*code && !isspace(*code))
98: code++;
99:
100: i = code-bword;
101: memcpy(space+nspace, bword, i);
102: words[nwords].word = space+nspace;
103: nspace += i;
104: space[nspace] = 0;
105: nspace++;
106:
107: if(*code) {
108: *code++ = 0;
109: while(isspace(*code))
110: code++;
111: }
112: words[nwords].encode = typecode(code);
113: nwords++;
114: if(nwords >= sizeof(words)/sizeof(words[0])) {
115: fprintf(stderr, "words array too small\n");
116: exit(1);
117: }
118: if(nspace >= sizeof(space)/sizeof(space[0])) {
119: fprintf(stderr, "space array too small\n");
120: exit(1);
121: }
122: }
123: }
124:
125:
/*
 * One affix code: the name as written in the input list and the
 * bit mask that typecode() ORs into the word's code for it.
 */
typedef struct Class
{
	char	*codename;
	long	bits;
} Class;
132: Class codea[] =
133: {
134: { "a", ADJ },
135: { "adv", ADV },
136: 0
137: };
138: Class codec[] =
139: {
140: { "comp", COMP },
141: 0
142: };
143: Class coded[] =
144: {
145: { "d", DONT_TOUCH},
146: 0
147: };
148:
149: Class codee[] =
150: {
151: { "ed", ED },
152: { "er", ACTOR },
153: 0
154: };
155:
156: Class codei[] =
157: {
158: { "in", IN },
159: { "ion", ION },
160: 0
161: };
162:
163: Class codem[] =
164: {
165: { "man", MAN },
166: { "ms", MONO },
167: 0
168: };
169:
170: Class coden[] =
171: {
172: { "n", NOUN },
173: { "na", N_AFFIX },
174: { "nopref", NOPREF },
175: 0
176: };
177:
178: Class codep[] =
179: {
180: { "pc", PROP_COLLECT },
181: 0
182: };
183: Class codes[] =
184: {
185: { "s", STOP },
186: 0
187: };
188:
189: Class codev[] =
190: {
191: { "v", VERB },
192: { "va", V_AFFIX },
193: { "vi", V_IRREG },
194: 0
195: };
196:
197: Class codey[] =
198: {
199: { "y", _Y },
200: 0
201: };
202:
203: Class codez[] =
204: {
205: 0
206: };
207: Class* codetab[] =
208: {
209: codea,
210: codez,
211: codec,
212: coded,
213: codee,
214: codez,
215: codez,
216: codez,
217: codei,
218: codez,
219: codez,
220: codez,
221: codem,
222: coden,
223: codez,
224: codep,
225: codez,
226: codez,
227: codes,
228: codez,
229: codez,
230: codev,
231: codez,
232: codez,
233: codey,
234: codez,
235: };
236:
237: long
238: typecode(char *str)
239: {
240: Class *p;
241: long code;
242: int n, i;
243: char *s, *sp, *st;
244:
245: code = 0;
246:
247: loop:
248: for(s=str; *s != 0 && *s != ','; s++)
249: ;
250: for(p = codetab[*str-'a']; sp = p->codename; p++) {
251: st = str;
252: for(n=s-str;; st++,sp++) {
253: if(*st != *sp)
254: goto cont;
255: n--;
256: if(n == 0)
257: break;
258: }
259: code |= p->bits;
260: if(*s == 0)
261: goto out;
262: str = s+1;
263: goto loop;
264: cont:;
265: }
266: fprintf(stderr, "Unknown affix code \"%s\"\n", str);
267: return 0;
268: out:
269: for(i=0; i<ncodes; i++)
270: if(encodes[i] == code)
271: return i;
272: encodes[i] = code;
273: ncodes++;
274: return i;
275: }
276:
/*
 * Write the low 16 bits of s to standard output as two bytes,
 * most significant byte first.
 */
void
sput(int s)
{
	int high;

	high = s >> 8;
	putchar(high);
	putchar(s);	/* putchar keeps only the low byte */
}
284:
/*
 * Write the low 32 bits of l to standard output as four bytes,
 * most significant byte first.
 */
void
lput(long l)
{
	int shift;

	/* 24, 16, 8, 0: one output byte per step */
	for(shift = 24; shift >= 0; shift -= 8)
		putchar(l >> shift);
}
293:
294: /*
295: * spit out the encoded dictionary
296: * all numbers are encoded big-endian.
297: * struct
298: * {
299: * short ncodes;
300: * int encodes[ncodes];
301: * struct
302: * {
303: * short encode;
304: * char word[*];
305: * } words[*];
306: * };
307: * 0x8000 flag for code word
308: * 0x7800 count of number of common bytes with previous word
309: * 0x07ff index into codes array for affixes
310: */
311: void
312: pdict(void)
313: {
314: long i, count;
315: Bits encode;
316: int j, c;
317: char *lastword, *thisword, *word;
318:
319: sput(ncodes);
320: for(i=0; i<ncodes; i++)
321: lput(encodes[i]);
322:
323: count = ncodes*4 + 2;
324: lastword = "";
325: for(i=0; i<nwords; i++) {
326: word = words[i].word;
327: thisword = word;
328: for(j=0; *thisword == *lastword; j++) {
329: if(*thisword == 0) {
330: fprintf(stderr, "identical words: %s\n", word);
331: break;
332: }
333: thisword++;
334: lastword++;
335: }
336: if(j > 15)
337: j = 15;
338: encode = words[i].encode;
339: c = (1<<15) | (j<<11) | encode;
340: sput(c);
341: count += 2;
342: for(thisword=word+j; c = *thisword; thisword++) {
343: putchar(c);
344: count++;
345: }
346: lastword = word;
347: }
348: fprintf(stderr, "output bytes = %ld\n", count);
349: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.