|
|
1.1 ! root 1: ! 2: #include <stdio.h> ! 3: #include <ctype.h> ! 4: #include <string.h> ! 5: #include "code.h" ! 6: ! 7: #ifndef __cplusplus ! 8: ! 9: void exit(int); ! 10: void qsort(void*, unsigned, int, int(*)(void*, void*)); ! 11: ! 12: #else ! 13: ! 14: #include <memory.h> ! 15: extern "C" { ! 16: void exit(int); ! 17: void qsort(void*, unsigned, int, int(*)(void*, void*)); ! 18: } ! 19: ! 20: #endif ! 21: ! 22: /* read an annotated spelling list in the form ! 23: word <tab> affixcode [ , affixcode ] ... ! 24: print a reencoded version ! 25: octal <tab> word ! 26: */ ! 27: ! 28: typedef long Bits; ! 29: typedef struct Dict Dict; ! 30: struct Dict ! 31: { ! 32: char* word; ! 33: Bits encode; ! 34: }; ! 35: ! 36: Dict words[200000]; ! 37: char space[500000]; ! 38: Bits encodes[4094]; ! 39: long nspace; ! 40: long nwords; ! 41: int ncodes; ! 42: ! 43: void readinput(FILE*); ! 44: long typecode(char*); ! 45: int wcmp(void*, void*); ! 46: void pdict(void); ! 47: void sput(int); ! 48: ! 49: main(int argc, char *argv[]) ! 50: { ! 51: FILE* f; ! 52: ! 53: nwords = 0; ! 54: nspace = 0; ! 55: ncodes = 0; ! 56: if(argc <= 1) ! 57: readinput(stdin); ! 58: while(argc > 1) { ! 59: f = fopen(argv[1], "r"); ! 60: if(f == 0) { ! 61: fprintf(stderr, "Cannot open %s\n", argv[1]); ! 62: exit(1); ! 63: } ! 64: readinput(f); ! 65: fclose(f); ! 66: argc--; ! 67: argv++; ! 68: } ! 69: fprintf(stderr, "words = %ld; space = %ld; codes = %d\n", ! 70: nwords, nspace, ncodes); ! 71: qsort(words, nwords, sizeof(words[0]), wcmp); ! 72: pdict(); ! 73: return 0; ! 74: } ! 75: ! 76: wcmp(void *a, void *b) ! 77: { ! 78: ! 79: return strcmp(((Dict*)a)->word, ((Dict*)b)->word); ! 80: } ! 81: ! 82: void ! 83: readinput(FILE* f) ! 84: { ! 85: long i; ! 86: char *code, *bword; ! 87: char line[200]; ! 88: long lineno = 0; ! 89: ! 90: while(fgets(line, sizeof(line), f)) { ! 91: line[strlen(line)-1] = 0; ! 92: lineno++; ! 93: code = line; ! 94: while(isspace(*code)) ! 95: code++; ! 96: bword = code; ! 97: while(*code && !isspace(*code)) ! 98: code++; ! 99: ! 100: i = code-bword; ! 101: memcpy(space+nspace, bword, i); ! 102: words[nwords].word = space+nspace; ! 103: nspace += i; ! 104: space[nspace] = 0; ! 105: nspace++; ! 106: ! 107: if(*code) { ! 108: *code++ = 0; ! 109: while(isspace(*code)) ! 110: code++; ! 111: } ! 112: words[nwords].encode = typecode(code); ! 113: nwords++; ! 114: if(nwords >= sizeof(words)/sizeof(words[0])) { ! 115: fprintf(stderr, "words array too small\n"); ! 116: exit(1); ! 117: } ! 118: if(nspace >= sizeof(space)/sizeof(space[0])) { ! 119: fprintf(stderr, "space array too small\n"); ! 120: exit(1); ! 121: } ! 122: } ! 123: } ! 124: ! 125: ! 126: typedef struct Class Class; ! 127: struct Class ! 128: { ! 129: char* codename; ! 130: long bits; ! 131: }; ! 132: Class codea[] = ! 133: { ! 134: { "a", ADJ }, ! 135: { "adv", ADV }, ! 136: 0 ! 137: }; ! 138: Class codec[] = ! 139: { ! 140: { "comp", COMP }, ! 141: 0 ! 142: }; ! 143: Class coded[] = ! 144: { ! 145: { "d", DONT_TOUCH}, ! 146: 0 ! 147: }; ! 148: ! 149: Class codee[] = ! 150: { ! 151: { "ed", ED }, ! 152: { "er", ACTOR }, ! 153: 0 ! 154: }; ! 155: ! 156: Class codei[] = ! 157: { ! 158: { "in", IN }, ! 159: { "ion", ION }, ! 160: 0 ! 161: }; ! 162: ! 163: Class codem[] = ! 164: { ! 165: { "man", MAN }, ! 166: { "ms", MONO }, ! 167: 0 ! 168: }; ! 169: ! 170: Class coden[] = ! 171: { ! 172: { "n", NOUN }, ! 173: { "na", N_AFFIX }, ! 174: { "nopref", NOPREF }, ! 175: 0 ! 176: }; ! 177: ! 178: Class codep[] = ! 179: { ! 180: { "pc", PROP_COLLECT }, ! 181: 0 ! 182: }; ! 183: Class codes[] = ! 184: { ! 185: { "s", STOP }, ! 186: 0 ! 187: }; ! 188: ! 189: Class codev[] = ! 190: { ! 191: { "v", VERB }, ! 192: { "va", V_AFFIX }, ! 193: { "vi", V_IRREG }, ! 194: 0 ! 195: }; ! 196: ! 197: Class codey[] = ! 198: { ! 199: { "y", _Y }, ! 200: 0 ! 201: }; ! 202: ! 203: Class codez[] = ! 204: { ! 205: 0 ! 206: }; ! 207: Class* codetab[] = ! 208: { ! 209: codea, ! 210: codez, ! 211: codec, ! 212: coded, ! 213: codee, ! 214: codez, ! 215: codez, ! 216: codez, ! 217: codei, ! 218: codez, ! 219: codez, ! 220: codez, ! 221: codem, ! 222: coden, ! 223: codez, ! 224: codep, ! 225: codez, ! 226: codez, ! 227: codes, ! 228: codez, ! 229: codez, ! 230: codev, ! 231: codez, ! 232: codez, ! 233: codey, ! 234: codez, ! 235: }; ! 236: ! 237: long ! 238: typecode(char *str) ! 239: { ! 240: Class *p; ! 241: long code; ! 242: int n, i; ! 243: char *s, *sp, *st; ! 244: ! 245: code = 0; ! 246: ! 247: loop: ! 248: for(s=str; *s != 0 && *s != ','; s++) ! 249: ; ! 250: for(p = codetab[*str-'a']; sp = p->codename; p++) { ! 251: st = str; ! 252: for(n=s-str;; st++,sp++) { ! 253: if(*st != *sp) ! 254: goto cont; ! 255: n--; ! 256: if(n == 0) ! 257: break; ! 258: } ! 259: code |= p->bits; ! 260: if(*s == 0) ! 261: goto out; ! 262: str = s+1; ! 263: goto loop; ! 264: cont:; ! 265: } ! 266: fprintf(stderr, "Unknown affix code \"%s\"\n", str); ! 267: return 0; ! 268: out: ! 269: for(i=0; i<ncodes; i++) ! 270: if(encodes[i] == code) ! 271: return i; ! 272: encodes[i] = code; ! 273: ncodes++; ! 274: return i; ! 275: } ! 276: ! 277: void ! 278: sput(int s) ! 279: { ! 280: ! 281: putchar(s>>8); ! 282: putchar(s); ! 283: } ! 284: ! 285: void ! 286: lput(long l) ! 287: { ! 288: putchar(l>>24); ! 289: putchar(l>>16); ! 290: putchar(l>>8); ! 291: putchar(l); ! 292: } ! 293: ! 294: /* ! 295: * spit out the encoded dictionary ! 296: * all numbers are encoded big-endian. ! 297: * struct ! 298: * { ! 299: * short ncodes; ! 300: * int encodes[ncodes]; ! 301: * struct ! 302: * { ! 303: * short encode; ! 304: * char word[*]; ! 305: * } words[*]; ! 306: * }; ! 307: * 0x8000 flag for code word ! 308: * 0x7800 count of number of common bytes with previous word ! 309: * 0x07ff index into codes array for affixes ! 310: */ ! 311: void ! 312: pdict(void) ! 313: { ! 314: long i, count; ! 315: Bits encode; ! 316: int j, c; ! 317: char *lastword, *thisword, *word; ! 318: ! 319: sput(ncodes); ! 320: for(i=0; i<ncodes; i++) ! 321: lput(encodes[i]); ! 322: ! 323: count = ncodes*4 + 2; ! 324: lastword = ""; ! 325: for(i=0; i<nwords; i++) { ! 326: word = words[i].word; ! 327: thisword = word; ! 328: for(j=0; *thisword == *lastword; j++) { ! 329: if(*thisword == 0) { ! 330: fprintf(stderr, "identical words: %s\n", word); ! 331: break; ! 332: } ! 333: thisword++; ! 334: lastword++; ! 335: } ! 336: if(j > 15) ! 337: j = 15; ! 338: encode = words[i].encode; ! 339: c = (1<<15) | (j<<11) | encode; ! 340: sput(c); ! 341: count += 2; ! 342: for(thisword=word+j; c = *thisword; thisword++) { ! 343: putchar(c); ! 344: count++; ! 345: } ! 346: lastword = word; ! 347: } ! 348: fprintf(stderr, "output bytes = %ld\n", count); ! 349: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.