researchv10no/cmd/spell/pcode.c - annotate

Return to pcode.c CVS log
Up to [Research Unix] / researchv10no / cmd / spell
Annotation of researchv10no/cmd/spell/pcode.c, revision 1.1.1.1

1.1       root        1: 
                      2: #include <stdio.h>
                      3: #include <ctype.h>
                      4: #include <string.h>
                      5: #include "code.h"
                      6: 
                      7: #ifndef __cplusplus
                      8: 
                      9: void   exit(int);
                     10: void   qsort(void*, unsigned, int, int(*)(void*, void*));
                     11: 
                     12: #else
                     13: 
                     14: #include <memory.h>
                     15: extern "C" {
                     16: void   exit(int);
                     17: void   qsort(void*, unsigned, int, int(*)(void*, void*));
                     18: }
                     19: 
                     20: #endif
                     21: 
                     /* read an annotated spelling list in the form
                            word <tab> affixcode [ , affixcode ] ...
                        and write a re-encoded, binary version of it on standard
                        output (the byte layout is given in the comment above pdict)
                      */
                     27: 
                     typedef long Bits;      /* a set of affix flag bits, or an index into encodes[] */

                     /* One dictionary entry. */
                     typedef struct Dict
                     {
                             char *word;     /* NUL-terminated spelling, stored inside space[] */
                             Bits encode;    /* value typecode() returned for the word's affix list */
                     } Dict;

                     Dict words[200000];     /* entries filled by readinput(), sorted by main() */
                     char space[500000];     /* pool the word strings are packed into */
                     Bits encodes[4094];     /* distinct affix bit combinations found by typecode() */
                     long nspace;            /* bytes of space[] in use */
                     long nwords;            /* entries of words[] in use */
                     int ncodes;             /* entries of encodes[] in use */

                     void readinput(FILE *f);
                     long typecode(char *str);
                     int wcmp(void *a, void *b);
                     void pdict(void);
                     void sput(int s);
                     48: 
                     49: main(int argc, char *argv[])
                     50: {
                     51:        FILE* f;
                     52: 
                     53:        nwords = 0;
                     54:        nspace = 0;
                     55:        ncodes = 0;
                     56:        if(argc <= 1)
                     57:                readinput(stdin);
                     58:        while(argc > 1) {
                     59:                f = fopen(argv[1], "r");
                     60:                if(f == 0) {
                     61:                        fprintf(stderr, "Cannot open %s\n", argv[1]);
                     62:                        exit(1);
                     63:                }
                     64:                readinput(f);
                     65:                fclose(f);
                     66:                argc--;
                     67:                argv++;
                     68:        }
                     69:        fprintf(stderr, "words = %ld; space = %ld; codes = %d\n",
                     70:                nwords, nspace, ncodes);
                     71:        qsort(words, nwords, sizeof(words[0]), wcmp);
                     72:        pdict();
                     73:        return 0;
                     74: }
                     75: 
                     76: wcmp(void *a, void *b)
                     77: {
                     78: 
                     79:        return strcmp(((Dict*)a)->word, ((Dict*)b)->word);
                     80: }
                     81: 
                     82: void
                     83: readinput(FILE* f)
                     84: {
                     85:        long i;
                     86:        char *code, *bword;
                     87:        char line[200];
                     88:        long lineno = 0;
                     89: 
                     90:        while(fgets(line, sizeof(line), f)) {
                     91:                line[strlen(line)-1] = 0;
                     92:                lineno++;
                     93:                code = line;
                     94:                while(isspace(*code))
                     95:                        code++;
                     96:                bword = code;
                     97:                while(*code && !isspace(*code))
                     98:                        code++;
                     99: 
                    100:                i = code-bword;
                    101:                memcpy(space+nspace, bword, i);
                    102:                words[nwords].word = space+nspace;
                    103:                nspace += i;
                    104:                space[nspace] = 0;
                    105:                nspace++;
                    106: 
                    107:                if(*code) {
                    108:                        *code++ = 0;
                    109:                        while(isspace(*code))
                    110:                                code++;
                    111:                }
                    112:                words[nwords].encode = typecode(code);
                    113:                nwords++;
                    114:                if(nwords >= sizeof(words)/sizeof(words[0])) {
                    115:                        fprintf(stderr, "words array too small\n");
                    116:                        exit(1);
                    117:                }
                    118:                if(nspace >= sizeof(space)/sizeof(space[0])) {
                    119:                        fprintf(stderr, "space array too small\n");
                    120:                        exit(1);
                    121:                }
                    122:        }
                    123: }
                    124: 
                    125: 
                    126: typedef        struct  Class   Class;
                    127: struct Class
                    128: {
                    129:        char*   codename;
                    130:        long    bits;
                    131: };
                    132: Class  codea[]  =
                    133: {
                    134:        { "a", ADJ },
                    135:        { "adv", ADV },
                    136:        0
                    137: };
                    138: Class  codec[] =
                    139: {
                    140:        { "comp", COMP },
                    141:        0
                    142: };
                    143: Class  coded[] =
                    144: {
                    145:        { "d", DONT_TOUCH},
                    146:        0
                    147: };
                    148: 
                    149: Class  codee[] =
                    150: {
                    151:        { "ed", ED },
                    152:        { "er", ACTOR },
                    153:        0
                    154: };
                    155: 
                    156: Class  codei[] =
                    157: {
                    158:        { "in", IN },
                    159:        { "ion", ION },
                    160:        0
                    161: };
                    162: 
                    163: Class  codem[] =
                    164: {
                    165:        { "man", MAN },
                    166:        { "ms", MONO },
                    167:        0
                    168: };
                    169: 
                    170: Class  coden[] =
                    171: {
                    172:        { "n", NOUN },
                    173:        { "na", N_AFFIX },
                    174:        { "nopref", NOPREF },
                    175:        0
                    176: };
                    177: 
                    178: Class  codep[] =
                    179: {
                    180:        { "pc", PROP_COLLECT },
                    181:        0
                    182: };
                    183: Class  codes[] =
                    184: {
                    185:        { "s", STOP },
                    186:        0
                    187: };
                    188: 
                    189: Class  codev[] =
                    190: {
                    191:        { "v", VERB },
                    192:        { "va", V_AFFIX },
                    193:        { "vi", V_IRREG },
                    194:        0
                    195: };
                    196: 
                    197: Class  codey[] =
                    198: {
                    199:        { "y", _Y },
                    200:        0
                    201: };
                    202: 
                    203: Class  codez[] =
                    204: {
                    205:        0
                    206: };
                    207: Class* codetab[] =
                    208: {
                    209:        codea,
                    210:        codez,
                    211:        codec,
                    212:        coded,
                    213:        codee,
                    214:        codez,
                    215:        codez,
                    216:        codez,
                    217:        codei,
                    218:        codez,
                    219:        codez,
                    220:        codez,
                    221:        codem,
                    222:        coden,
                    223:        codez,
                    224:        codep,
                    225:        codez,
                    226:        codez,
                    227:        codes,
                    228:        codez,
                    229:        codez,
                    230:        codev,
                    231:        codez,
                    232:        codez,
                    233:        codey,
                    234:        codez,
                    235: };
                    236: 
                    237: long
                    238: typecode(char *str)
                    239: {
                    240:        Class *p;
                    241:        long code;
                    242:        int n, i;
                    243:        char *s, *sp, *st;
                    244: 
                    245:        code = 0;
                    246: 
                    247: loop:
                    248:        for(s=str; *s != 0 && *s != ','; s++)
                    249:                ;
                    250:        for(p = codetab[*str-'a']; sp = p->codename; p++) {
                    251:                st = str;
                    252:                for(n=s-str;; st++,sp++) {
                    253:                        if(*st != *sp)
                    254:                                goto cont;
                    255:                        n--;
                    256:                        if(n == 0)
                    257:                                break;
                    258:                }
                    259:                code |= p->bits;
                    260:                if(*s == 0)
                    261:                        goto out;
                    262:                str = s+1;
                    263:                goto loop;
                    264:        cont:;
                    265:        }
                    266:        fprintf(stderr, "Unknown affix code \"%s\"\n", str);
                    267:        return 0;
                    268: out:
                    269:        for(i=0; i<ncodes; i++)
                    270:                if(encodes[i] == code)
                    271:                        return i;
                    272:        encodes[i] = code;
                    273:        ncodes++;
                    274:        return i;
                    275: }
                    276: 
                    /* Write the low 16 bits of s on standard output, high byte first. */
                    void
                    sput(int s)
                    {
                            int shift;

                            for(shift = 8; shift >= 0; shift -= 8)
                                    putchar(s >> shift);
                    }
                    284: 
                    /* Write the low 32 bits of l on standard output, high byte first. */
                    void
                    lput(long l)
                    {
                            int shift;

                            for(shift = 24; shift >= 0; shift -= 8)
                                    putchar(l >> shift);
                    }
                    293: 
                    294: /*
                    295:  * spit out the encoded dictionary
                    296:  * all numbers are encoded big-endian.
                    297:  *     struct
                    298:  *     {
                    299:  *             short   ncodes;
                    300:  *             int     encodes[ncodes];
                    301:  *             struct
                    302:  *             {
                    303:  *                     short   encode;
                    304:  *                     char    word[*];
                    305:  *             } words[*];
                    306:  *     };
                    307:  * 0x8000 flag for code word
                    308:  * 0x7800 count of number of common bytes with previous word
                    309:  * 0x07ff index into codes array for affixes
                    310:  */
                    311: void
                    312: pdict(void)
                    313: {
                    314:        long i, count;
                    315:        Bits encode;
                    316:        int j, c;
                    317:        char *lastword, *thisword, *word;
                    318: 
                    319:        sput(ncodes);
                    320:        for(i=0; i<ncodes; i++)
                    321:                lput(encodes[i]);
                    322: 
                    323:        count = ncodes*4 + 2;
                    324:        lastword = "";
                    325:        for(i=0; i<nwords; i++) {
                    326:                word = words[i].word;
                    327:                thisword = word;
                    328:                for(j=0; *thisword == *lastword; j++) {
                    329:                        if(*thisword == 0) {
                    330:                                fprintf(stderr, "identical words: %s\n", word);
                    331:                                break;
                    332:                        }
                    333:                        thisword++;
                    334:                        lastword++;
                    335:                }
                    336:                if(j > 15)
                    337:                        j = 15;
                    338:                encode = words[i].encode;
                    339:                c = (1<<15) | (j<<11) | encode;
                    340:                sput(c);
                    341:                count += 2;
                    342:                for(thisword=word+j; c = *thisword; thisword++) {
                    343:                        putchar(c);
                    344:                        count++;
                    345:                }
                    346:                lastword = word;
                    347:        }
                    348:        fprintf(stderr, "output bytes = %ld\n", count);
                    349: }
unix.superglobalmegacorp.com
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.