|
|
%{
/*
 * break out words, output cap + word(inverted)
 *
 * Lexical front end: reads text on stdin (or the files named on the
 * command line) and writes one token per line.
 *
 *   OUT1(c)  prints "<c>:<yytext>"   - token whose class character c is known
 *   OUT()    prints yytext reversed  - word not found by lookup() or the
 *                                      suffix tables
 *   OUTN(s)  prints the fixed string s (fin, nt, qs below)
 *
 * The class names (NOUN, POS, NOUN_ADJ, END, ...) are expected to come from
 * names.h; lookup(), abbrev(), getd(), getab(), ygetd() and the suffix
 * functions (cy, fy, ly, ...) from the included .c files.
 *
 * Many actions edit yytext/yyleng in place and jump to the shared label
 * "wd" inside the {L}+ action; this relies on all actions being emitted
 * into one function by lex.
 */
#include <stdio.h>
#include <ctype.h>
/* wrapped in do/while so the two statements always travel together */
#define OUT()	do { for(i=yyleng-1;i>=0; i--)putchar(yytext[i]); putchar('\n'); } while(0)
#define OUT1(nam)	printf("%c:%s\n",nam,yytext)
#define OUTN(string)	printf("%s\n",string)
#include "names.h"
#include "nhash.c"
#include "dict.c"
#include "ydict.c"
#include "abbrev.c"
char nt[] = "D:n't";	/* emitted after a verb that carried n't */
char qs[] = "c:'s";	/* emitted after a word that carried 's */
char fin[] = "E:.";	/* sentence terminator record */
int NOCAPS = 0;	/* if set all caps are turned to lower case */
int i,j;	/* scratch; i is also used by OUT() */
int dot = 0;	/* set when a token ended in '.' that may end the sentence */
int first = 1;	/* set after sentence-ending tokens; cleared by the next word */
int qflg,nflg;	/* qflg: inside a quotation; nflg: pending n't record */
int cap = 0;	/* current word had its leading capital folded to lower case */
int look(), pos(), ahead();	/* defined after the rules, used in them */
%}
%p 3000
%a 3300
%o 4500

L	[a-z]
N	[0-9]
C	[A-Z]
A	[a-zA-Z]
P	[a-zA-Z0-9]

%%
^[.!].+[\n]	{
		/* line starting with '.' or '!': pass through prefixed by ':' */
		if(dot){
			OUTN(fin);
			dot = 0;
			first = 1;
		}
		printf(":%s",yytext);
	}
May	{
		if(first == 0){
			OUT1(NOUN);
		}
		else {
			/* after a sentence break: treat as the ordinary word "may" */
			first = 0;
			yytext[0] = tolower(yytext[0]);
			cap = 1;
			goto wd;
		}
	}
"U.S."	{
		OUT1(NOUN);
	}
{C}{L}*'[s]	{
		pos(1);
		if(first==1)first=0;
	}
{C}+['][s]	{
		if(NOCAPS)
			for(i=0;i<yyleng;i++)
				if(isupper(yytext[i]))yytext[i] = tolower(yytext[i]);
		OUT1(POS);
	}
{P}+([-]{P}+)+	{
		/* hyphenated compound */
		if(NOCAPS)
			for(i=0;i<yyleng;i++)
				if(isupper(yytext[i]))yytext[i] = tolower(yytext[i]);
		OUT1(NOUN_ADJ);
	}
{C}{C}+	{
		/* run of capitals: look at the next character to classify it */
		if(NOCAPS)
			for(i=0;i<yyleng;i++)
				yytext[i] = tolower(yytext[i]);
		if((i=input()) == 's'){
			yytext[yyleng++] = 's';
			yytext[yyleng] = '\0';
			OUT1(PNOUN);
		}
		else if(i == '&'){
			/* absorb "&" and any capitals following it */
			yytext[yyleng++] = i;
			if(isupper(i=input())){
				yytext[yyleng++] = i;
				while(isupper(i=input()))
					yytext[yyleng++] = i;
			}
			yytext[yyleng] = '\0';
			unput(i);
			OUT1(NOUN);
		}
		else {
			unput(i);
			/* fold to lower case (already done above when NOCAPS) */
			if(!NOCAPS)
				for(i=0;i<yyleng;i++)yytext[i] = tolower(yytext[i]);
			goto wd;
		}
	}
[LD][']{C}{L}*	{
		if(NOCAPS){
			yytext[0] = tolower(yytext[0]);
			yytext[2] = tolower(yytext[2]);
		}
		OUT1(NOUN_ADJ);
	}
{C}{L}*	{
		/* capitalised word: only remember the capital when first is clear */
		if(first==1)
			first=0;
		else cap = 1;
		if(yyleng==1 && yytext[0] == 'I'){
			cap = 0;
			goto wd;
		}
		yytext[0] = tolower(yytext[0]);
		goto wd;
	}
{N}":"{N}{N}	{
		OUT1(NOUN_ADJ);
	}
({N}*[,])*({N}+".")+[ \t\n]+{C}	{
		/* number, '.', white space, capital: the '.' ends the sentence.
		 * Cut yytext at the last '.', give the capital back to the input. */
		for(i=yyleng-1;i>0;i--)
			if(yytext[i] == '.')break;
		unput(yytext[yyleng-1]);
		yytext[i] = '\0';
		OUT1(NOUN_ADJ);
		OUTN(fin);
		first = 1;
	}
([hH]e"/"[sS]he)|([sS]he"/"[hH]e)	{
		if(NOCAPS)
			if(isupper(yytext[0]))yytext[0] = tolower(yytext[0]);
		OUT1(PRONS);
	}
([hH]is"/"[hH]er)|([hH]er"/"[hH]is)	{
		if(NOCAPS)
			if(isupper(yytext[0]))yytext[0] = tolower(yytext[0]);
		OUT1(POS);
	}
[ \t`]*[a-zA-Z0-9.]*("\/"[a-zA-Z0-9.]+)+[']*	{
		/* slash-separated token */
		if(yytext[yyleng-1] == '.'){
			if(ahead() == 0)dot=1;
		}
		if(NOCAPS)
			for(i=0;i<yyleng;i++)
				if(isupper(yytext[i]))yytext[i] = tolower(yytext[i]);
		OUT1(NOUN_ADJ);
	}
{N}+([,]{N}+)*("."{N}+)*[']*[s]*	{
		OUT1(NOUN_ADJ);
	}
{N}*([,]{N}+)*("."{N}+)+[']*[s]*	{
		OUT1(NOUN_ADJ);
	}
{N}+([,]{N}+)*("."{N}*)*[']*[s]*	{
		if(yytext[yyleng-1] == '.')dot=1;
		OUT1(NOUN_ADJ);
	}
({A}*{N}+{A}*)+	{
		/* Peek at the next character.  A '.' may glue more text onto
		 * this token (see ahead()); anything else must be handed back,
		 * otherwise the punctuation following the token is lost. */
		if((i = input()) == '.')
			ahead();
		else if(i > 0)
			unput(i);
		if(NOCAPS)
			for(i=0;i<yyleng;i++)
				if(isupper(yytext[i]))yytext[i]=tolower(yytext[i]);
		OUT1(NOUN_ADJ);
	}
{N}+[%]	{
		OUT1(NOUN_ADJ);
	}
"$"{N}+([,]{N}+)*("."{N}*)*	{
		if(yytext[yyleng-1] == '.')dot=1;
		OUT1(NOUN);
	}
[Aa]"."[ ]*[Mm]"."	{
		OUT1(ADJ_ADV);
	}
[Pp]"."[ ]*[Mm]"."	{
		OUT1(ADJ_ADV);
	}
"a."[ ]*"d."	{
		OUT1(ADJ_ADV);
	}
"b."[ ]*"c."	{
		OUT1(ADJ_ADV);
	}
"i."[ ]*"e."	{
		OUT1(PREP);
	}
"e."[ ]*"g."	{
		OUT1(PREP);
	}
"etc."[ \n]*[,)]*	{
		/* print "etc." alone, then classify by the last matched char */
		i = yytext[4];
		yytext[4] = '\0';
		OUT1(NOUN);
		yytext[4] = i;
		yytext[0] = yytext[yyleng-1];
		yytext[1] = '\0';
		if(yytext[0] == ',' || yytext[0] == ')')
			OUT1(',');
		else {
			OUTN(fin);
			first = 1;
		}
	}
"et al."	{
		OUT1(NOUN);
	}
in"."[ \n]*{C}	{
		/* "in." followed by a capital: preposition ending a sentence */
		unput(yytext[yyleng-1]);
		yytext[2] = '\0';
		OUT1(PREP);
		OUTN(fin);
		first = 1;
	}
Ph"."[ ]*[Dd]"."	{
		OUT1(ADJ);
	}
[A-Z]"."	{
		dot=1;
		OUT1(NOUN);
	}
can't	{
		/* leave "can" in yytext; wd emits the n't record via nflg */
		yytext[3]='\0';
		yyleng -= 2;
		nflg=1;
		goto wd;
	}
won't	{
		OUT1('X');
	}
ain't	{
		OUT1('g');
	}
[A-Z]*{L}+n't	{
		if(isupper(yytext[0]))
			yytext[0] = tolower(yytext[0]);
		nflg=1;
		yytext[yyleng-3]='\0';
		yyleng -= 3;
		goto wd;
	}
o'clock	{
		OUT1(ADV);
	}
{L}+'[s]	{
		pos(0);
	}
'll	{
		OUT1(lookup("will",1,0));
	}
've	{
		OUT1(lookup("have",1,0));
	}
're	{
		OUT1(lookup("are",1,0));
	}
'd	{
		OUT1(lookup("had",1,0));
	}
'm	{
		OUT1(lookup("am",1,0));
	}
'ld	{
		OUT1(lookup("would",1,0));
	}
{L}+	{
	wd:
		/* common word handling; other actions jump here with yytext
		 * already folded to lower case and cap/nflg set as needed */
		if((j = lookup(yytext,1,0)) != 0){
			first=0;
			if(cap){
				if(!NOCAPS)
					yytext[0] = toupper(yytext[0]);
				cap = 0;
				if(dot)OUTN(fin);
			}
			dot=0;
			OUT1(j);
			if(nflg==1){
				nflg=0;
				OUTN(nt);
			}
		}
		else{
			first = dot=0;
			if(yytext[yyleng-1] == 'y' && cap == 0){
				/* word in -y: try the suffix tables.  The length
				 * tests keep the yyleng-2 / yyleng-3 reads inside
				 * yytext for one- and two-letter words. */
				switch(yyleng > 1 ? yytext[yyleng-2] : '\0'){
				case 'c':	look(cy,yyleng-2,NOUN);
					break;
				case 'f':	look(fy,yyleng-2,VERB);
					break;
				case 'l':	look(ly,yyleng-2,ADV);
					break;
				case 'g':	if(yyleng > 2 && yytext[yyleng-3] == 'o'){
						OUT1(NOUN);
						break;
					}
					look(gy,yyleng-2,ADJ);
					break;
				case 'r':	switch(yyleng > 2 ? yytext[yyleng-3] : '\0'){
					case 'a':	look(ary,yyleng-3,ADJ);
						break;
					case 'o':	look(ory,yyleng-3,ADJ);
						break;
					case 'e':	look(ery,yyleng-3,NOUN);
						break;
					default:	look(ry,yyleng-2,NOUN);
					}
					break;
				case 't':	if(yyleng > 2 && yytext[yyleng-3] == 'i')look(ity,yyleng-3,NOUN);
					else look(ty,yyleng-2,ADJ);
					break;
				default:	OUT();
				}
			}
			else {
				if(cap){
					if(!NOCAPS)yytext[0] = toupper(yytext[0]);
					cap = 0;
					OUT1(NOUN_ADJ);
				}
				else {
					OUT();
				}
			}
		}
	}
[\n]	;
[ ]+	;
[\t]+	;
";"	{
		OUT1(';');
		first=1;
	}
(\"|`|')+	{
		/* any run of quote characters toggles qflg */
		if(dot){
			OUTN(fin);
			dot=0;
		}
		if(qflg==1){
			qflg=0;
			OUT1('"');
		}
		else {
			qflg=1;
			first=1;
			OUT1('"');
		}
	}
".\""	{
		qflg=0;
		first=1;
		OUT1(END);
	}
"..."	{
		OUT1(',');
	}
"~."	{
		first = 1;
		OUT1(END);
	}
{A}{A}+"."	{
		/* word followed by '.': known abbreviation, word glued to
		 * following text, or plain word at the end of a sentence */
		yytext[yyleng-1] = '\0';
		if((j=abbrev(yytext,1,0)) != 0){
			if(isupper(yytext[0])){
				if(NOCAPS)yytext[0] = tolower(yytext[0]);
				if(first == 1)first=0;
			}
			yytext[yyleng-1] = '.';
			OUT1(j);
		}
		else {
			j = ahead();
			/* ahead() returned 0: the '.' was pushed back, drop it
			 * from the length and treat the rest as a word */
			if(j == 0)
				yyleng--;
			for(i=0;i<yyleng;i++)
				if(isupper(yytext[i])){
					yytext[i] = tolower(yytext[i]);
					if(i == 0)cap = 1;
					else cap = 0;
				}
			if(j == 0)goto wd;
			if(cap)
				if(!NOCAPS)yytext[0] = toupper(yytext[0]);
			OUT1(NOUN_ADJ);
		}
	}
"."	{
		first=1;
		OUT1(END);
	}
"!\""	{
		qflg=0;
		first=1;
		OUT1(END);
	}
"!"	{
		first=1;
		OUT1(END);
	}
"?\""	{
		qflg=0;
		first=1;
		OUT1(END);
	}
"?"	{
		first=1;
		OUT1(END);
	}
":"	{
		OUT1(',');
		first=1;
	}
[-]+	{
		OUT1(',');
		first=1;
	}
","	{
		OUT1(',');
	}
(\[|\(|\{|\]|\)|\})	{
		OUT1(',');
	}
.	{
		/* fprintf(stderr,"nwords funny char: %c\n",yytext[0])*/ ;
	}
%%
/*
 * look - classify yytext through a suffix-specific lookup function.
 *
 *	f	lookup function, called as (*f)(stem,1,0)
 *	n	index at which yytext is cut to form the stem
 *	cc	class character printed when f returns 0
 *
 * yytext is cut at n only for the duration of the call to f, so the
 * whole word is printed.
 */
look(f,n,cc)
char (*f)();
int n;
char cc;
{
	int nn;
	char save;
	save=yytext[n];
	yytext[n] = '\0';
	nn=(*f)(yytext,1,0);
	yytext[n] = save;
	if(nn != 0){
		OUT1(nn);
	}
	else {
		OUT1(cc);
	}
}
/*
 * pos - handle a token ending in 's.
 *
 *	flg	1 when the token starts with a capital letter, which is
 *		folded to lower case for the lookup
 *
 * If the part before the apostrophe is found by lookup() it is printed
 * with its class, followed by the qs record; otherwise the whole token,
 * apostrophe restored, is printed as POS.
 */
pos(flg){
	int ii,j;
	if(flg == 1)yytext[0] = tolower(yytext[0]);
	/* the calling patterns guarantee an apostrophe in yytext */
	for(ii=yyleng-1;yytext[ii] != '\''; ii--);
	yytext[ii] = '\0';
	if((j=lookup(yytext,1,0)) != 0){
		yyleng = ii;
		OUT1(j);
		OUTN(qs);
	}
	else{
		if(flg==1 && !NOCAPS)yytext[0] = toupper(yytext[0]);
		yytext[ii] = '\'';
		OUT1(POS);
	}
}
char *filename="-";

/*
 * main - print the leading ":" record, load the tables (getd, getab,
 * ygetd), then scan stdin or each named file in turn.  Every input is
 * followed by a fin record.  Returns the number of files that could not
 * be opened.
 */
main(argc,argv)
int argc;
char *argv[];
{
	register int rc=0;
	putchar(':'); putchar('\n');
	getd();
	getab();
	ygetd();
	if(argc<=1) {
		yylex();
		OUTN(fin);
	}else{
		while(argc>1) {
			/* each file is scanned by re-pointing stdin at it */
			if(freopen(argv[1],"r",stdin)==NULL) {
				fprintf(stderr,"%s: cannot open\n", argv[1]);
				rc++;
			}else{
				filename=argv[1];
				yylex();
				OUTN(fin);
			}
			argc--; argv++;
		}
	}
	return(rc);
}
/*
 * ahead - decide what a '.' after the current token means.
 *
 * Reads the next input character.  If it is alphanumeric, the '.' is
 * taken to be inside a token: '.', that character and everything up to
 * the next white space (or end of input) are appended at
 * yytext[yyleng], and 1 is returned.  Otherwise the character and a '.'
 * are pushed back onto the input and 0 is returned.
 *
 * NOTE(review): the '.' is appended unconditionally, even for callers
 * whose yytext already ends in '.'; that behaviour is kept as found.
 */
ahead(){
	register int c;
	c = input();
	if(c > 0 && isalnum(c)){
		yytext[yyleng++] = '.';
		/* keep the character that was just examined */
		yytext[yyleng++] = c;
		/* c <= 0 is end of input: stop instead of looping forever */
		while((c = input()) > 0 && !isspace(c)){
#ifdef YYLMAX
			/* leave room for the terminating NUL */
			if(yyleng >= YYLMAX-1)
				break;
#endif
			yytext[yyleng++] = c;
		}
		yytext[yyleng] = '\0';
		if(c > 0)
			unput(c);
		return(1);
	}
	if(c > 0)
		unput(c);
	unput('.');
	return(0);
}
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.