|
|
%{
/* break out words, output cap + word(inverted) */
/*
 * Output conventions (see the macros below):
 *	OUT1(c)	 one line "<c>:<yytext>" -- a class character, then the token
 *	OUT()	 yytext written backwards followed by a newline, no class prefix
 *	OUTN(s)	 the string s on a line of its own
 *
 * Scanner state shared by the rule actions:
 *	first	set to 1 after sentence/clause punctuation, cleared by the
 *		next word; a leading capital on such a word is not "real"
 *	cap	set by rules that folded a leading capital to lower case
 *		before jumping to the common word action (label wd), which
 *		restores the capital (unless NOCAPS) and clears the flag
 *	dot	set when a token ended in '.'; a later action then emits
 *		the sentence terminator `fin' before its own output
 *	nflg	set when "n't" was stripped from the current word
 *	qflg	toggled by each run of quote characters
 *
 * The class codes (NOUN, POS, NOUN_ADJ, ...) are not defined here;
 * presumably they come from names.h -- confirm there.
 */

#ifndef lint
static char sccsid[] = "@(#)nwords.l 4.2 (Berkeley) 82/11/06";
#endif /* not lint */

#include <stdio.h>
#include <ctype.h>
/*
 * OUT() uses the file-scope scratch index `i'.  It is wrapped in
 * do/while(0) so that it expands to a single statement and stays safe
 * under an unbraced if/else.
 */
#define OUT()	do { for(i=yyleng-1;i>=0; i--)putchar(yytext[i]); putchar('\n'); } while(0)
#define OUT1(nam)	printf("%c:%s\n",nam,yytext)
#define OUTN(string)	printf("%s\n",string)
#include "names.h"
#include "nhash.c"
#include "dict.c"
#include "ydict.c"
#include "abbrev.c"
char nt[] = "D:n't";	/* emitted after a word that carried "n't" */
char qs[] = "c:'s";	/* emitted after a known word that carried "'s" */
char fin[] = "E:.";	/* sentence terminator line */
int NOCAPS = 0;		/* if set all caps are turned to lower case */
int i,j;		/* scratch: i is a loop index, j a lookup result */
int dot = 0;
int first = 1;
int qflg,nflg;
int cap = 0;
%}
%p 3000
%a 3300
%o 4500

L	[a-z]
N	[0-9]
C	[A-Z]
A	[a-zA-Z]
P	[a-zA-Z0-9]

%%
^[.!].+[\n]	{
	/* a whole line starting with '.' or '!' is passed through
	 * behind a ':' prefix, closing any pending sentence first */
	if(dot){
		OUTN(fin);
		dot = 0;
		first = 1;
	}
	printf(":%s",yytext);
}
May	{
	/* mid-sentence "May" is taken as a noun; when `first' is set
	 * it is folded to lower case and handled as an ordinary word */
	if(first == 0){
		OUT1(NOUN);
	}
	else {
		first = 0;
		yytext[0] = tolower(yytext[0]);
		cap = 1;
		goto wd;
	}
}
"U.S."	{
	OUT1(NOUN);
}
{C}{L}*'[s]	{
	/* capitalised word + 's : pos(1) folds the capital for lookup */
	pos(1);
	if(first==1)first=0;
}
{C}+['][s]	{
	if(NOCAPS)
		for(i=0;i<yyleng;i++)
			if(isupper(yytext[i]))yytext[i] = tolower(yytext[i]);
	OUT1(POS);
}
{P}+([-]{P}+)+	{
	/* hyphenated compound */
	if(NOCAPS)
		for(i=0;i<yyleng;i++)
			if(isupper(yytext[i]))yytext[i] = tolower(yytext[i]);
	OUT1(NOUN_ADJ);
}
{C}{C}+	{
	/* all-capitals word: one look-ahead character decides between
	 * "XYZs" (kept whole, PNOUN) and a plain word sent to wd */
	if(NOCAPS)
		for(i=0;i<yyleng;i++)
			yytext[i] = tolower(yytext[i]);
	if((i=input()) == 's'){
		yytext[yyleng++] = 's';
		yytext[yyleng] = '\0';
		OUT1(PNOUN);
	}
	else {
		unput(i);
		if(!NOCAPS)
			for(i=0;i<yyleng;i++)yytext[i] = tolower(yytext[i]);
		goto wd;
	}
}
[LD][']{C}{L}*	{
	/* L'Xxxx / D'Xxxx : both capitals are folded under NOCAPS */
	if(NOCAPS){
		yytext[0] = tolower(yytext[0]);
		yytext[2] = tolower(yytext[2]);
	}
	OUT1(NOUN_ADJ);
}
{C}{L}*	{
	/* capitalised word: only a capital that is not explained by
	 * sentence position (first == 0) sets `cap'; "I" never does */
	if(first==1)
		first=0;
	else cap = 1;
	if(yyleng==1 && yytext[0] == 'I'){
		cap = 0;
		goto wd;
	}
	yytext[0] = tolower(yytext[0]);
	goto wd;
}
{N}":"{N}{N}	{
	OUT1(NOUN_ADJ);
}
({N}*[,])*({N}+".")+[ \t\n]+{C}	{
	/* number ending a sentence: cut yytext at its last '.', push
	 * the following capital back and emit the terminator */
	for(i=yyleng-1;i>0;i--)
		if(yytext[i] == '.')break;
	unput(yytext[yyleng-1]);
	yytext[i] = '\0';
	OUT1(NOUN_ADJ);
	OUTN(fin);
	first = 1;
}
([hH]e"/"[sS]he)|([sS]he"/"[hH]e)	{
	if(NOCAPS)
		if(isupper(yytext[0]))yytext[0] = tolower(yytext[0]);
	OUT1(PRONS);
}
([hH]is"/"[hH]er)|([hH]er"/"[hH]is)	{
	if(NOCAPS)
		if(isupper(yytext[0]))yytext[0] = tolower(yytext[0]);
	OUT1(POS);
}
[ \t`]*[a-zA-Z0-9.]*("\/"[a-zA-Z0-9.]+)+[']*	{
	/* slash-separated token; a trailing '.' is checked with ahead()
	 * and marks a pending sentence end when nothing is glued on */
	if(yytext[yyleng-1] == '.'){
		if(ahead() == 0)dot=1;
	}
	if(NOCAPS)
		for(i=0;i<yyleng;i++)
			if(isupper(yytext[i]))yytext[i] = tolower(yytext[i]);
	OUT1(NOUN_ADJ);
}
{N}+([,]{N}+)*("."{N}+)*[']*[s]*	{
	OUT1(NOUN_ADJ);
}
{N}*([,]{N}+)*("."{N}+)+[']*[s]*	{
	OUT1(NOUN_ADJ);
}
{N}+([,]{N}+)*("."{N}*)*[']*[s]*	{
	if(yytext[yyleng-1] == '.')dot=1;
	OUT1(NOUN_ADJ);
}
({A}*{N}+{A}*)+	{
	/* Peek at one character.  A '.' is handed to ahead(); anything
	 * else is pushed back so that the character following the token
	 * (a comma, a bracket, ...) is not silently lost.  Nothing is
	 * pushed back at end of input (input() result <= 0). */
	if((i=input()) == '.')
		ahead();
	else if(i > 0)
		unput(i);
	if(NOCAPS)
		for(i=0;i<yyleng;i++)
			if(isupper(yytext[i]))yytext[i]=tolower(yytext[i]);
	OUT1(NOUN_ADJ);
}
{N}+[%]	{
	OUT1(NOUN_ADJ);
}
"$"{N}+([,]{N}+)*("."{N}*)*	{
	if(yytext[yyleng-1] == '.')dot=1;
	OUT1(NOUN);
}
[Aa]"."[ ]*[Mm]"."	{
	OUT1(ADJ_ADV);
}
[Pp]"."[ ]*[Mm]"."	{
	OUT1(ADJ_ADV);
}
"a."[ ]*"d."	{
	OUT1(ADJ_ADV);
}
"b."[ ]*"c."	{
	OUT1(ADJ_ADV);
}
"i."[ ]*"e."	{
	OUT1(PREP);
}
"e."[ ]*"g."	{
	OUT1(PREP);
}
"etc."[ \n]*[,)]*	{
	/* print just "etc." as a noun, then classify by the last
	 * matched character: ',' or ')' continues the sentence,
	 * anything else ends it */
	i = yytext[4];
	yytext[4] = '\0';
	OUT1(NOUN);
	yytext[4] = i;
	yytext[0] = yytext[yyleng-1];
	yytext[1] = '\0';
	if(yytext[0] == ',' || yytext[0] == ')')
		OUT1(',');
	else {
		OUTN(fin);
		first = 1;
	}
}
"et al."	{
	OUT1(NOUN);
}
in"."[ \n]*{C}	{
	/* "in." before a capital: "in" ends the sentence; the capital
	 * is pushed back to start the next one */
	unput(yytext[yyleng-1]);
	yytext[2] = '\0';
	OUT1(PREP);
	OUTN(fin);
	first = 1;
}
Ph"."[ ]*[Dd]"."	{
	OUT1(ADJ);
}
[A-Z]"."	{
	/* single initial */
	dot=1;
	OUT1(NOUN);
}
can't	{
	/* "can't" -> "can" plus the n't marker */
	yytext[3]='\0';
	yyleng -= 2;
	nflg=1;
	goto wd;
}
won't	{
	OUT1('X');
}
ain't	{
	OUT1('g');
}
{L}+n't	{
	/* strip "n't"; the wd action emits `nt' after a known word */
	nflg=1;
	yytext[yyleng-3]='\0';
	yyleng -= 3;
	goto wd;
}
[A-Z]{L}+n't	{
	yytext[0] = tolower(yytext[0]);
	nflg=1;
	yytext[yyleng-3]='\0';
	yyleng -= 3;
	goto wd;
}
o'clock	{
	OUT1(ADV);
}
{L}+'[s]	{
	pos(0);
}
'll	{
	/* contractions are reported with the class of the full word */
	OUT1(lookup("will",1,0));
}
've	{
	OUT1(lookup("have",1,0));
}
're	{
	OUT1(lookup("are",1,0));
}
'd	{
	OUT1(lookup("had",1,0));
}
'm	{
	OUT1(lookup("am",1,0));
}
'ld	{
	OUT1(lookup("would",1,0));
}
{L}+	{
	/*
	 * Common word action; other rules jump here with a lower-cased
	 * word in yytext/yyleng.  A word known to lookup() is printed
	 * with the class lookup() returned.  An unknown word ending in
	 * 'y' is classified by suffix through look(); any other unknown
	 * word is printed reversed by OUT(), or as NOUN_ADJ if it was
	 * capitalised.
	 */
	wd:
	if((j = lookup(yytext,1,0)) != 0){
		first=0;
		if(cap){
			if(!NOCAPS)
				yytext[0] = toupper(yytext[0]);
			cap = 0;
			if(dot)OUTN(fin);
		}
		dot=0;
		OUT1(j);
		if(nflg==1){
			nflg=0;
			OUTN(nt);
		}
	}
	else{
		first = dot=0;
		/* The suffix tests look two and three characters back, so
		 * they are only attempted when the word is long enough;
		 * indexing before yytext[0] is undefined behaviour. */
		if(yyleng >= 2 && yytext[yyleng-1] == 'y' && cap == 0){
			/* third character from the end, '\0' if there is none */
			int before = (yyleng >= 3) ? yytext[yyleng-3] : '\0';

			switch(yytext[yyleng-2]){
			case 'c':
				look(cy,yyleng-2,NOUN);
				break;
			case 'f':
				look(fy,yyleng-2,VERB);
				break;
			case 'l':
				look(ly,yyleng-2,ADV);
				break;
			case 'g':
				if(before == 'o'){
					OUT1(NOUN);
					break;
				}
				look(gy,yyleng-2,ADJ);
				break;
			case 'r':
				switch(before){
				case 'a':
					look(ary,yyleng-3,ADJ);
					break;
				case 'o':
					look(ory,yyleng-3,ADJ);
					break;
				case 'e':
					look(ery,yyleng-3,NOUN);
					break;
				default:
					look(ry,yyleng-2,NOUN);
				}
				break;
			case 't':
				if(before == 'i')
					look(ity,yyleng-3,NOUN);
				else
					look(ty,yyleng-2,ADJ);
				break;
			default:
				OUT();
			}
		}
		else {
			if(cap){
				if(!NOCAPS)yytext[0] = toupper(yytext[0]);
				cap = 0;
				OUT1(NOUN_ADJ);
			}
			else {
				OUT();
			}
		}
	}
}
[\n]	;
[ ]+	;
[\t]+	;
";"	{
	OUT1(';');
	first=1;
}
(\"|`|')+	{
	/* any run of quote characters is reported as '"'; qflg tracks
	 * whether it opens (next word may be capitalised) or closes */
	if(dot){
		OUTN(fin);
		dot=0;
	}
	if(qflg==1){
		qflg=0;
		OUT1('"');
	}
	else {
		qflg=1;
		first=1;
		OUT1('"');
	}
}
".\""	{
	qflg=0;
	first=1;
	OUT1(END);
}
"..."	{
	OUT1(',');
}
"/."	{
	first = 1;
	OUT1(END);
}
{A}{A}+"."	{
	/*
	 * Word followed by '.': hide the dot and ask abbrev().  A known
	 * abbreviation is printed with its dot.  Otherwise ahead()
	 * decides: 0 means the dot was pushed back and the bare word
	 * goes to wd; nonzero means more text was glued on and the
	 * token is reported as NOUN_ADJ.
	 */
	yytext[yyleng-1] = '\0';
	if((j=abbrev(yytext,1,0)) != 0){
		if(isupper(yytext[0])){
			if(NOCAPS)yytext[0] = tolower(yytext[0]);
			if(first == 1)first=0;
		}
		yytext[yyleng-1] = '.';
		OUT1(j);
	}
	else {
		j = ahead();
		if(j == 0)
			yyleng--;
		for(i=0;i<yyleng;i++)
			if(isupper(yytext[i])){
				yytext[i] = tolower(yytext[i]);
				if(i == 0)cap = 1;
				else cap = 0;
			}
		if(j == 0)goto wd;
		OUT1(NOUN_ADJ);
	}
}
"."	{
	first=1;
	OUT1(END);
}
"!\""	{
	qflg=0;
	first=1;
	OUT1(END);
}
"!"	{
	first=1;
	OUT1(END);
}
"?\""	{
	qflg=0;
	first=1;
	OUT1(END);
}
"?"	{
	first=1;
	OUT1(END);
}
":"	{
	OUT1(',');
	first=1;
}
[-]+	{
	OUT1(',');
	first=1;
}
","	{
	OUT1(',');
}
(\[|\(|\{|\]|\)|\})	{
	OUT1(',');
}
.	{
	/* any other character is dropped without a diagnostic */
	;
}
%%
/*
 * look - classify the current token by a suffix table.
 *
 *	f	lookup routine, called as (*f)(yytext,1,0) on the stem
 *	n	length of the stem: yytext is cut at index n for the call
 *		and restored afterwards
 *	cc	class to print when the routine returns 0
 *
 * Prints the complete token with OUT1 under the class returned by f,
 * or under cc if f returns 0.  No meaningful value is returned.
 */
int
look(f,n,cc)
char (*f)();
int n;
char cc;
{
	int nn;
	char save;

	save=yytext[n];
	yytext[n] = '\0';
	nn=(*f)(yytext,1,0);
	yytext[n] = save;
	if(nn != 0){
		OUT1(nn);
	}
	else {
		OUT1(cc);
	}
}

/*
 * pos - handle a token of the form word's.
 *
 *	flg	1 when the word began with a capital; it is folded to
 *		lower case for the lookup
 *
 * The text before the apostrophe is looked up.  If it is known, the
 * word is printed with its class followed by the `qs' line.  If not,
 * the apostrophe (and, unless NOCAPS, the capital) is put back and the
 * whole token is printed as POS.  Callers guarantee that yytext
 * contains an apostrophe, so the backward scan terminates.
 */
int
pos(flg)
int flg;
{
	int ii,cls;

	if(flg == 1)yytext[0] = tolower(yytext[0]);
	for(ii=yyleng-1;yytext[ii] != '\''; ii--);
	yytext[ii] = '\0';
	if((cls=lookup(yytext,1,0)) != 0){
		yyleng = ii;
		OUT1(cls);
		OUTN(qs);
	}
	else{
		if(flg==1 && !NOCAPS)yytext[0] = toupper(yytext[0]);
		yytext[ii] = '\'';
		OUT1(POS);
	}
}

/* name of the file being scanned; set by main, "-" for standard input */
char *filename="-";

/*
 * main - print the leading ":" line, call getd(), getab() and ygetd()
 * (presumably the table loaders from the included dictionary files --
 * confirm there), then scan standard input or each named file in turn,
 * printing the sentence terminator after each.  The exit status is the
 * number of files that could not be opened.
 */
int
main(argc,argv)
int argc;
char *argv[];
{
	register int rc=0;

	putchar(':'); putchar('\n');
	getd();
	getab();
	ygetd();
	if(argc<=1) {
		yylex();
		OUTN(fin);
	}else{
		while(argc>1) {
			if(freopen(argv[1],"r",stdin)==NULL) {
				fprintf(stderr,"%s: cannot open\n", argv[1]);
				rc++;
			}else{
				filename=argv[1];
				yylex();
				OUTN(fin);
			}
			argc--; argv++;
		}
	}
	return(rc);
}

/*
 * ahead - decide what a '.' after the current token means.
 *
 * Called when a '.' has just been seen.  If the next character is a
 * letter or digit, the dot is taken to be internal: ".", that character
 * and everything up to the next white space are appended at
 * yytext[yyleng], and 1 is returned.  Otherwise the character and a '.'
 * are pushed back for the scanner and 0 is returned.
 *
 * The character that was tested is itself stored, so the text after
 * the dot is kept complete.  Reading stops at end of input (input()
 * result <= 0) as well as at white space, and nothing is pushed back
 * in that case.
 *
 * NOTE(review): the append is not bounded by the size of yytext; a
 * very long run of non-blank text can still overflow it.
 */
int
ahead()
{
	register int c;

	if(isalnum((c=input()))){
		yytext[yyleng++] = '.';
		yytext[yyleng++] = c;
		while((c=input()) > 0 && !isspace(c))
			yytext[yyleng++] = c;
		yytext[yyleng] = '\0';
		if(c > 0)
			unput(c);
		return(1);
	}
	if(c > 0)
		unput(c);
	unput('.');
	return(0);
}
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.