|
|
1.1 root 1: %{
2: /* break out words, output cap + word(inverted) */
3: #include <stdio.h>
4: #include <ctype.h>
5: #define OUT() for(i=yyleng-1;i>=0; i--)putchar(yytext[i]); putchar('\n') /* print yytext reversed, then newline; uses global i; expands to TWO statements, so use only inside braces */
6: #define OUT1(nam) printf("%c:%s\n",nam,yytext) /* print one output line: tag character nam, ':', then yytext */
7: #define OUTN(string) printf("%s\n",string) /* print a fixed string followed by a newline */
8: #include "names.h"
9: #include "nhash.c"
10: #include "dict.c"
11: #include "ydict.c"
12: #include "abbrev.c"
13: char nt[] = "D:n't"; /* line printed after a word when nflg is set (stripped n't) */
14: char qs[] = "c:'s"; /* line printed by pos() after the base word of a possessive */
15: char fin[] = "E:."; /* line printed via OUTN(fin) to mark the end of a sentence */
16: int NOCAPS = 0; /* if set all caps are turned to lower case */
17: int i,j; /* scratch variables shared by the macros and the rule actions */
18: int dot = 0; /* set by rules whose token ended in a '.'; a following capitalized known word or quote prints fin */
19: int first = 1; /* 1 while the next word is the first of a sentence; set by the sentence-ending rules */
20: int qflg,nflg; /* qflg: toggled by the quote rule; nflg: set by the n't rules, consumed at label wd */
21: int cap = 0; /* set when a capitalized word was lowercased before lookup at label wd */
22: %}
23: %p 3000
24: %a 3300
25: %o 4500
26:
27: L [a-z]
28: N [0-9]
29: C [A-Z]
30: A [a-zA-Z]
31: P [a-zA-Z0-9]
32:
33: %%
34: ^[.!].+[\n] { /* whole line starting with '.' or '!': echoed after a ':' with its newline */
35: if(dot){ /* a pending '.' is first turned into a sentence end */
36: OUTN(fin);
37: dot = 0;
38: first = 1;
39: }
40: printf(":%s",yytext);
41: }
42: May { /* "May": NOUN when not first in a sentence, otherwise lowercased and looked up at wd */
43: if(first == 0){
44: OUT1(NOUN);
45: }
46: else {
47: first = 0;
48: yytext[0] = tolower(yytext[0]);
49: cap = 1;
50: goto wd;
51: }
52: }
53: "U.S." { /* fixed token, always NOUN */
54: OUT1(NOUN);
55: }
56: {C}{L}*'[s] { /* capitalized word followed by 's: handled by pos(1) */
57: pos(1);
58: if(first==1)first=0;
59: }
60: {C}+['][s] { /* run of capitals followed by 's: printed whole with tag POS */
61: if(NOCAPS)
62: for(i=0;i<yyleng;i++)
63: if(isupper(yytext[i]))yytext[i] = tolower(yytext[i]);
64: OUT1(POS);
65: }
66: {P}+([-]{P}+)+ { /* hyphen-joined alphanumerics: NOUN_ADJ */
67: if(NOCAPS)
68: for(i=0;i<yyleng;i++)
69: if(isupper(yytext[i]))yytext[i] = tolower(yytext[i]);
70: OUT1(NOUN_ADJ);
71: }
72: {C}{C}+ { /* two or more capitals; the action peeks at the next input character */
73: if(NOCAPS)
74: for(i=0;i<yyleng;i++)
75: yytext[i] = tolower(yytext[i]);
76: if((i=input()) == 's'){ /* a following 's' is appended to the token: PNOUN */
77: yytext[yyleng++] = 's';
78: yytext[yyleng] = '\0';
79: OUT1(PNOUN);
80: }
81: else if(i == '&'){ /* '&' and any capitals right after it are appended: NOUN */
82: yytext[yyleng++] = i;
83: if(isupper(i=input())){
84: yytext[yyleng++] = i;
85: while(isupper(i=input()))
86: yytext[yyleng++] = i;
87: }
88: yytext[yyleng] = '\0';
89: unput(i); /* first character that is not a capital goes back to the input */
90: OUT1(NOUN);
91: }
92: else { /* otherwise: push the character back, lowercase everything, treat as a word */
93: unput(i);
94: if(!NOCAPS)
95: for(i=0;i<yyleng;i++)yytext[i] = tolower(yytext[i]);
96: goto wd;
97: }
98: }
99: [LD][']{C}{L}* { /* L'Xxx or D'Xxx form: NOUN_ADJ */
100: if(NOCAPS){
101: yytext[0] = tolower(yytext[0]);
102: yytext[2] = tolower(yytext[2]);
103: }
104: OUT1(NOUN_ADJ);
105: }
106: {C}{L}* { /* capitalized word: lowercased and looked up at wd */
107: if(first==1)
108: first=0;
109: else cap = 1; /* capital inside a sentence is remembered in cap */
110: if(yyleng==1 && yytext[0] == 'I'){ /* lone "I" is looked up as is, never flagged as capitalized */
111: cap = 0;
112: goto wd;
113: }
114: yytext[0] = tolower(yytext[0]);
115: goto wd;
116: }
117: {N}":"{N}{N} { /* digit ':' digit digit: NOUN_ADJ */
118: OUT1(NOUN_ADJ);
119: }
120: ({N}*[,])*({N}+".")+[ \t\n]+{C} { /* number ending in '.', then whitespace and a capital: number followed by sentence end */
121: for(i=yyleng-1;i>0;i--) /* find the last '.' in the match */
122: if(yytext[i] == '.')break;
123: unput(yytext[yyleng-1]); /* only the trailing capital is returned to the input */
124: yytext[i] = '\0';
125: OUT1(NOUN_ADJ);
126: OUTN(fin);
127: first = 1;
128: }
129: ([hH]e"/"[sS]he)|([sS]he"/"[hH]e) { /* he/she or she/he: PRONS */
130: if(NOCAPS)
131: if(isupper(yytext[0]))yytext[0] = tolower(yytext[0]);
132: OUT1(PRONS);
133: }
134: ([hH]is"/"[hH]er)|([hH]er"/"[hH]is) { /* his/her or her/his: POS */
135: if(NOCAPS)
136: if(isupper(yytext[0]))yytext[0] = tolower(yytext[0]);
137: OUT1(POS);
138: }
139: [ \t`]*[a-zA-Z0-9.]*("\/"[a-zA-Z0-9.]+)+[']* { /* slash-joined token, with optional leading blanks/backquotes and trailing quotes: NOUN_ADJ */
140: if(yytext[yyleng-1] == '.'){
141: if(ahead() == 0)dot=1; /* ahead() returned 0: it also pushed a '.' back onto the input */
142: }
143: if(NOCAPS)
144: for(i=0;i<yyleng;i++)
145: if(isupper(yytext[i]))yytext[i] = tolower(yytext[i]);
146: OUT1(NOUN_ADJ);
147: }
148: {N}+([,]{N}+)*("."{N}+)*[']*[s]* { /* number with optional comma groups, decimal parts and trailing 's: NOUN_ADJ */
149: OUT1(NOUN_ADJ);
150: }
151: {N}*([,]{N}+)*("."{N}+)+[']*[s]* { /* number that may start with the decimal point: NOUN_ADJ */
152: OUT1(NOUN_ADJ);
153: }
154: {N}+([,]{N}+)*("."{N}*)*[']*[s]* { /* number that may end in a bare '.': NOUN_ADJ */
155: if(yytext[yyleng-1] == '.')dot=1; /* remember the trailing '.' */
156: OUT1(NOUN_ADJ);
157: }
({A}*{N}+{A}*)+ {
	/*
	 * Token mixing letters and digits: printed with tag NOUN_ADJ.
	 * One character is peeked.  A '.' is handed to ahead(), which either
	 * extends yytext or pushes the '.' back.  Any other character is
	 * returned to the input; the original dropped it, which lost e.g. a
	 * comma directly following the token.  Nothing is pushed back at end
	 * of input (input() result <= 0).
	 */
	i = input();
	if(i == '.')
		ahead();
	else if(i > 0)
		unput(i);
	if(NOCAPS)
		for(i=0;i<yyleng;i++)
			if(isupper(yytext[i]))yytext[i]=tolower(yytext[i]);
	OUT1(NOUN_ADJ);
}
166: {N}+[%] { /* digits followed by '%': NOUN_ADJ */
167: OUT1(NOUN_ADJ);
168: }
169: "$"{N}+([,]{N}+)*("."{N}*)* { /* '$' followed by a number: NOUN */
170: if(yytext[yyleng-1] == '.')dot=1; /* remember a trailing '.' */
171: OUT1(NOUN);
172: }
173: [Aa]"."[ ]*[Mm]"." { /* a.m. in either case, optional blanks inside: ADJ_ADV */
174: OUT1(ADJ_ADV);
175: }
176: [Pp]"."[ ]*[Mm]"." { /* p.m. in either case, optional blanks inside: ADJ_ADV */
177: OUT1(ADJ_ADV);
178: }
179: "a."[ ]*"d." { /* a.d.: ADJ_ADV */
180: OUT1(ADJ_ADV);
181: }
182: "b."[ ]*"c." { /* b.c.: ADJ_ADV */
183: OUT1(ADJ_ADV);
184: }
185: "i."[ ]*"e." { /* i.e.: PREP */
186: OUT1(PREP);
187: }
188: "e."[ ]*"g." { /* e.g.: PREP */
189: OUT1(PREP);
190: }
191: "etc."[ \n]*[,)]* { /* "etc." plus optional blanks/newlines and ',' or ')' characters */
192: i = yytext[4]; /* cut the match after "etc." just for printing */
193: yytext[4] = '\0';
194: OUT1(NOUN);
195: yytext[4] = i;
196: yytext[0] = yytext[yyleng-1]; /* reduce yytext to the last matched character */
197: yytext[1] = '\0';
198: if(yytext[0] == ',' || yytext[0] == ')')
199: OUT1(','); /* trailing ',' or ')' is printed with tag ',' */
200: else { /* nothing of that kind followed: the '.' ends the sentence */
201: OUTN(fin);
202: first = 1;
203: }
204: }
205: "et al." { /* fixed token: NOUN */
206: OUT1(NOUN);
207: }
208: in"."[ \n]*{C} { /* "in." followed by a capital: preposition, then sentence end */
209: unput(yytext[yyleng-1]); /* the capital is returned to the input */
210: yytext[2] = '\0';
211: OUT1(PREP);
212: OUTN(fin);
213: first = 1;
214: }
215: Ph"."[ ]*[Dd]"." { /* Ph.D. / Ph.d.: ADJ */
216: OUT1(ADJ);
217: }
218: [A-Z]"." { /* single capital followed by '.': NOUN, with the '.' remembered in dot */
219: dot=1;
220: OUT1(NOUN);
221: }
222: can't { /* reduced to "can", nflg set, looked up at wd */
223: yytext[3]='\0';
224: yyleng -= 2;
225: nflg=1;
226: goto wd;
227: }
228: won't { /* printed whole with the literal tag 'X' */
229: OUT1('X');
230: }
231: ain't { /* printed whole with the literal tag 'g' */
232: OUT1('g');
233: }
234: [A-Z]*{L}+n't { /* any other word ending in n't: suffix stripped, nflg set, looked up at wd */
235: if(isupper(yytext[0]))
236: yytext[0] = tolower(yytext[0]);
237: nflg=1;
238: yytext[yyleng-3]='\0';
239: yyleng -= 3;
240: goto wd;
241: }
242: o'clock { /* fixed token: ADV */
243: OUT1(ADV);
244: }
245: {L}+'[s] { /* lowercase word followed by 's: handled by pos(0) */
246: pos(0);
247: }
248: 'll { /* contraction suffixes are printed with the tag lookup() returns for the full word */
249: OUT1(lookup("will",1,0));
250: }
251: 've {
252: OUT1(lookup("have",1,0));
253: }
254: 're {
255: OUT1(lookup("are",1,0));
256: }
257: 'd { /* always tagged as "had" */
258: OUT1(lookup("had",1,0));
259: }
260: 'm {
261: OUT1(lookup("am",1,0));
262: }
263: 'ld { /* tagged as "would" */
264: OUT1(lookup("would",1,0));
265: }
{L}+ {
	/*
	 * Plain lowercase word.  Several other rules jump to the label wd
	 * below after preparing yytext/yyleng and the cap and nflg flags.
	 *
	 * A word known to lookup() is printed with the tag it returns.
	 * An unknown, non-capitalized word ending in 'y' is classified by
	 * its suffix through look().  Any other unknown word is printed
	 * reversed by OUT(), or with tag NOUN_ADJ when cap is set.
	 *
	 * Changes from the original action:
	 *  - the suffix tests no longer read yytext[yyleng-2] or
	 *    yytext[yyleng-3] (or pass -1 to look()) for words shorter
	 *    than 2 or 3 characters;
	 *  - a pending n't (nflg) is printed after unknown words as well;
	 *    the original left nflg set, so the n't line appeared after a
	 *    later, unrelated word.
	 */
wd:
	if((j = lookup(yytext,1,0)) != 0){
		first=0;
		if(cap){
			if(!NOCAPS)
				yytext[0] = toupper(yytext[0]);
			cap = 0;
			/* capitalized word right after a token ending in '.': close the sentence first */
			if(dot)OUTN(fin);
		}
		dot=0;
		OUT1(j);
	}
	else{
		first = dot = 0;
		if(yyleng >= 2 && yytext[yyleng-1] == 'y' && cap == 0){
			/* i = letter two places before the final 'y', or 0 when the word has none */
			i = (yyleng >= 3) ? yytext[yyleng-3] : 0;
			switch(yytext[yyleng-2]){
			case 'c':
				look(cy,yyleng-2,NOUN);
				break;
			case 'f':
				look(fy,yyleng-2,VERB);
				break;
			case 'l':
				look(ly,yyleng-2,ADV);
				break;
			case 'g':
				if(i == 'o'){
					OUT1(NOUN);
					break;
				}
				look(gy,yyleng-2,ADJ);
				break;
			case 'r':
				switch(i){
				case 'a':
					look(ary,yyleng-3,ADJ);
					break;
				case 'o':
					look(ory,yyleng-3,ADJ);
					break;
				case 'e':
					look(ery,yyleng-3,NOUN);
					break;
				default:
					look(ry,yyleng-2,NOUN);
				}
				break;
			case 't':
				if(i == 'i')
					look(ity,yyleng-3,NOUN);
				else
					look(ty,yyleng-2,ADJ);
				break;
			default:
				OUT();
			}
		}
		else {
			if(cap){
				if(!NOCAPS)yytext[0] = toupper(yytext[0]);
				cap = 0;
				OUT1(NOUN_ADJ);
			}
			else {
				OUT();
			}
		}
	}
	/* n't stripped by an earlier rule is reported right after its own word */
	if(nflg==1){
		nflg=0;
		OUTN(nt);
	}
}
327: [\n] ; /* newline: ignored */
328: [ ]+ ; /* blanks: ignored */
329: [\t]+ ; /* tabs: ignored */
330: ";" { /* semicolon: printed with tag ';', next word counts as first */
331: OUT1(';');
332: first=1;
333: }
334: (\"|`|')+ { /* run of quote characters: always printed with tag '"' */
335: if(dot){ /* a pending '.' becomes a sentence end */
336: OUTN(fin);
337: dot=0;
338: }
339: if(qflg==1){ /* qflg was set: clear it */
340: qflg=0;
341: OUT1('"');
342: }
343: else { /* qflg was clear: set it; the next word counts as first */
344: qflg=1;
345: first=1;
346: OUT1('"');
347: }
348: }
349: ".\"" { /* period followed by a double quote: END, qflg cleared */
350: qflg=0;
351: first=1;
352: OUT1(END);
353: }
354: "..." { /* three periods: printed with tag ',' */
355: OUT1(',');
356: }
357: "~." { /* tilde followed by a period: END */
358: first = 1;
359: OUT1(END);
360: }
361: {A}{A}+"." { /* two or more letters followed by '.': abbreviation or end of sentence */
362: yytext[yyleng-1] = '\0'; /* hide the '.' for the abbrev() lookup */
363: if((j=abbrev(yytext,1,0)) != 0){ /* abbrev() returned a tag: print the token with its '.' restored */
364: if(isupper(yytext[0])){
365: if(NOCAPS)yytext[0] = tolower(yytext[0]);
366: if(first == 1)first=0;
367: }
368: yytext[yyleng-1] = '.';
369: OUT1(j);
370: }
371: else {
372: j = ahead(); /* 0: next character is not alphanumeric and ahead() pushed a '.' back */
373: if(j == 0)
374: yyleng--; /* drop the '.' from the token length */
375: for(i=0;i<yyleng;i++) /* lowercase; cap ends up 1 only if the sole capital is the first letter */
376: if(isupper(yytext[i])){
377: yytext[i] = tolower(yytext[i]);
378: if(i == 0)cap = 1;
379: else cap = 0;
380: }
381: if(j == 0)goto wd; /* ordinary word; the pushed-back '.' is scanned next */
382: if(cap)
383: if(!NOCAPS)yytext[0] = toupper(yytext[0]);
384: OUT1(NOUN_ADJ); /* NOTE(review): yytext still has the '\0' stored above, so text appended by ahead() is not printed - confirm intent */
385: }
386: }
387: "." { /* period: END, next word counts as first */
388: first=1;
389: OUT1(END);
390: }
391: "!\"" { /* '!' followed by a double quote: END, qflg cleared */
392: qflg=0;
393: first=1;
394: OUT1(END);
395: }
396: "!" { /* exclamation mark: END */
397: first=1;
398: OUT1(END);
399: }
400: "?\"" { /* '?' followed by a double quote: END, qflg cleared */
401: qflg=0;
402: first=1;
403: OUT1(END);
404: }
405: "?" { /* question mark: END */
406: first=1;
407: OUT1(END);
408: }
409: ":" { /* colon: printed with tag ',', next word counts as first */
410: OUT1(',');
411: first=1;
412: }
413: [-]+ { /* run of hyphens: printed with tag ',', next word counts as first */
414: OUT1(',');
415: first=1;
416: }
417: "," { /* comma: printed with tag ',' */
418: OUT1(',');
419: }
420: (\[|\(|\{|\]|\)|\}) { /* any bracket, brace or parenthesis: printed with tag ',' */
421: OUT1(',');
422: }
423: . { /* any other single character is dropped silently */
424: /* fprintf(stderr,"nwords funny char: %c\n",yytext[0])*/ ;
425: }
426: %%
427: look(f,n,cc)
428: char (*f)();
429: int n;
430: char cc;
431: {
432: int nn;
433: char save;
434: save=yytext[n];
435: yytext[n] = '\0';
436: nn=(*f)(yytext,1,0);
437: yytext[n] = save;
438: if(nn != 0){
439: OUT1(nn);
440: }
441: else {
442: OUT1(cc);
443: }
444: }
445: pos(flg){
446: int ii,j;
447: if(flg == 1)yytext[0] = tolower(yytext[0]);
448: for(ii=yyleng-1;yytext[ii] != '\''; ii--);
449: yytext[ii] = '\0';
450: if((j=lookup(yytext,1,0)) != 0){
451: yyleng = ii;
452: OUT1(j);
453: OUTN(qs);
454: }
455: else{
456: if(flg==1 && !NOCAPS)yytext[0] = toupper(yytext[0]);
457: yytext[ii] = '\'';
458: OUT1(POS);
459: }
460: }
461: char *filename="-";
462:
463: main(argc,argv)
464: int argc;
465: char *argv[];
466: {
467: register int rc=0;
468: putchar(':'); putchar('\n');
469: getd();
470: getab();
471: ygetd();
472: if(argc<=1) {
473: yylex();
474: OUTN(fin);
475: }else{
476: while(argc>1) {
477: if(freopen(argv[1],"r",stdin)==NULL) {
478: fprintf(stderr,"%s: cannot open\n", argv[1]);
479: rc++;
480: }else{
481: filename=argv[1];
482: yylex();
483: OUTN(fin);
484: }
485: argc--; argv++;
486: }
487: }
488: return(rc);
489: }
490: ahead(){
491: register int c;
492: if(isalnum((c=input()))){
493: yytext[yyleng++] = '.';
494: while(!isspace((c=input() )))
495: yytext[yyleng++] = c;
496: yytext[yyleng] = '\0';
497: unput(c);
498: return(1);
499: }
500: unput(c);
501: unput('.');
502: return(0);
503: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.