|
|
1.1 root 1: %{
2: /* break out words, output cap + word(inverted): every token is printed on its own line, either as tag:text (OUT1), as a fixed line (OUTN), or as the bare word spelled backwards (OUT) */
3: #include <stdio.h>
4: #define OUT() for(i=yyleng-1;i>=0; i--)putchar(yytext[i]); putchar('\n') /* print yytext reversed, then a newline; uses the global i and expands to two statements */
5: #define OUT1(nam) printf("%c:%s\n",nam,yytext) /* print one line: the tag character nam, a colon, then yytext */
6: #define OUTN(string) printf("%s\n",string) /* print a ready-made line such as nt, qs or fin */
7: #include "names.h" /* presumably supplies the tag names used in the rules (NOUN, ADJ, END, ...); not visible here */
8: #include "nhash.c" /* NOTE(review): nhash.c, dict.c and ydict.c presumably define lookup, getd, ygetd and the suffix tables passed to look; confirm */
9: #include "dict.c"
10: #include "ydict.c"
11: char nt[] = "D:n't"; /* line printed after a word whose negative suffix was stripped (see nflg) */
12: char qs[] = "c:'s"; /* line printed by pos after a possessive whose stem lookup recognized */
13: char fin[] = "E:."; /* end-of-sentence line printed when a deferred period turns out to end the sentence */
14: int i,j; /* scratch: i is the index used by OUT and by several rules, j holds the lookup result in the wd rule */
15: int dot = 0; /* 1 after a token that ended in a period which may also end the sentence */
16: int first = 1; /* 1 while the next word would be the first of a sentence or clause */
17: int qflg,nflg; /* qflg: 1 between an opening and a closing quote; nflg: 1 when a negative suffix is pending; both start at 0 as uninitialized globals */
18: int cap = 0; /* 1 when the current word was capitalized somewhere other than at a sentence start */
19: %}
20: %p 3000
21: %a 2500
22:
23: L [a-z]
24: N [0-9]
25: C [A-Z]
26:
27: %%
28: (St|Dr|Drs|Mr|Mrs|Ms)"." {
29: OUT1(NOUN); /* title abbreviation: printed with its period under the NOUN tag */
30: }
31: {C}{L}*'[s] {
32: pos(1); /* capitalized possessive: pos tries the stem with its initial lower-cased */
33: if(first==1)first=0; /* a word has been seen, so the sentence start is over */
34: }
35: {C}+['][s]* {
36: OUT1(POS); /* run of capitals followed by an apostrophe and optional s: printed whole as POS */
37: }
38: (({C}+{L}*)|({C}*{L}+))+([-](({C}*{L}+)|({C}+{L}*))+)+ {
39: OUT1(NOUN_ADJ); /* hyphenated compound of alphabetic parts */
40: }
41: {C}{C}+ {
42: if((i=input()) == 's'){ /* word in capitals: read one character ahead to catch a plural s */
43: yytext[yyleng++] = 's'; /* append the s to the token */
44: yytext[yyleng] = '\0';
45: OUT1(NOUN);
46: }
47: else {
48: unput(i); /* not an s: push the character back onto the input */
49: for(i=0;i<yyleng;i++)yytext[i]+= 'a' - 'A'; /* fold every letter to lower case */
50: goto wd; /* continue in the ordinary word rule; cap is not set on this path */
51: }
52: }
53: [LD][']{C}{L}* {
54: OUT1(NOUN_ADJ); /* L or D, an apostrophe, then a capitalized word */
55: }
56: {C}{L}* {
57: if(first==1) /* capitalized word: a capital at the sentence start is not recorded */
58: first=0;
59: else cap = 1; /* capital in mid-sentence: remembered so wd can restore it */
60: if(yyleng==1 && yytext[0] == 'I'){ /* the single letter I is looked up unchanged */
61: cap = 0;
62: goto wd;
63: }
64: yytext[0]+= 'a' - 'A'; /* lower-case the initial before the lookup in wd */
65: goto wd;
66: }
67: ({N}+[-]{N}+[-]*)+ {
68: OUT1(NOUN_ADJ); /* digit groups joined by hyphens */
69: }
70: ({N}+[-]*{L}+[-]*)+ {
71: OUT1(NOUN_ADJ); /* digits followed by lower-case letters, hyphens optional */
72: }
73: ({N}*[,])*({N}+".")+[ \t\n]+{C} {
74: for(i=yyleng-1;i>0;i--) /* number, period, white space, capital: find the last period in the match */
75: if(yytext[i] == '.')break;
76: unput(yytext[yyleng-1]); /* return the capital to the input for the next token */
77: yytext[i] = '\0'; /* cut the token just before that period */
78: OUT1(NOUN_ADJ);
79: OUTN(fin); /* the period is reported as the end of a sentence */
80: first = 1;
81: }
82: [ \t`][a-zA-Z0-9.]*("\/"[a-zA-Z0-9]+"."*)+[']* {
83: if(yytext[yyleng-1] == '.')dot=1; /* slash-separated token; a trailing period may also end the sentence */
84: OUT1(NOUN_ADJ); /* the leading blank, tab or backquote is part of yytext and is printed too */
85: }
86: {N}+([,]{N}+)*("."{N}+)*[']*[s]* {
87: OUT1(NOUN_ADJ); /* number with optional comma groups and decimal parts, optional apostrophe and s */
88: }
89: {N}*([,]{N}+)*("."{N}+)+[']*[s]* {
90: OUT1(NOUN_ADJ); /* number with at least one decimal part; leading digits optional */
91: }
92: {N}+([,]{N}+)*("."{N}*)*[']*[s]* {
93: if(yytext[yyleng-1] == '.')dot=1; /* here a period need not be followed by digits, so it may end the sentence */
94: OUT1(NOUN_ADJ);
95: }
96: {L}+[-]*{N}+ {
97: OUT1(NOUN_ADJ); /* lower-case letters followed by digits */
98: }
99: {C}+[-]*{N}+ {
100: OUT1(NOUN_ADJ); /* capitals followed by digits */
101: }
102: {N}+[-]+{C}+ {
103: OUT1(NOUN_ADJ); /* digits, one or more hyphens, capitals */
104: }
105: {N}+[%] {
106: OUT1(NOUN_ADJ); /* digits with a percent sign */
107: }
108: "$"{N}+([,]{N}+)*("."{N}*)* {
109: if(yytext[yyleng-1] == '.')dot=1; /* dollar amount; a trailing period may also end the sentence */
110: OUT1(NOUN);
111: }
112: [Aa]"."[ ]*[Mm]"." {
113: OUT1(ADJ_ADV); /* a.m. in either case, blanks allowed between the two parts */
114: }
115: [Pp]"."[ ]*[Mm]"." {
116: OUT1(ADJ_ADV); /* p.m. in either case */
117: }
118: "a."[ ]*"d." {
119: OUT1(ADJ_ADV); /* a.d. */
120: }
121: "b."[ ]*"c." {
122: OUT1(ADJ_ADV); /* b.c. */
123: }
124: "i."[ ]*"e." {
125: OUT1(PREP); /* i.e. */
126: }
127: "e."[ ]*"g." {
128: OUT1(PREP); /* e.g. */
129: }
130: "etc."[ \n]*[,)]* {
131: i = yytext[4]; /* save whatever follows the four characters of etc. */
132: yytext[4] = '\0'; /* so that only etc. itself is printed */
133: OUT1(NOUN);
134: yytext[4] = i; /* put the saved character back */
135: yytext[0] = yytext[yyleng-1]; /* reduce the token to the last character matched */
136: yytext[1] = '\0';
137: if(yytext[0] == ',' || yytext[0] == ')') /* punctuation follows: the sentence goes on */
138: OUT1(','); /* print that character under the comma tag */
139: else { /* otherwise the period of etc. is treated as the end of the sentence */
140: OUTN(fin);
141: first = 1;
142: }
143: }
144: "et al." {
145: OUT1(NOUN); /* et al. kept as one token */
146: }
147: [Nn][Oo][s]*"." {
148: OUT1(NOUN_ADJ); /* no. or nos. in any case */
149: }
150: [Ff]ig[s]*"." {
151: OUT1(NOUN_ADJ); /* fig. or figs. */
152: }
153: [Dd]ept[s]*"." {
154: OUT1(NOUN_ADJ); /* dept. or depts. */
155: }
156: [Ee]q"." {
157: OUT1(NOUN_ADJ); /* eq. */
158: }
159: dB"." {
160: OUT1(NOUN_ADJ); /* dB. */
161: }
162: vs"." {
163: OUT1(PREP); /* vs. */
164: }
165: in"."[ \n]*{C} {
166: unput(yytext[yyleng-1]); /* in. followed by a capital: hand the capital back to the input */
167: yytext[2] = '\0'; /* keep only the two letters of the word */
168: OUT1(PREP);
169: OUTN(fin); /* and report the period as the end of a sentence */
170: first = 1;
171: }
172: (in|ft|yr|ckts|mi)"." {
173: OUT1(NOUN_ADJ); /* abbreviations in, ft, yr, ckts, mi with their period */
174: }
175: Ph"."[ ]*[Dd]"." {
176: OUT1(ADJ); /* Ph.D. with optional blanks inside */
177: }
178: [Jj]r"." {
179: OUT1(ADJ); /* Jr. */
180: }
181: [Cc]h"." {
182: OUT1(NOUN_ADJ); /* ch. */
183: }
184: [Rr]ef[s]*"." {
185: OUT1(NOUN_ADJ); /* ref. or refs. */
186: }
187: Inc"." {
188: OUT1(ADJ); /* Inc. */
189: }
190: [A-Z]"." {
191: dot=1; /* single capital plus period: the period might still end the sentence */
192: OUT1(NOUN);
193: }
194: can't {
195: yytext[3]='\0'; /* keep the first three letters; the generic rule further down would strip three characters and leave only two */
196: yyleng -= 2; /* five characters matched, three kept */
197: nflg=1; /* remember the negation so wd prints the nt line */
198: goto wd;
199: }
200: won't {
201: OUT1('X'); /* printed whole under the literal tag X, no stem lookup */
202: }
203: {L}+n't {
204: nflg=1; /* other negative contractions: remember the negation */
205: yytext[yyleng-3]='\0'; /* strip the three-character suffix */
206: yyleng -= 3;
207: goto wd;
208: }
209: [A-Z]{L}+n't {
210: yytext[0]+= 'a' - 'A'; /* capitalized form: lower-case the initial; cap and first are left untouched */
211: nflg=1;
212: yytext[yyleng-3]='\0'; /* strip the three-character suffix */
213: yyleng -= 3;
214: goto wd;
215: }
216: o'clock {
217: OUT1(ADV); /* kept as one token */
218: }
219: {L}+'[s] {
220: pos(0); /* lower-case possessive: no case folding needed in pos */
221: }
222: 'll {
223: OUT1(lookup("will",1,0)); /* the contraction text is printed under the tag lookup returns for the full word */
224: }
225: 've {
226: OUT1(lookup("have",1,0)); /* tagged like have */
227: }
228: 're {
229: OUT1(lookup("are",1,0)); /* tagged like are */
230: }
231: 'd {
232: OUT1(lookup("had",1,0)); /* tagged like had */
233: }
234: 'm {
235: OUT1(lookup("am",1,0)); /* tagged like am */
236: }
237: 'ld {
238: OUT1(lookup("would",1,0)); /* tagged like would */
239: }
{L}+ {
	/*
	 * Ordinary lower-case word.  Other rules jump to the label wd after
	 * normalizing yytext (folding case, stripping a contraction suffix).
	 *
	 * A word that lookup recognizes is printed as tag:word.  An unknown
	 * word ending in y (and not flagged by cap) is classified by its
	 * suffix through look.  Any other unknown word is printed reversed
	 * by OUT, or as NOUN_ADJ with its capital restored when cap is set.
	 *
	 * Two fixes relative to the earlier version of this rule:
	 *  - the suffix tests index yytext[yyleng-2] and yytext[yyleng-3];
	 *    they are now guarded so that one- and two-letter words never
	 *    read or write in front of the token;
	 *  - a pending negative suffix (nflg) is flushed for unknown words
	 *    as well, instead of leaking onto a later, unrelated word.
	 */
wd:
	if((j = lookup(yytext,1,0)) != 0){
		first = 0;
		if(cap){
			/* give the word back the capital that was folded away */
			yytext[0] += 'A' - 'a';
			cap = 0;
			/* capital after a deferred period: that period ended a sentence */
			if(dot)
				OUTN(fin);
		}
		dot = 0;
		OUT1(j);
	}
	else{
		first = dot = 0;
		if(yyleng >= 2 && yytext[yyleng-1] == 'y' && cap == 0){
			/* third character from the end, or 0 when the word has only two */
			int third = (yyleng >= 3) ? yytext[yyleng-3] : 0;

			switch(yytext[yyleng-2]){
			case 'c':
				look(cy,yyleng-2,NOUN);
				break;
			case 'f':
				look(fy,yyleng-2,VERB);
				break;
			case 'l':
				look(ly,yyleng-2,ADV);
				break;
			case 'g':
				if(third == 'o'){
					OUT1(NOUN);
					break;
				}
				look(gy,yyleng-2,ADJ);
				break;
			case 'r':
				/* the three-letter suffixes need a real third character */
				switch(third){
				case 'a':
					look(ary,yyleng-3,ADJ);
					break;
				case 'o':
					look(ory,yyleng-3,ADJ);
					break;
				case 'e':
					look(ery,yyleng-3,NOUN);
					break;
				default:
					look(ry,yyleng-2,NOUN);
				}
				break;
			case 't':
				if(third == 'i')
					look(ity,yyleng-3,NOUN);
				else
					look(ty,yyleng-2,ADJ);
				break;
			default:
				OUT();
			}
		}
		else {
			if(cap){
				yytext[0] += 'A' - 'a';
				cap = 0;
				OUT1(NOUN_ADJ);
			}
			else {
				OUT();
			}
		}
	}
	/* the stripped negative suffix follows the word it belonged to */
	if(nflg==1){
		nflg = 0;
		OUTN(nt);
	}
}
300: [\n] ; /* newlines produce no output */
301: [ ]+ ; /* runs of blanks produce no output */
302: [\t]+ ; /* runs of tabs produce no output */
303: ";" {
304: OUT1(';'); /* semicolon: printed with itself as the tag */
305: first=1; /* the next capitalized word will not be flagged in cap */
306: }
307: (\"|`|')+ {
308: if(dot){ /* run of quote characters: a deferred period is now reported as a sentence end */
309: OUTN(fin);
310: dot=0;
311: }
312: if(qflg==1){ /* a quotation was open: this run closes it */
313: qflg=0;
314: OUT1('"');
315: }
316: else { /* no quotation open: this run opens one */
317: qflg=1;
318: first=1; /* the quoted text starts like a new sentence */
319: OUT1('"');
320: }
321: }
322: ".\"" {
323: qflg=0; /* period followed by a double quote: the quotation is closed */
324: first=1;
325: OUT1(END); /* and the sentence ends */
326: }
327: "..." {
328: OUT1(','); /* ellipsis: printed under the comma tag, no sentence end */
329: }
330: "/." {
331: first = 1; /* slash followed by a period: treated as a sentence end */
332: OUT1(END);
333: }
334: "." {
335: first=1; /* plain period: sentence end */
336: OUT1(END);
337: }
338: "!\"" {
339: qflg=0; /* exclamation mark followed by a double quote: closes the quotation */
340: first=1;
341: OUT1(END);
342: }
343: "!" {
344: first=1; /* exclamation mark: sentence end */
345: OUT1(END);
346: }
347: "?\"" {
348: qflg=0; /* question mark followed by a double quote: closes the quotation */
349: first=1;
350: OUT1(END);
351: }
352: "?" {
353: first=1; /* question mark: sentence end */
354: OUT1(END);
355: }
356: ":" {
357: OUT1(','); /* colon: printed under the comma tag */
358: first=1; /* but the next capitalized word is not flagged in cap */
359: }
360: [-]+ {
361: OUT1(','); /* run of hyphens standing alone: comma tag */
362: first=1;
363: }
364: "," {
365: OUT1(','); /* comma */
366: }
367: (\[|\(|\{|\]|\)|\}) {
368: OUT1(','); /* any single bracket, parenthesis or brace: comma tag */
369: }
370: . {
371: /* fprintf(stderr,"nwords funny char: %c\n",yytext[0])*/ ; /* any other character is dropped silently; the diagnostic is disabled */
372: }
373: %%
374: look(f,n,cc)
375: char (*f)();
376: int n;
377: char cc;
378: {
379: int nn;
380: char save;
381: save=yytext[n];
382: yytext[n] = '\0';
383: nn=(*f)(yytext,1,0);
384: yytext[n] = save;
385: if(nn != 0){
386: OUT1(nn);
387: }
388: else {
389: OUT1(cc);
390: }
391: }
392: pos(flg){
393: int ii,j;
394: if(flg==1)yytext[0] += 'a' - 'A';
395: for(ii=yyleng-1;yytext[ii] != '\''; ii--);
396: yytext[ii] = '\0';
397: if((j=lookup(yytext,1,0)) != 0){
398: yyleng = ii;
399: OUT1(j);
400: OUTN(qs);
401: }
402: else{
403: if(flg==1)yytext[0] += 'A' - 'a';
404: yytext[ii] = '\'';
405: OUT1(POS);
406: }
407: }
/* name of the file being scanned; "-" until a file argument is opened */
char *filename="-";

/*
 * main - scan standard input, or each file named on the command line.
 *
 * Output starts with a line holding a single colon.  getd and ygetd are
 * called once before scanning (NOTE(review): presumably they set up the
 * dictionaries used by lookup; they are defined outside this section).
 * Each file argument is attached to stdin with freopen and scanned with
 * yylex.  A file that cannot be opened is reported on stderr and
 * counted; the count is the return value.
 */
main(argc,argv)
int argc;
char *argv[];
{
	register int rc=0;

	putchar(':');
	putchar('\n');
	getd();
	ygetd();

	/* no file arguments: scan standard input as it is */
	if(argc<=1){
		yylex();
		return(rc);
	}

	for(; argc>1; argc--, argv++){
		if(freopen(argv[1],"r",stdin)!=NULL){
			filename=argv[1];
			yylex();
			continue;
		}
		fprintf(stderr,"%s: cannot open\n", argv[1]);
		rc++;
	}
	return(rc);
}
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.