|
|
%{
/*
 * break out words, output cap + word(inverted)
 *
 * C prologue copied into the generated scanner: the output macros, the
 * dictionary/abbreviation sources pulled in with #include, and the global
 * flags through which the rule actions talk to each other.
 */

#ifndef lint
static char sccsid[] = "@(#)nwords.l 4.2 (Berkeley) 82/11/06";
#endif /* not lint */

#include <stdio.h>
#include <ctype.h>

/*
 * OUT()         write yytext back to front, then a newline (uses global i).
 * OUT1(nam)     write nam (as a single %c character), ':', yytext, newline.
 * OUTN(string)  write string, newline.
 *
 * OUT() is wrapped in do/while(0) so that it expands to exactly one
 * statement wherever it is used.
 */
#define OUT() do { for(i=yyleng-1;i>=0; i--)putchar(yytext[i]); putchar('\n'); } while(0)
#define OUT1(nam) printf("%c:%s\n",nam,yytext)
#define OUTN(string) printf("%s\n",string)
#include "names.h"
#include "nhash.c"
#include "dict.c"
#include "ydict.c"
#include "abbrev.c"
char nt[] = "D:n't";	/* line written after a word whose n't was stripped */
char qs[] = "c:'s";	/* line written by pos() after a word whose 's was split off */
char fin[] = "E:.";	/* line written where a sentence ends */
int NOCAPS = 0; /* if set all caps are turned to lower case */
int i,j;	/* scratch: loop index / lookup result used by the actions */
int dot = 0;	/* set by actions whose token ended in '.'; cleared when a word or fin is written */
int first = 1;	/* set after sentence-ending punctuation, cleared by the next word */
int qflg,nflg;	/* qflg: toggled by quote characters; nflg: n't was stripped from the current word */
int cap = 0;	/* set when a leading capital was folded; wd restores it unless NOCAPS */
%}
28: %p 3000
 /* %p, %a and %o are lex table-size declarations.  NOTE(review): the values are presumably sized for this rule set -- confirm against the lex in use before changing them. */
29: %a 3300
30: %o 4500
31:
 /* Character-class names used by the rules: L lower case, N digit, C upper case, A any letter, P letter or digit. */
32: L [a-z]
33: N [0-9]
34: C [A-Z]
35: A [a-zA-Z]
36: P [a-zA-Z0-9]
37:
38: %%
39: ^[.!].+[\n] { /* a whole line that starts with '.' or '!' is echoed after a ':' */
40: if(dot){ /* a '.' is still pending: write the end-of-sentence line first */
41: OUTN(fin);
42: dot = 0;
43: first = 1;
44: }
45: printf(":%s",yytext); /* yytext still ends with the matched newline */
46: }
47: May { /* "May": written as NOUN unless first is set, in which case it goes through the word lookup */
48: if(first == 0){
49: OUT1(NOUN);
50: }
51: else {
52: first = 0;
53: yytext[0] = tolower(yytext[0]);
54: cap = 1; /* lets wd put the capital back */
55: goto wd; /* wd is the label inside the {L}+ action */
56: }
57: }
58: "U.S." { /* literal "U.S." is written whole as NOUN */
59: OUT1(NOUN);
60: }
61: {C}{L}*'[s] { /* capitalized word followed by 's: pos(1) folds the capital and tries to split off the 's */
62: pos(1);
63: if(first==1)first=0;
64: }
65: {C}+['][s] { /* upper-case word followed by 's: written whole as POS */
66: if(NOCAPS)
67: for(i=0;i<yyleng;i++)
68: if(isupper(yytext[i]))yytext[i] = tolower(yytext[i]);
69: OUT1(POS);
70: }
71: {P}+([-]{P}+)+ { /* alphanumeric pieces joined by single hyphens */
72: if(NOCAPS)
73: for(i=0;i<yyleng;i++)
74: if(isupper(yytext[i]))yytext[i] = tolower(yytext[i]);
75: OUT1(NOUN_ADJ);
76: }
77: {C}{C}+ { /* two or more capitals in a row */
78: if(NOCAPS)
79: for(i=0;i<yyleng;i++)
80: yytext[i] = tolower(yytext[i]);
81: if((i=input()) == 's'){ /* peek one character: a following 's' is absorbed into the token */
82: yytext[yyleng++] = 's';
83: yytext[yyleng] = '\0';
84: OUT1(PNOUN);
85: }
86: else {
87: unput(i); /* not an 's': hand the character back to the scanner */
88: if(!NOCAPS)
89: for(i=0;i<yyleng;i++)yytext[i] = tolower(yytext[i]);
90: goto wd; /* the token is all lower case on both paths by now */
91: }
92: }
93: [LD][']{C}{L}* { /* L' or D' followed by a capitalized word */
94: if(NOCAPS){
95: yytext[0] = tolower(yytext[0]);
96: yytext[2] = tolower(yytext[2]); /* index 2 is the capital after the apostrophe */
97: }
98: OUT1(NOUN_ADJ);
99: }
100: {C}{L}* { /* capitalized word */
101: if(first==1)
102: first=0;
103: else cap = 1; /* capital seen while first was clear: remembered for wd */
104: if(yyleng==1 && yytext[0] == 'I'){ /* a lone "I" is looked up without folding */
105: cap = 0;
106: goto wd;
107: }
108: yytext[0] = tolower(yytext[0]);
109: goto wd;
110: }
111: {N}":"{N}{N} { /* digit, colon, two digits */
112: OUT1(NOUN_ADJ);
113: }
114: ({N}*[,])*({N}+".")+[ \t\n]+{C} { /* number ending in '.', then white space and a capital: the '.' is treated as a sentence end */
115: for(i=yyleng-1;i>0;i--) /* locate the last '.' in the match */
116: if(yytext[i] == '.')break;
117: unput(yytext[yyleng-1]); /* give the capital back so the next word starts with it */
118: yytext[i] = '\0'; /* cut the token at that '.' */
119: OUT1(NOUN_ADJ);
120: OUTN(fin);
121: first = 1;
122: }
123: ([hH]e"/"[sS]he)|([sS]he"/"[hH]e) { /* he/she or she/he */
124: if(NOCAPS)
125: if(isupper(yytext[0]))yytext[0] = tolower(yytext[0]);
126: OUT1(PRONS);
127: }
128: ([hH]is"/"[hH]er)|([hH]er"/"[hH]is) { /* his/her or her/his */
129: if(NOCAPS)
130: if(isupper(yytext[0]))yytext[0] = tolower(yytext[0]);
131: OUT1(POS);
132: }
133: [ \t`]*[a-zA-Z0-9.]*("\/"[a-zA-Z0-9.]+)+[']* { /* slash-joined token; the match may include leading blanks, tabs or backquotes */
134: if(yytext[yyleng-1] == '.'){
135: if(ahead() == 0)dot=1; /* ahead() returning 0 has pushed a '.' back onto the input */
136: }
137: if(NOCAPS)
138: for(i=0;i<yyleng;i++)
139: if(isupper(yytext[i]))yytext[i] = tolower(yytext[i]);
140: OUT1(NOUN_ADJ);
141: }
142: {N}+([,]{N}+)*("."{N}+)*[']*[s]* { /* digits with optional ,ddd groups and .ddd parts, optional ' and s */
143: OUT1(NOUN_ADJ);
144: }
145: {N}*([,]{N}+)*("."{N}+)+[']*[s]* { /* same, but the leading digits are optional and at least one .ddd part is required */
146: OUT1(NOUN_ADJ);
147: }
148: {N}+([,]{N}+)*("."{N}*)*[']*[s]* { /* same, but a '.' need not be followed by digits */
149: if(yytext[yyleng-1] == '.')dot=1; /* token ends in '.': record it in dot */
150: OUT1(NOUN_ADJ);
151: }
({A}*{N}+{A}*)+ {
		/*
		 * Letters mixed with at least one digit.  One character of
		 * look-ahead decides whether a '.' continues the token: if it
		 * is a '.', ahead() either appends the continuation to yytext
		 * or pushes the '.' back.  Any other character is returned to
		 * the scanner; without the unput() it would be silently
		 * dropped from the input.
		 */
		i = input();
		if(i == '.')
			ahead();
		else if(i > 0)	/* nothing to give back at end of input */
			unput(i);
		if(NOCAPS)
			for(i=0;i<yyleng;i++)
				if(isupper(yytext[i]))yytext[i]=tolower(yytext[i]);
		OUT1(NOUN_ADJ);
}
160: {N}+[%] { /* digits followed by a percent sign */
161: OUT1(NOUN_ADJ);
162: }
163: "$"{N}+([,]{N}+)*("."{N}*)* { /* dollar sign followed by a number */
164: if(yytext[yyleng-1] == '.')dot=1; /* token ends in '.': record it in dot */
165: OUT1(NOUN);
166: }
167: [Aa]"."[ ]*[Mm]"." { /* a.m. in either case, blanks allowed between the two parts */
168: OUT1(ADJ_ADV);
169: }
170: [Pp]"."[ ]*[Mm]"." { /* p.m. */
171: OUT1(ADJ_ADV);
172: }
173: "a."[ ]*"d." { /* a.d. */
174: OUT1(ADJ_ADV);
175: }
176: "b."[ ]*"c." { /* b.c. */
177: OUT1(ADJ_ADV);
178: }
179: "i."[ ]*"e." { /* i.e. */
180: OUT1(PREP);
181: }
182: "e."[ ]*"g." { /* e.g. */
183: OUT1(PREP);
184: }
185: "etc."[ \n]*[,)]* { /* etc. plus any following blanks/newlines and commas/right parens */
186: i = yytext[4]; /* save the character after "etc." so the token can be cut there */
187: yytext[4] = '\0';
188: OUT1(NOUN);
189: yytext[4] = i;
190: yytext[0] = yytext[yyleng-1]; /* reduce yytext to the last matched character */
191: yytext[1] = '\0';
192: if(yytext[0] == ',' || yytext[0] == ')')
193: OUT1(',');
194: else { /* match did not end in ',' or ')': the '.' of "etc." is taken as a sentence end */
195: OUTN(fin);
196: first = 1;
197: }
198: }
199: "et al." {
200: OUT1(NOUN);
201: }
202: in"."[ \n]*{C} { /* "in." followed by a capital: treated as the word "in" ending a sentence */
203: unput(yytext[yyleng-1]); /* give the capital back to the scanner */
204: yytext[2] = '\0'; /* keep only "in" */
205: OUT1(PREP);
206: OUTN(fin);
207: first = 1;
208: }
209: Ph"."[ ]*[Dd]"." { /* Ph.D. / Ph.d. */
210: OUT1(ADJ);
211: }
212: [A-Z]"." { /* single capital followed by '.' */
213: dot=1;
214: OUT1(NOUN);
215: }
216: can't { /* special case: only 't is removed, leaving "can" */
217: yytext[3]='\0';
218: yyleng -= 2;
219: nflg=1; /* wd writes the nt line after the word when the lookup succeeds */
220: goto wd;
221: }
222: won't { /* written whole with the literal code 'X' */
223: OUT1('X');
224: }
225: ain't { /* written whole with the literal code 'g' */
226: OUT1('g');
227: }
228: {L}+n't { /* lower-case word ending in n't: strip the three characters and look up the rest */
229: nflg=1;
230: yytext[yyleng-3]='\0';
231: yyleng -= 3;
232: goto wd;
233: }
234: [A-Z]{L}+n't { /* same for a capitalized word; the capital is folded and cap is not set */
235: yytext[0] = tolower(yytext[0]);
236: nflg=1;
237: yytext[yyleng-3]='\0';
238: yyleng -= 3;
239: goto wd;
240: }
241: o'clock {
242: OUT1(ADV);
243: }
244: {L}+'[s] { /* lower-case word followed by 's: handled by pos() without case folding */
245: pos(0);
246: }
247: 'll { /* each contraction suffix below is written with the code lookup() returns for the full word */
248: OUT1(lookup("will",1,0));
249: }
250: 've {
251: OUT1(lookup("have",1,0));
252: }
253: 're {
254: OUT1(lookup("are",1,0));
255: }
256: 'd {
257: OUT1(lookup("had",1,0));
258: }
259: 'm {
260: OUT1(lookup("am",1,0));
261: }
262: 'ld {
263: OUT1(lookup("would",1,0));
264: }
{L}+ {
	wd:
		/*
		 * Common tail for word tokens; other actions jump here with
		 * "goto wd" once yytext holds the folded word.
		 *
		 * lookup() hit:  the code it returns is written with the word
		 *                (capital restored when cap is set and NOCAPS
		 *                is not), followed by the nt line if n't had
		 *                been stripped.
		 * lookup() miss: words ending in 'y' (with cap clear) are
		 *                classified by suffix through look(); other
		 *                words are written as NOUN_ADJ when cap is set
		 *                and reversed by OUT() otherwise.
		 *
		 * NOTE(review): on a miss nflg is left set, so the nt line is
		 * deferred to the next word that is found -- confirm whether
		 * that is intended.
		 */
		if((j = lookup(yytext,1,0)) != 0){
			first=0;
			if(cap){
				if(!NOCAPS)
					yytext[0] = toupper(yytext[0]);
				cap = 0;
				if(dot)OUTN(fin);
			}
			dot=0;
			OUT1(j);
			if(nflg==1){
				nflg=0;
				OUTN(nt);
			}
		}
		else{
			first = dot=0;
			/*
			 * The suffix tests read the two characters before the
			 * final 'y'.  Require at least two characters here and
			 * substitute NUL for a missing third-from-last one, so
			 * short words never index in front of yytext.
			 */
			if(yyleng > 1 && yytext[yyleng-1] == 'y' && cap == 0){
				int ante;	/* character before the two-letter suffix, or NUL */

				ante = yyleng > 2 ? yytext[yyleng-3] : '\0';
				switch(yytext[yyleng-2]){
				case 'c': look(cy,yyleng-2,NOUN);
					break;
				case 'f': look(fy,yyleng-2,VERB);
					break;
				case 'l': look(ly,yyleng-2,ADV);
					break;
				case 'g': if(ante == 'o'){	/* -ogy */
						OUT1(NOUN);
						break;
					}
					look(gy,yyleng-2,ADJ);
					break;
				case 'r': switch(ante){
					case 'a': look(ary,yyleng-3,ADJ);
						break;
					case 'o': look(ory,yyleng-3,ADJ);
						break;
					case 'e': look(ery,yyleng-3,NOUN);
						break;
					default: look(ry,yyleng-2,NOUN);
					}
					break;
				case 't': if(ante == 'i')look(ity,yyleng-3,NOUN);
					else look(ty,yyleng-2,ADJ);
					break;
				default: OUT();
				}
			}
			else {
				if(cap){
					if(!NOCAPS)yytext[0] = toupper(yytext[0]);
					cap = 0;
					OUT1(NOUN_ADJ);
				}
				else {
					OUT();
				}
			}
		}
}
326: [\n] ; /* newlines, blanks and tabs between tokens produce no output */
327: [ ]+ ;
328: [\t]+ ;
329: ";" { /* semicolon: written with its own code; first is set again */
330: OUT1(';');
331: first=1;
332: }
333: (\"|`|')+ { /* any run of double quotes, backquotes or apostrophes counts as one quote mark */
334: if(dot){ /* a '.' is still pending: write the end-of-sentence line first */
335: OUTN(fin);
336: dot=0;
337: }
338: if(qflg==1){ /* qflg toggles on every quote run; both branches write the same code */
339: qflg=0;
340: OUT1('"');
341: }
342: else {
343: qflg=1;
344: first=1; /* only the branch that sets qflg also sets first */
345: OUT1('"');
346: }
347: }
348: ".\"" { /* period immediately followed by a double quote */
349: qflg=0;
350: first=1;
351: OUT1(END);
352: }
353: "..." { /* ellipsis is written with the comma code */
354: OUT1(',');
355: }
356: "/." { /* slash followed by a period */
357: first = 1;
358: OUT1(END);
359: }
{A}{A}+"." {
		/*
		 * Word of two or more letters followed by '.'.
		 *
		 * The '.' is replaced by NUL so the bare word can be tested
		 * with abbrev().  A hit is written, '.' restored, with the code
		 * abbrev() returned.  Otherwise ahead() decides whether the
		 * '.' continues the token (written as NOUN_ADJ) or ends the
		 * word (looked up at wd, with the '.' pushed back).
		 */
		yytext[yyleng-1] = '\0';
		if((j=abbrev(yytext,1,0)) != 0){
			if(isupper(yytext[0])){
				if(NOCAPS)yytext[0] = tolower(yytext[0]);
				if(first == 1)first=0;
			}
			yytext[yyleng-1] = '.';
			OUT1(j);
		}
		else {
			/*
			 * Drop the '.' slot before calling ahead(): ahead()
			 * appends at yytext[yyleng], so this makes its '.' land
			 * on the NUL written above.  Leaving yyleng alone put
			 * the continuation behind that NUL, where it was never
			 * printed.
			 */
			yyleng--;
			j = ahead();
			for(i=0;i<yyleng;i++)
				if(isupper(yytext[i])){
					yytext[i] = tolower(yytext[i]);
					if(i == 0)cap = 1;
					else cap = 0;
				}
			if(j == 0)goto wd;
			OUT1(NOUN_ADJ);
		}
}
384: "." { /* sentence-ending marks: each sets first and is written with the END code */
385: first=1;
386: OUT1(END);
387: }
388: "!\"" { /* a mark immediately followed by a double quote also clears qflg */
389: qflg=0;
390: first=1;
391: OUT1(END);
392: }
393: "!" {
394: first=1;
395: OUT1(END);
396: }
397: "?\"" {
398: qflg=0;
399: first=1;
400: OUT1(END);
401: }
402: "?" {
403: first=1;
404: OUT1(END);
405: }
406: ":" { /* colon: written with the comma code; sets first */
407: OUT1(',');
408: first=1;
409: }
410: [-]+ { /* run of hyphens: written with the comma code; sets first */
411: OUT1(',');
412: first=1;
413: }
414: "," {
415: OUT1(',');
416: }
417: (\[|\(|\{|\]|\)|\}) { /* any single bracket, paren or brace is written with the comma code */
418: OUT1(',');
419: }
420: . { /* any other character is dropped without output */
421: /* fprintf(stderr,"nwords funny char: %c\n",yytext[0])*/ ;
422: }
423: %%
/*
 * look - write the current token with a code chosen from its stem.
 *
 * f   lookup routine, called as (*f)(yytext,1,0) on the stem
 * n   index in yytext where the stem ends; yytext[n] is set to NUL for
 *     the call and restored afterwards
 * cc  code to write when f returns 0
 *
 * The whole token (not just the stem) is written through OUT1.
 */
look(f,n,cc)
char (*f)();
int n;
char cc;
{
	char held;
	int code;

	held = yytext[n];
	yytext[n] = '\0';
	code = (*f)(yytext,1,0);
	yytext[n] = held;

	if(code == 0){
		OUT1(cc);
	}
	else {
		OUT1(code);
	}
}
/*
 * pos - handle a token that ends in an apostrophe suffix.
 *
 * flg is 1 when the token began with a capital: the first letter is folded
 * to lower case for the lookup.  The token is cut at its last apostrophe
 * and the front part passed to lookup().  On a hit the front part is
 * written with the returned code, followed by the qs line.  On a miss the
 * apostrophe is put back (and the capital too, when flg is 1 and NOCAPS is
 * clear) and the whole token is written as POS.
 */
pos(flg){
	int apos, code;

	if(flg == 1)
		yytext[0] = tolower(yytext[0]);

	/* scan backwards for the apostrophe; the calling rules guarantee one */
	apos = yyleng;
	do {
		apos--;
	} while(yytext[apos] != '\'');
	yytext[apos] = '\0';

	code = lookup(yytext,1,0);
	if(code == 0){
		if(flg == 1 && !NOCAPS)
			yytext[0] = toupper(yytext[0]);
		yytext[apos] = '\'';
		OUT1(POS);
	}
	else {
		yyleng = apos;
		OUT1(code);
		OUTN(qs);
	}
}
char *filename="-";	/* name of the file being scanned; "-" until a file argument is opened */

/*
 * main - write the leading ":" line, call getd(), getab() and ygetd(), then
 * run the scanner over standard input or over each file argument in turn.
 *
 * Every input that is scanned is followed by the fin line.  A file that
 * cannot be opened is reported on stderr and counted; the count is the
 * return value.
 */
main(argc,argv)
int argc;
char *argv[];
{
	register int failed = 0;
	register int k;

	putchar(':');
	putchar('\n');
	getd();
	getab();
	ygetd();

	if(argc < 2){
		/* no file arguments: scan standard input as it is */
		yylex();
		OUTN(fin);
		return(failed);
	}

	for(k = 1; k < argc; k++){
		/* each file is attached to stdin, which the scanner reads */
		if(freopen(argv[k],"r",stdin) != NULL){
			filename = argv[k];
			yylex();
			OUTN(fin);
		}
		else {
			fprintf(stderr,"%s: cannot open\n", argv[k]);
			failed++;
		}
	}
	return(failed);
}
/*
 * ahead - called when a '.' directly follows the current token; decides
 * whether that '.' continues the token.
 *
 * If the next input character is alphanumeric, '.' and every character up
 * to (not including) the next white space are appended to yytext, the
 * white-space character is pushed back, and 1 is returned.
 * Otherwise the character just read and a '.' are pushed back for the
 * scanner and 0 is returned; yytext is left untouched.
 *
 * Fixes over the earlier version: the first character after the '.' is now
 * stored instead of being dropped, the copy stops at end of input instead
 * of looping forever, and the copy is bounded by YYLMAX where the scanner
 * defines it.
 */
ahead(){
	register int c;

	c = input();
	if(isalnum(c)){
		yytext[yyleng++] = '.';
		do {
#ifdef YYLMAX
			if(yyleng >= YYLMAX-1)	/* keep room for the terminating NUL */
				break;
#endif
			yytext[yyleng++] = c;
			c = input();
		} while(c > 0 && !isspace(c));	/* input() yields a non-positive value at end of input */
		yytext[yyleng] = '\0';
		if(c > 0)	/* nothing to give back at end of input */
			unput(c);
		return(1);
	}
	if(c > 0)
		unput(c);
	unput('.');
	return(0);
}
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.