|
|
1.1 root 1: /*
2: * The lexical analyzer.
3: */
4:
5: #include "itran.h"
6: #include "token.h"
7: #include "lex.h"
8: #include "char.h"
9: #include "tree.h"
10:
/*
 * Source position of the current token.  yylex copies the scanner's
 * input line and column into these each time it locates a token.
 */
int tline, tcol;
13:
14: /*
15: * yylex - find the next token in the input stream, and return its token
16: * type and value to the parser.
17: *
18: * Variables of interest:
19: *
20: * cc - character following last token.
21: * comflag - set if in a comment.
22: * nlflag - set if a newline was between the last token and the current token
23: * lastend - set if the last token was an ENDER.
24: * lastval - when a semicolon is inserted and returned, lastval gets the
25: * token value that would have been returned if the semicolon hadn't
26: * been inserted.
27: */
28:
29: yylex()
30: {
31: register struct toktab *t;
32: register int c;
33: int nlflag;
34: int comflag;
35: static struct toktab *lasttok = NULL;
36: static nodeptr lastval;
37: static int lastend = 0;
38: static int eofflag = 0;
39: static int lastline = 0;
40: static int cc = '\n';
41: extern struct toktab *getident(), *getnum(), *getstring(), *getop();
42:
43: if (lasttok != NULL) {
44: /*
45: * A semicolon was inserted and returned on the last call to yylex,
46: * instead of going to the input, return lasttok and set the
47: * appropriate variables.
48: */
49: yylval = lastval;
50: tline = LINE(lastval);
51: tcol = COL(lastval);
52: t = lasttok;
53: goto ret;
54: }
55: nlflag = 0;
56: comflag = 0;
57: loop:
58: c = cc;
59: /*
60: * Skip whitespace and comments.
61: */
62: while (c != EOF && (comflag || c == COMMENT || isspace(c))) {
63: if (c == '\n') {
64: nlflag++;
65: comflag = 0;
66: }
67: else if (c == COMMENT)
68: comflag++;
69: c = NEXTCHAR;
70: }
71: /*
72: * A token is the next thing in the input. Record the last line number
73: * and set tline and tcol to the current line and column.
74: */
75: lastline = tline;
76: tline = inline;
77: tcol = incol;
78:
79: if (c == EOF) {
80: /*
81: * End of file has been reached. Set eofflag, return T_EOF, and
82: * set cc to EOF so that any subsequent scans also return T_EOF.
83: */
84: if (eofflag++) {
85: eofflag = 0;
86: cc = '\n';
87: return (int) (yylval = 0);
88: }
89: cc = EOF;
90: t = T_EOF;
91: yylval = 0;
92: goto ret;
93: }
94:
95: /*
96: * Look at current input character to determine what class of token
97: * is next and take the appropriate action. Note that the various
98: * token gathering routines write a value into cc.
99: */
100: c = ctran[c];
101: if (isalpha(c)) { /* gather ident or reserved word */
102: if ((t = getident(c, &cc)) == NULL)
103: goto loop;
104: }
105: else if (isdigit(c)) { /* gather numeric literal */
106: if ((t = getnum(c, &cc)) == NULL)
107: goto loop;
108: }
109: else if (c == '"' || c == '\'') { /* gather string or cset literal */
110: if ((t = getstring(c, &cc)) == NULL)
111: goto loop;
112: }
113: else { /* gather longest legal operator */
114: if ((t = getop(c, &cc)) == NULL)
115: goto loop;
116: yylval = OPNODE(t->t_type);
117: }
118: if (nlflag && lastend && (t->t_flags & BEGINNER)) {
119: /*
120: * A newline was encountered between the current token and the last,
121: * the last token was an ENDER, and the current token is a BEGINNER.
122: * Return a semicolon and save the current token in lastval.
123: */
124: lastval = yylval;
125: lasttok = t;
126: tline = lastline;
127: tcol = 0;
128: yylval = OPNODE(SEMICOL);
129: return (SEMICOL);
130: }
131: ret:
132: /*
133: * Clear lasttok, set lastend if the token being returned is an
134: * ENDER, and return the token.
135: */
136: lasttok = 0;
137: lastend = t->t_flags & ENDER;
138: return (t->t_type);
139: }
140:
141: /*
142: * getident - gather an identifier beginning with ac. The character
143: * following identifier goes in cc.
144: */
145:
146: struct toktab *getident(ac, cc)
147: char ac;
148: int *cc;
149: {
150: register c;
151: register char *p;
152: register struct toktab *t;
153: extern char *putident();
154: extern struct toktab *findres();
155:
156: c = ac;
157: p = sfree;
158: /*
159: * Copy characters into string space until a non-alphanumeric character
160: * is found.
161: */
162: do {
163: if (p >= send)
164: syserr("out of string space");
165: *p++ = c;
166: c = ctran[NEXTCHAR];
167: } while (isalnum(c));
168: if (p >= send)
169: syserr("out of string space");
170: *p++ = 0;
171: *cc = c;
172: /*
173: * If the identifier is a reserved word, make a RESNODE for it and return
174: * the token value. Otherwise, install it with putident, make an
175: * IDNODE for it, and return.
176: */
177: if ((t = findres()) != NULL) {
178: yylval = RESNODE(t->t_type);
179: return (t);
180: }
181: else {
182: yylval = IDNODE((int)putident(p-sfree));
183: return (T_IDENT);
184: }
185: }
186:
187: /*
188: * findres - if the string just copied into the string space by getident
189: * is a reserved word, return a pointer to its entry in the token table.
190: * Return NULL if the string isn't a reserved word.
191: */
192:
193: struct toktab *findres()
194: {
195: register struct toktab *t;
196: register char c, *p;
197:
198: p = sfree;
199: c = *p;
200: if (!islower(c))
201: return (NULL);
202: /*
203: * Point t at first reserved word that starts with c (if any).
204: */
205: if ((t = restab[c - '_']) == NULL)
206: return (NULL);
207: /*
208: * Search through reserved words, stopping when a match is found
209: * or when the current reserved word doesn't start with c.
210: */
211: while (t->t_word[0] == c) {
212: if (strcmp(t->t_word, p) == 0)
213: return (t);
214: t++;
215: }
216: return (NULL);
217: }
218:
219: /*
220: * getnum - gather a numeric literal starting with ac and put the
221: * character following the literal into *cc.
222: */
223:
224: struct toktab *getnum(ac, cc)
225: char ac;
226: int *cc;
227: {
228: register c;
229: register r;
230: register state;
231: char *p;
232: int realflag;
233: extern char *putident();
234:
235: c = ac;
236: r = tonum(c);
237: p = sfree;
238: state = 0;
239: realflag = 0;
240: for (;;) {
241: if (p >= send)
242: syserr("out of string space");
243: *p++ = c;
244: c = ctran[NEXTCHAR];
245: switch (state) {
246: case 0: /* integer part */
247: if (isdigit(c)) { r = r * 10 + tonum(c); continue; }
248: if (c == '.') { state = 1; realflag++; continue; }
249: if (tolower(c) == 'e') { state = 2; realflag++; continue; }
250: if (tolower(c) == 'r') {
251: state = 5;
252: if (r < 2 || r > 36)
253: err("invalid radix for integer literal", 0);
254: continue;
255: }
256: break;
257: case 1: /* fractional part */
258: if (isdigit(c)) continue;
259: if (tolower(c) == 'e') { state = 2; continue; }
260: break;
261: case 2: /* optional exponent sign */
262: if (c == '+' || c == '-') { state = 3; continue; }
263: case 3: /* first digit after e, e+, or e- */
264: if (isdigit(c)) { state = 4; continue; }
265: err("invalid real literal", 0);
266: break;
267: case 4: /* remaining digits after e */
268: if (isdigit(c)) continue;
269: break;
270: case 5: /* first digit after r */
271: if ((isdigit(c) || isletter(c)) && tonum(c) < r)
272: { state = 6; continue; }
273: err("invalid integer literal", 0);
274: break;
275: case 6: /* remaining digits after r */
276: if (isdigit(c) || isletter(c)) {
277: if (tonum(c) >= r) { /* illegal digit for radix r */
278: err("invalid digit in integer literal", 0);
279: r = tonum('z'); /* prevent more messages */
280: }
281: continue;
282: }
283: break;
284: }
285: break;
286: }
287: if (p >= send)
288: syserr("out of string space");
289: *p++ = 0;
290: *cc = c;
291: if (realflag) {
292: yylval = REALNODE((int)putident(p-sfree));
293: return (T_REAL);
294: }
295: yylval = INTNODE((int)putident(p-sfree));
296: return (T_INT);
297: }
298:
299: /*
300: * getstring - gather a string literal starting with ac and place the
301: * character following the literal in *cc.
302: */
303:
304: struct toktab *getstring(ac, cc)
305: char ac;
306: int *cc;
307: {
308: register c, sc;
309: register char *p;
310: char *lc;
311: extern char *putident();
312:
313: sc = c = ac;
314: p = sfree;
315: lc = 0;
316: while ((c = NEXTCHAR) != sc && c != '\n' && c != EOF) {
317: contin:
318: if (c == '_')
319: lc = p;
320: else if (!isspace(c))
321: lc = 0;
322: if (ctran[c] == ESCAPE) {
323: c = NEXTCHAR;
324: if (isoctal(c))
325: c = octesc(c);
326: else if (ctran[c] == 'x')
327: c = hexesc();
328: else if (ctran[c] == '^')
329: c = ctlesc();
330: else
331: c = esctab[c];
332: if (c == EOF)
333: goto noquote;
334: }
335: if (p >= send)
336: syserr("out of string space");
337: *p++ = c;
338: }
339: if (p >= send)
340: syserr("out of string space");
341: *p++ = 0;
342: if (c == sc)
343: *cc = ' ';
344: else {
345: if (c == '\n' && lc) {
346: p = lc;
347: while ((c = NEXTCHAR) != EOF && isspace(c)) ;
348: if (c != EOF)
349: goto contin;
350: }
351: noquote:
352: err("unclosed quote", 0);
353: *cc = c;
354: }
355: if (ac == '"') { /* a string literal */
356: yylval = STRNODE((int)putident(p-sfree), p-sfree);
357: return (T_STRING);
358: }
359: else { /* a cset literal */
360: yylval = CSETNODE((int)putident(p-sfree), p-sfree);
361: return (T_CSET);
362: }
363: }
364:
365: /*
366: * ctlesc - translate a control escape -- backslash followed by
367: * caret and one character.
368: */
369:
370: ctlesc()
371: {
372: register c;
373:
374: c = NEXTCHAR;
375: if (c == EOF)
376: return (EOF);
377: return (c & 037);
378: }
379:
380: /*
381: * octesc - translate an octal escape -- backslash followed by
382: * one, two, or three octal digits.
383: */
384:
385: octesc(ac)
386: char ac;
387: {
388: register c, nc, i;
389:
390: c = 0;
391: nc = ac;
392: i = 1;
393: do {
394: c = (c << 3) | (nc - '0');
395: nc = NEXTCHAR;
396: if (nc == EOF)
397: return (EOF);
398: } while (isoctal(nc) && i++ < 3);
399: PUSHCHAR(nc);
400: return (c & 0377);
401: }
402:
403: /*
404: * hexesc - translate a hexadecimal escape -- backslash-x
405: * followed by one or two hexadecimal digits.
406: */
407:
408: hexesc()
409: {
410: register c, nc, i;
411:
412: c = 0;
413: i = 0;
414: while (i++ < 2) {
415: nc = NEXTCHAR;
416: if (nc == EOF)
417: return (EOF);
418: if (nc >= 'a' && nc <= 'f')
419: nc -= 'a' - 10;
420: else if (nc >= 'A' && nc <= 'F')
421: nc -= 'A' - 10;
422: else if (isdigit(nc))
423: nc -= '0';
424: else {
425: PUSHCHAR(nc);
426: break;
427: }
428: c = (c << 4) | nc;
429: }
430: return (c);
431: }
432:
433: /*
434: * getop - find the longest legal operator and return a pointer
435: * to its entry in the token table. The tour describes the
436: * operator recognition process in detail.
437: */
438:
439: struct toktab *getop(ac, cc)
440: char ac;
441: int *cc;
442: {
443: register struct optab *state;
444: register char c, i;
445:
446: state = state0;
447: c = ac;
448: for (;;) {
449: while ((i = state->o_input) && c != i)
450: state++;
451: switch (state->o_action) {
452: case A_GOTO:
453: state = (struct optab *) state->o_val;
454: c = ctran[NEXTCHAR];
455: continue;
456: case A_ERROR:
457: err("invalid character", 0);
458: *cc = ' ';
459: return (NULL);
460: case A_RETURN:
461: *cc = c;
462: return (struct toktab *) (state->o_val);
463: case A_IMMRET:
464: *cc = ' ';
465: return (struct toktab *) (state->o_val);
466: }
467: }
468: }
469:
470: /*
471: * nextchar - return the next character in the input.
472: */
473:
474: nextchar()
475: {
476: register char c;
477:
478: if (c = peekc) {
479: peekc = 0;
480: return (c);
481: }
482: c = getc(infile);
483: switch (c) {
484: case EOF:
485: inline = 0;
486: incol = 0;
487: break;
488: case '\n':
489: inline++;
490: incol = 0;
491: break;
492: case '\t':
493: incol = (incol | 7) + 1;
494: break;
495: case '\b':
496: if (incol)
497: incol--;
498: break;
499: default:
500: incol++;
501: }
502: return (c);
503: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.