|
|
1.1 root 1: /* Token-reader for Bison's input parser,
2: Copyright (C) 1984, 1986, 1989 Free Software Foundation, Inc.
3:
4: This file is part of Bison, the GNU Compiler Compiler.
5:
6: Bison is free software; you can redistribute it and/or modify
7: it under the terms of the GNU General Public License as published by
8: the Free Software Foundation; either version 2, or (at your option)
9: any later version.
10:
11: Bison is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with Bison; see the file COPYING. If not, write to
18: the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
19:
20:
21: /*
22: lex() is the entry point. It is called from reader.c.
23: It returns one of the token-type codes defined in lex.h.
24: When an identifier is seen, the code IDENTIFIER is returned
25: and the name is looked up in the symbol table using symtab.c;
26: symval is set to a pointer to the entry found. */
27:
28: #include <stdio.h>
29: #include <ctype.h>
30: #include "system.h"
31: #include "files.h"
32: #include "symtab.h"
33: #include "lex.h"
34: #include "new.h"
35:
36:
37: extern int lineno; /* current input line; this file increments it for every newline it consumes */
38: extern int translations; /* set to -1 by lex() whenever a character literal token is read */
39:
40: int parse_percent_token(); /* defined later in this file; lex() calls it after reading `%' */
41:
42: extern void fatals(); /* error reporter called with a printf-style format and one argument */
43: extern void fatal(); /* error reporter called with a plain message; NOTE(review): both are presumably non-returning -- confirm */
44:
45: /* Buffer for storing the current token.  Allocated by init_lex() and
   enlarged by grow_token_buffer() when a token does not fit. */
46: char *token_buffer;
47:
48: /* Allocated size of token_buffer, not including space for terminator.
   Starts at 100 and is doubled each time the buffer is grown. */
49: static int maxtoken;
50:
51: bucket *symval; /* symbol-table entry (from getsym) of the IDENTIFIER most recently returned by lex() */
52: int numval; /* value of the NUMBER most recently returned by lex() */
53:
54: static int unlexed; /* these two describe a token to be reread; unlexed is -1 when none is pending */
55: static bucket *unlexed_symval; /* by the next call to lex; holds the symval saved by unlex() */
56:
57:
58: void
59: init_lex()
60: {
61: maxtoken = 100;
62: token_buffer = NEW2 (maxtoken + 1, char);
63: unlexed = -1;
64: }
65:
66:
67: static char *
68: grow_token_buffer (p)
69: char *p;
70: {
71: int offset = p - token_buffer;
72: maxtoken *= 2;
73: token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
74: return token_buffer + offset;
75: }
76:
77:
78: int
79: skip_white_space()
80: {
81: register int c;
82: register int inside;
83:
84: c = getc(finput);
85:
86: for (;;)
87: {
88: int cplus_comment;
89:
90: switch (c)
91: {
92: case '/':
93: c = getc(finput);
94: if (c != '*' && c != '/')
95: fatals("unexpected `/%c' found",c);
96: cplus_comment = (c == '/');
97:
98: c = getc(finput);
99:
100: inside = 1;
101: while (inside)
102: {
103: if (!cplus_comment && c == '*')
104: {
105: while (c == '*')
106: c = getc(finput);
107:
108: if (c == '/')
109: {
110: inside = 0;
111: c = getc(finput);
112: }
113: }
114: else if (c == '\n')
115: {
116: lineno++;
117: if (cplus_comment)
118: inside = 0;
119: c = getc(finput);
120: }
121: else if (c == EOF)
122: fatal("unterminated comment");
123: else
124: c = getc(finput);
125: }
126:
127: break;
128:
129: case '\n':
130: lineno++;
131:
132: case ' ':
133: case '\t':
134: case '\f':
135: c = getc(finput);
136: break;
137:
138: default:
139: return (c);
140: }
141: }
142: }
143:
144:
145: void
146: unlex(token)
147: int token;
148: {
149: unlexed = token;
150: unlexed_symval = symval;
151: }
152:
153:
154:
155: int
156: lex()
157: {
158: register int c;
159: register char *p;
160:
161: if (unlexed >= 0)
162: {
163: symval = unlexed_symval;
164: c = unlexed;
165: unlexed = -1;
166: return (c);
167: }
168:
169: c = skip_white_space();
170:
171: switch (c)
172: {
173: case EOF:
174: return (ENDFILE);
175:
176: case 'A': case 'B': case 'C': case 'D': case 'E':
177: case 'F': case 'G': case 'H': case 'I': case 'J':
178: case 'K': case 'L': case 'M': case 'N': case 'O':
179: case 'P': case 'Q': case 'R': case 'S': case 'T':
180: case 'U': case 'V': case 'W': case 'X': case 'Y':
181: case 'Z':
182: case 'a': case 'b': case 'c': case 'd': case 'e':
183: case 'f': case 'g': case 'h': case 'i': case 'j':
184: case 'k': case 'l': case 'm': case 'n': case 'o':
185: case 'p': case 'q': case 'r': case 's': case 't':
186: case 'u': case 'v': case 'w': case 'x': case 'y':
187: case 'z':
188: case '.': case '_':
189: p = token_buffer;
190: while (isalnum(c) || c == '_' || c == '.')
191: {
192: if (p == token_buffer + maxtoken)
193: p = grow_token_buffer(p);
194:
195: *p++ = c;
196: c = getc(finput);
197: }
198:
199: *p = 0;
200: ungetc(c, finput);
201: symval = getsym(token_buffer);
202: return (IDENTIFIER);
203:
204: case '0': case '1': case '2': case '3': case '4':
205: case '5': case '6': case '7': case '8': case '9':
206: {
207: numval = 0;
208:
209: while (isdigit(c))
210: {
211: numval = numval*10 + c - '0';
212: c = getc(finput);
213: }
214: ungetc(c, finput);
215: return (NUMBER);
216: }
217:
218: case '\'':
219: translations = -1;
220:
221: /* parse the literal token and compute character code in code */
222:
223: c = getc(finput);
224: {
225: register int code = 0;
226:
227: if (c == '\\')
228: {
229: c = getc(finput);
230:
231: if (c <= '7' && c >= '0')
232: {
233: while (c <= '7' && c >= '0')
234: {
235: code = (code * 8) + (c - '0');
236: c = getc(finput);
237: if (code >= 256 || code < 0)
238: fatals("malformatted literal token `\\%03o'", code);
239: }
240: }
241: else
242: {
243: if (c == 't')
244: code = '\t';
245: else if (c == 'n')
246: code = '\n';
247: else if (c == 'a')
248: code = '\007';
249: else if (c == 'r')
250: code = '\r';
251: else if (c == 'f')
252: code = '\f';
253: else if (c == 'b')
254: code = '\b';
255: else if (c == 'v')
256: code = 013;
257: else if (c == 'x')
258: {
259: c = getc(finput);
260: while ((c <= '9' && c >= '0')
261: || (c >= 'a' && c <= 'z')
262: || (c >= 'A' && c <= 'Z'))
263: {
264: code *= 16;
265: if (c <= '9' && c >= '0')
266: code += c - '0';
267: else if (c >= 'a' && c <= 'z')
268: code += c - 'a' + 10;
269: else if (c >= 'A' && c <= 'Z')
270: code += c - 'A' + 10;
271: if (code >= 256 || code<0)/* JF this said if(c>=128) */
272: fatals("malformatted literal token `\\x%x'",code);
273: c = getc(finput);
274: }
275: ungetc(c, finput);
276: }
277: else if (c == '\\')
278: code = '\\';
279: else if (c == '\'')
280: code = '\'';
281: else if (c == '\"') /* JF this is a good idea */
282: code = '\"';
283: else
284: {
285: if (c >= 040 && c <= 0177)
286: fatals ("unknown escape sequence `\\%c'", c);
287: else
288: fatals ("unknown escape sequence: `\\' followed by char code 0x%x", c);
289: }
290:
291: c = getc(finput);
292: }
293: }
294: else
295: {
296: code = c;
297: c = getc(finput);
298: }
299: if (c != '\'')
300: fatal("multicharacter literal tokens not supported");
301:
302: /* now fill token_buffer with the canonical name for this character
303: as a literal token. Do not use what the user typed,
304: so that '\012' and '\n' can be interchangeable. */
305:
306: p = token_buffer;
307: *p++ = '\'';
308: if (code == '\\')
309: {
310: *p++ = '\\';
311: *p++ = '\\';
312: }
313: else if (code == '\'')
314: {
315: *p++ = '\\';
316: *p++ = '\'';
317: }
318: else if (code >= 040 && code != 0177)
319: *p++ = code;
320: else if (code == '\t')
321: {
322: *p++ = '\\';
323: *p++ = 't';
324: }
325: else if (code == '\n')
326: {
327: *p++ = '\\';
328: *p++ = 'n';
329: }
330: else if (code == '\r')
331: {
332: *p++ = '\\';
333: *p++ = 'r';
334: }
335: else if (code == '\v')
336: {
337: *p++ = '\\';
338: *p++ = 'v';
339: }
340: else if (code == '\b')
341: {
342: *p++ = '\\';
343: *p++ = 'b';
344: }
345: else if (code == '\f')
346: {
347: *p++ = '\\';
348: *p++ = 'f';
349: }
350: else
351: {
352: *p++ = code / 0100 + '0';
353: *p++ = ((code / 010) & 07) + '0';
354: *p++ = (code & 07) + '0';
355: }
356: *p++ = '\'';
357: *p = 0;
358: symval = getsym(token_buffer);
359: symval->class = STOKEN;
360: if (! symval->user_token_number)
361: symval->user_token_number = code;
362: return (IDENTIFIER);
363: }
364:
365: case ',':
366: return (COMMA);
367:
368: case ':':
369: return (COLON);
370:
371: case ';':
372: return (SEMICOLON);
373:
374: case '|':
375: return (BAR);
376:
377: case '{':
378: return (LEFT_CURLY);
379:
380: case '=':
381: do
382: {
383: c = getc(finput);
384: if (c == '\n') lineno++;
385: }
386: while(c==' ' || c=='\n' || c=='\t');
387:
388: if (c == '{')
389: return(LEFT_CURLY);
390: else
391: {
392: ungetc(c, finput);
393: return(ILLEGAL);
394: }
395:
396: case '<':
397: p = token_buffer;
398: c = getc(finput);
399: while (c != '>')
400: {
401: if (c == '\n' || c == EOF)
402: fatal("unterminated type name");
403:
404: if (p == token_buffer + maxtoken)
405: p = grow_token_buffer(p);
406:
407: *p++ = c;
408: c = getc(finput);
409: }
410: *p = 0;
411: return (TYPENAME);
412:
413:
414: case '%':
415: return (parse_percent_token());
416:
417: default:
418: return (ILLEGAL);
419: }
420: }
421:
422:
423: /* parse a token which starts with %. Assumes the % has already been read and discarded. */
424:
425: int
426: parse_percent_token ()
427: {
428: register int c;
429: register char *p;
430:
431: p = token_buffer;
432: c = getc(finput);
433:
434: switch (c)
435: {
436: case '%':
437: return (TWO_PERCENTS);
438:
439: case '{':
440: return (PERCENT_LEFT_CURLY);
441:
442: case '<':
443: return (LEFT);
444:
445: case '>':
446: return (RIGHT);
447:
448: case '2':
449: return (NONASSOC);
450:
451: case '0':
452: return (TOKEN);
453:
454: case '=':
455: return (PREC);
456: }
457: if (!isalpha(c))
458: return (ILLEGAL);
459:
460: while (isalpha(c) || c == '_')
461: {
462: if (p == token_buffer + maxtoken)
463: p = grow_token_buffer(p);
464:
465: *p++ = c;
466: c = getc(finput);
467: }
468:
469: ungetc(c, finput);
470:
471: *p = 0;
472:
473: if (strcmp(token_buffer, "token") == 0
474: ||
475: strcmp(token_buffer, "term") == 0)
476: return (TOKEN);
477: else if (strcmp(token_buffer, "nterm") == 0)
478: return (NTERM);
479: else if (strcmp(token_buffer, "type") == 0)
480: return (TYPE);
481: else if (strcmp(token_buffer, "guard") == 0)
482: return (GUARD);
483: else if (strcmp(token_buffer, "union") == 0)
484: return (UNION);
485: else if (strcmp(token_buffer, "expect") == 0)
486: return (EXPECT);
487: else if (strcmp(token_buffer, "start") == 0)
488: return (START);
489: else if (strcmp(token_buffer, "left") == 0)
490: return (LEFT);
491: else if (strcmp(token_buffer, "right") == 0)
492: return (RIGHT);
493: else if (strcmp(token_buffer, "nonassoc") == 0
494: ||
495: strcmp(token_buffer, "binary") == 0)
496: return (NONASSOC);
497: else if (strcmp(token_buffer, "semantic_parser") == 0)
498: return (SEMANTIC_PARSER);
499: else if (strcmp(token_buffer, "pure_parser") == 0)
500: return (PURE_PARSER);
501: else if (strcmp(token_buffer, "prec") == 0)
502: return (PREC);
503: else return (ILLEGAL);
504: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.