|
|
1.1 ! root 1: /* Token-reader for Bison's input parser, ! 2: Copyright (C) 1984, 1986, 1989 Free Software Foundation, Inc. ! 3: ! 4: This file is part of Bison, the GNU Compiler Compiler. ! 5: ! 6: Bison is free software; you can redistribute it and/or modify ! 7: it under the terms of the GNU General Public License as published by ! 8: the Free Software Foundation; either version 2, or (at your option) ! 9: any later version. ! 10: ! 11: Bison is distributed in the hope that it will be useful, ! 12: but WITHOUT ANY WARRANTY; without even the implied warranty of ! 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! 14: GNU General Public License for more details. ! 15: ! 16: You should have received a copy of the GNU General Public License ! 17: along with Bison; see the file COPYING. If not, write to ! 18: the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ ! 19: ! 20: ! 21: /* ! 22: lex() is the entry point. It is called from reader.c. ! 23: It returns one of the token-type codes defined in lex.h. ! 24: When an identifier is seen, the code IDENTIFIER is returned ! 25: and the name is looked up in the symbol table using symtab.c; ! 26: symval is set to a pointer to the entry found. */ ! 27: ! 28: #include <stdio.h> ! 29: #include <ctype.h> ! 30: #include "system.h" ! 31: #include "files.h" ! 32: #include "symtab.h" ! 33: #include "lex.h" ! 34: #include "new.h" ! 35: ! 36: ! 37: extern int lineno; ! 38: extern int translations; ! 39: ! 40: int parse_percent_token(); ! 41: ! 42: extern void fatals(); ! 43: extern void fatal(); ! 44: ! 45: /* Buffer for storing the current token. */ ! 46: char *token_buffer; ! 47: ! 48: /* Allocated size of token_buffer, not including space for terminator. */ ! 49: static int maxtoken; ! 50: ! 51: bucket *symval; ! 52: int numval; ! 53: ! 54: static int unlexed; /* these two describe a token to be reread */ ! 55: static bucket *unlexed_symval; /* by the next call to lex */ ! 56: ! 57: ! 58: void ! 59: init_lex() ! 60: { ! 61: maxtoken = 100; ! 62: token_buffer = NEW2 (maxtoken + 1, char); ! 63: unlexed = -1; ! 64: } ! 65: ! 66: ! 67: static char * ! 68: grow_token_buffer (p) ! 69: char *p; ! 70: { ! 71: int offset = p - token_buffer; ! 72: maxtoken *= 2; ! 73: token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1); ! 74: return token_buffer + offset; ! 75: } ! 76: ! 77: ! 78: int ! 79: skip_white_space() ! 80: { ! 81: register int c; ! 82: register int inside; ! 83: ! 84: c = getc(finput); ! 85: ! 86: for (;;) ! 87: { ! 88: int cplus_comment; ! 89: ! 90: switch (c) ! 91: { ! 92: case '/': ! 93: c = getc(finput); ! 94: if (c != '*' && c != '/') ! 95: fatals("unexpected `/%c' found",c); ! 96: cplus_comment = (c == '/'); ! 97: ! 98: c = getc(finput); ! 99: ! 100: inside = 1; ! 101: while (inside) ! 102: { ! 103: if (!cplus_comment && c == '*') ! 104: { ! 105: while (c == '*') ! 106: c = getc(finput); ! 107: ! 108: if (c == '/') ! 109: { ! 110: inside = 0; ! 111: c = getc(finput); ! 112: } ! 113: } ! 114: else if (c == '\n') ! 115: { ! 116: lineno++; ! 117: if (cplus_comment) ! 118: inside = 0; ! 119: c = getc(finput); ! 120: } ! 121: else if (c == EOF) ! 122: fatal("unterminated comment"); ! 123: else ! 124: c = getc(finput); ! 125: } ! 126: ! 127: break; ! 128: ! 129: case '\n': ! 130: lineno++; ! 131: ! 132: case ' ': ! 133: case '\t': ! 134: case '\f': ! 135: c = getc(finput); ! 136: break; ! 137: ! 138: default: ! 139: return (c); ! 140: } ! 141: } ! 142: } ! 143: ! 144: ! 145: void ! 146: unlex(token) ! 147: int token; ! 148: { ! 149: unlexed = token; ! 150: unlexed_symval = symval; ! 151: } ! 152: ! 153: ! 154: ! 155: int ! 156: lex() ! 157: { ! 158: register int c; ! 159: register char *p; ! 160: ! 161: if (unlexed >= 0) ! 162: { ! 163: symval = unlexed_symval; ! 164: c = unlexed; ! 165: unlexed = -1; ! 166: return (c); ! 167: } ! 168: ! 169: c = skip_white_space(); ! 170: ! 171: switch (c) ! 172: { ! 173: case EOF: ! 174: return (ENDFILE); ! 175: ! 176: case 'A': case 'B': case 'C': case 'D': case 'E': ! 177: case 'F': case 'G': case 'H': case 'I': case 'J': ! 178: case 'K': case 'L': case 'M': case 'N': case 'O': ! 179: case 'P': case 'Q': case 'R': case 'S': case 'T': ! 180: case 'U': case 'V': case 'W': case 'X': case 'Y': ! 181: case 'Z': ! 182: case 'a': case 'b': case 'c': case 'd': case 'e': ! 183: case 'f': case 'g': case 'h': case 'i': case 'j': ! 184: case 'k': case 'l': case 'm': case 'n': case 'o': ! 185: case 'p': case 'q': case 'r': case 's': case 't': ! 186: case 'u': case 'v': case 'w': case 'x': case 'y': ! 187: case 'z': ! 188: case '.': case '_': ! 189: p = token_buffer; ! 190: while (isalnum(c) || c == '_' || c == '.') ! 191: { ! 192: if (p == token_buffer + maxtoken) ! 193: p = grow_token_buffer(p); ! 194: ! 195: *p++ = c; ! 196: c = getc(finput); ! 197: } ! 198: ! 199: *p = 0; ! 200: ungetc(c, finput); ! 201: symval = getsym(token_buffer); ! 202: return (IDENTIFIER); ! 203: ! 204: case '0': case '1': case '2': case '3': case '4': ! 205: case '5': case '6': case '7': case '8': case '9': ! 206: { ! 207: numval = 0; ! 208: ! 209: while (isdigit(c)) ! 210: { ! 211: numval = numval*10 + c - '0'; ! 212: c = getc(finput); ! 213: } ! 214: ungetc(c, finput); ! 215: return (NUMBER); ! 216: } ! 217: ! 218: case '\'': ! 219: translations = -1; ! 220: ! 221: /* parse the literal token and compute character code in code */ ! 222: ! 223: c = getc(finput); ! 224: { ! 225: register int code = 0; ! 226: ! 227: if (c == '\\') ! 228: { ! 229: c = getc(finput); ! 230: ! 231: if (c <= '7' && c >= '0') ! 232: { ! 233: while (c <= '7' && c >= '0') ! 234: { ! 235: code = (code * 8) + (c - '0'); ! 236: c = getc(finput); ! 237: if (code >= 256 || code < 0) ! 238: fatals("malformatted literal token `\\%03o'", code); ! 239: } ! 240: } ! 241: else ! 242: { ! 243: if (c == 't') ! 244: code = '\t'; ! 245: else if (c == 'n') ! 246: code = '\n'; ! 247: else if (c == 'a') ! 248: code = '\007'; ! 249: else if (c == 'r') ! 250: code = '\r'; ! 251: else if (c == 'f') ! 252: code = '\f'; ! 253: else if (c == 'b') ! 254: code = '\b'; ! 255: else if (c == 'v') ! 256: code = 013; ! 257: else if (c == 'x') ! 258: { ! 259: c = getc(finput); ! 260: while ((c <= '9' && c >= '0') ! 261: || (c >= 'a' && c <= 'z') ! 262: || (c >= 'A' && c <= 'Z')) ! 263: { ! 264: code *= 16; ! 265: if (c <= '9' && c >= '0') ! 266: code += c - '0'; ! 267: else if (c >= 'a' && c <= 'z') ! 268: code += c - 'a' + 10; ! 269: else if (c >= 'A' && c <= 'Z') ! 270: code += c - 'A' + 10; ! 271: if (code >= 256 || code<0)/* JF this said if(c>=128) */ ! 272: fatals("malformatted literal token `\\x%x'",code); ! 273: c = getc(finput); ! 274: } ! 275: ungetc(c, finput); ! 276: } ! 277: else if (c == '\\') ! 278: code = '\\'; ! 279: else if (c == '\'') ! 280: code = '\''; ! 281: else if (c == '\"') /* JF this is a good idea */ ! 282: code = '\"'; ! 283: else ! 284: { ! 285: if (c >= 040 && c <= 0177) ! 286: fatals ("unknown escape sequence `\\%c'", c); ! 287: else ! 288: fatals ("unknown escape sequence: `\\' followed by char code 0x%x", c); ! 289: } ! 290: ! 291: c = getc(finput); ! 292: } ! 293: } ! 294: else ! 295: { ! 296: code = c; ! 297: c = getc(finput); ! 298: } ! 299: if (c != '\'') ! 300: fatal("multicharacter literal tokens not supported"); ! 301: ! 302: /* now fill token_buffer with the canonical name for this character ! 303: as a literal token. Do not use what the user typed, ! 304: so that '\012' and '\n' can be interchangeable. */ ! 305: ! 306: p = token_buffer; ! 307: *p++ = '\''; ! 308: if (code == '\\') ! 309: { ! 310: *p++ = '\\'; ! 311: *p++ = '\\'; ! 312: } ! 313: else if (code == '\'') ! 314: { ! 315: *p++ = '\\'; ! 316: *p++ = '\''; ! 317: } ! 318: else if (code >= 040 && code != 0177) ! 319: *p++ = code; ! 320: else if (code == '\t') ! 321: { ! 322: *p++ = '\\'; ! 323: *p++ = 't'; ! 324: } ! 325: else if (code == '\n') ! 326: { ! 327: *p++ = '\\'; ! 328: *p++ = 'n'; ! 329: } ! 330: else if (code == '\r') ! 331: { ! 332: *p++ = '\\'; ! 333: *p++ = 'r'; ! 334: } ! 335: else if (code == '\v') ! 336: { ! 337: *p++ = '\\'; ! 338: *p++ = 'v'; ! 339: } ! 340: else if (code == '\b') ! 341: { ! 342: *p++ = '\\'; ! 343: *p++ = 'b'; ! 344: } ! 345: else if (code == '\f') ! 346: { ! 347: *p++ = '\\'; ! 348: *p++ = 'f'; ! 349: } ! 350: else ! 351: { ! 352: *p++ = code / 0100 + '0'; ! 353: *p++ = ((code / 010) & 07) + '0'; ! 354: *p++ = (code & 07) + '0'; ! 355: } ! 356: *p++ = '\''; ! 357: *p = 0; ! 358: symval = getsym(token_buffer); ! 359: symval->class = STOKEN; ! 360: if (! symval->user_token_number) ! 361: symval->user_token_number = code; ! 362: return (IDENTIFIER); ! 363: } ! 364: ! 365: case ',': ! 366: return (COMMA); ! 367: ! 368: case ':': ! 369: return (COLON); ! 370: ! 371: case ';': ! 372: return (SEMICOLON); ! 373: ! 374: case '|': ! 375: return (BAR); ! 376: ! 377: case '{': ! 378: return (LEFT_CURLY); ! 379: ! 380: case '=': ! 381: do ! 382: { ! 383: c = getc(finput); ! 384: if (c == '\n') lineno++; ! 385: } ! 386: while(c==' ' || c=='\n' || c=='\t'); ! 387: ! 388: if (c == '{') ! 389: return(LEFT_CURLY); ! 390: else ! 391: { ! 392: ungetc(c, finput); ! 393: return(ILLEGAL); ! 394: } ! 395: ! 396: case '<': ! 397: p = token_buffer; ! 398: c = getc(finput); ! 399: while (c != '>') ! 400: { ! 401: if (c == '\n' || c == EOF) ! 402: fatal("unterminated type name"); ! 403: ! 404: if (p == token_buffer + maxtoken) ! 405: p = grow_token_buffer(p); ! 406: ! 407: *p++ = c; ! 408: c = getc(finput); ! 409: } ! 410: *p = 0; ! 411: return (TYPENAME); ! 412: ! 413: ! 414: case '%': ! 415: return (parse_percent_token()); ! 416: ! 417: default: ! 418: return (ILLEGAL); ! 419: } ! 420: } ! 421: ! 422: ! 423: /* parse a token which starts with %. Assumes the % has already been read and discarded. */ ! 424: ! 425: int ! 426: parse_percent_token () ! 427: { ! 428: register int c; ! 429: register char *p; ! 430: ! 431: p = token_buffer; ! 432: c = getc(finput); ! 433: ! 434: switch (c) ! 435: { ! 436: case '%': ! 437: return (TWO_PERCENTS); ! 438: ! 439: case '{': ! 440: return (PERCENT_LEFT_CURLY); ! 441: ! 442: case '<': ! 443: return (LEFT); ! 444: ! 445: case '>': ! 446: return (RIGHT); ! 447: ! 448: case '2': ! 449: return (NONASSOC); ! 450: ! 451: case '0': ! 452: return (TOKEN); ! 453: ! 454: case '=': ! 455: return (PREC); ! 456: } ! 457: if (!isalpha(c)) ! 458: return (ILLEGAL); ! 459: ! 460: while (isalpha(c) || c == '_') ! 461: { ! 462: if (p == token_buffer + maxtoken) ! 463: p = grow_token_buffer(p); ! 464: ! 465: *p++ = c; ! 466: c = getc(finput); ! 467: } ! 468: ! 469: ungetc(c, finput); ! 470: ! 471: *p = 0; ! 472: ! 473: if (strcmp(token_buffer, "token") == 0 ! 474: || ! 475: strcmp(token_buffer, "term") == 0) ! 476: return (TOKEN); ! 477: else if (strcmp(token_buffer, "nterm") == 0) ! 478: return (NTERM); ! 479: else if (strcmp(token_buffer, "type") == 0) ! 480: return (TYPE); ! 481: else if (strcmp(token_buffer, "guard") == 0) ! 482: return (GUARD); ! 483: else if (strcmp(token_buffer, "union") == 0) ! 484: return (UNION); ! 485: else if (strcmp(token_buffer, "expect") == 0) ! 486: return (EXPECT); ! 487: else if (strcmp(token_buffer, "start") == 0) ! 488: return (START); ! 489: else if (strcmp(token_buffer, "left") == 0) ! 490: return (LEFT); ! 491: else if (strcmp(token_buffer, "right") == 0) ! 492: return (RIGHT); ! 493: else if (strcmp(token_buffer, "nonassoc") == 0 ! 494: || ! 495: strcmp(token_buffer, "binary") == 0) ! 496: return (NONASSOC); ! 497: else if (strcmp(token_buffer, "semantic_parser") == 0) ! 498: return (SEMANTIC_PARSER); ! 499: else if (strcmp(token_buffer, "pure_parser") == 0) ! 500: return (PURE_PARSER); ! 501: else if (strcmp(token_buffer, "prec") == 0) ! 502: return (PREC); ! 503: else return (ILLEGAL); ! 504: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.