GNUtools/bison/lex.c - annotate

Return to lex.c CVS log
Up to [Apple XNU] / GNUtools / bison
Annotation of GNUtools/bison/lex.c, revision 1.1.1.1

1.1       root        1: /* Token-reader for Bison's input parser,
                      2:    Copyright (C) 1984, 1986, 1989 Free Software Foundation, Inc.
                      3: 
                      4: This file is part of Bison, the GNU Compiler Compiler.
                      5: 
                      6: Bison is free software; you can redistribute it and/or modify
                      7: it under the terms of the GNU General Public License as published by
                      8: the Free Software Foundation; either version 2, or (at your option)
                      9: any later version.
                     10: 
                     11: Bison is distributed in the hope that it will be useful,
                     12: but WITHOUT ANY WARRANTY; without even the implied warranty of
                     13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     14: GNU General Public License for more details.
                     15: 
                     16: You should have received a copy of the GNU General Public License
                     17: along with Bison; see the file COPYING.  If not, write to
                     18: the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
                     19: 
                     20: 
                     21: /* 
                     22:    lex() is the entry point.  It is called from reader.c.
                     23:    It returns one of the token-type codes defined in lex.h.
                     24:    When an identifier is seen, the code IDENTIFIER is returned
                     25:    and the name is looked up in the symbol table using symtab.c;
                     26:    symval is set to a pointer to the entry found.  */
                     27: 
                     28: #include <stdio.h>
                     29: #include <ctype.h>
                     30: #include "system.h"
                     31: #include "files.h"
                     32: #include "symtab.h"
                     33: #include "lex.h"
                     34: #include "new.h"
                     35: 
                     36: 
                     37: extern int lineno;
                     38: extern int translations;
                     39: 
                     40: int parse_percent_token();
                     41: 
                     42: extern void fatals();
                     43: extern void fatal();
                     44: 
                     45: /* Buffer for storing the current token.  */
                     46: char *token_buffer;
                     47: 
                     48: /* Allocated size of token_buffer, not including space for terminator.  */
                     49: static int maxtoken;
                     50: 
                     51: bucket *symval;
                     52: int numval;
                     53: 
                     54: static int unlexed;            /* these two describe a token to be reread */
                     55: static bucket *unlexed_symval; /* by the next call to lex */
                     56: 
                     57: 
                     58: void
                     59: init_lex()
                     60: {
                     61:   maxtoken = 100;
                     62:   token_buffer = NEW2 (maxtoken + 1, char);
                     63:   unlexed = -1;
                     64: }
                     65: 
                     66: 
                     67: static char *
                     68: grow_token_buffer (p)
                     69:      char *p;
                     70: {
                     71:   int offset = p - token_buffer;
                     72:   maxtoken *= 2;
                     73:   token_buffer = (char *) xrealloc(token_buffer, maxtoken + 1);
                     74:   return token_buffer + offset;
                     75: }
                     76: 
                     77: 
                     78: int
                     79: skip_white_space()
                     80: {
                     81:   register int c;
                     82:   register int inside;
                     83: 
                     84:   c = getc(finput);
                     85: 
                     86:   for (;;)
                     87:     {
                     88:       int cplus_comment;
                     89: 
                     90:       switch (c)
                     91:        {
                     92:        case '/':
                     93:          c = getc(finput);
                     94:          if (c != '*' && c != '/')
                     95:            fatals("unexpected `/%c' found",c);
                     96:          cplus_comment = (c == '/');
                     97: 
                     98:          c = getc(finput);
                     99: 
                    100:          inside = 1;
                    101:          while (inside)
                    102:            {
                    103:              if (!cplus_comment && c == '*')
                    104:                {
                    105:                  while (c == '*')
                    106:                    c = getc(finput);
                    107: 
                    108:                  if (c == '/')
                    109:                    {
                    110:                      inside = 0;
                    111:                      c = getc(finput);
                    112:                    }
                    113:                }
                    114:              else if (c == '\n')
                    115:                {
                    116:                  lineno++;
                    117:                  if (cplus_comment)
                    118:                    inside = 0;
                    119:                  c = getc(finput);
                    120:                }
                    121:              else if (c == EOF)
                    122:                fatal("unterminated comment");
                    123:              else
                    124:                c = getc(finput);
                    125:            }
                    126: 
                    127:          break;
                    128: 
                    129:        case '\n':
                    130:          lineno++;
                    131: 
                    132:        case ' ':
                    133:        case '\t':
                    134:        case '\f':
                    135:          c = getc(finput);
                    136:          break;
                    137: 
                    138:        default:
                    139:          return (c);
                    140:        }
                    141:     }
                    142: }
                    143: 
                    144: 
                    145: void
                    146: unlex(token)
                    147: int token;
                    148: {
                    149:   unlexed = token;
                    150:   unlexed_symval = symval;
                    151: }
                    152: 
                    153: 
                    154: 
                    155: int
                    156: lex()
                    157: {
                    158:   register int c;
                    159:   register char *p;
                    160: 
                    161:   if (unlexed >= 0)
                    162:     {
                    163:       symval = unlexed_symval;
                    164:       c = unlexed;
                    165:       unlexed = -1;
                    166:       return (c);
                    167:     }
                    168: 
                    169:   c = skip_white_space();
                    170: 
                    171:   switch (c)
                    172:     {
                    173:     case EOF:
                    174:       return (ENDFILE);
                    175: 
                    176:     case 'A':  case 'B':  case 'C':  case 'D':  case 'E':
                    177:     case 'F':  case 'G':  case 'H':  case 'I':  case 'J':
                    178:     case 'K':  case 'L':  case 'M':  case 'N':  case 'O':
                    179:     case 'P':  case 'Q':  case 'R':  case 'S':  case 'T':
                    180:     case 'U':  case 'V':  case 'W':  case 'X':  case 'Y':
                    181:     case 'Z':
                    182:     case 'a':  case 'b':  case 'c':  case 'd':  case 'e':
                    183:     case 'f':  case 'g':  case 'h':  case 'i':  case 'j':
                    184:     case 'k':  case 'l':  case 'm':  case 'n':  case 'o':
                    185:     case 'p':  case 'q':  case 'r':  case 's':  case 't':
                    186:     case 'u':  case 'v':  case 'w':  case 'x':  case 'y':
                    187:     case 'z':
                    188:     case '.':  case '_':
                    189:       p = token_buffer;
                    190:       while (isalnum(c) || c == '_' || c == '.')
                    191:        {
                    192:          if (p == token_buffer + maxtoken)
                    193:            p = grow_token_buffer(p);
                    194: 
                    195:          *p++ = c;
                    196:          c = getc(finput);
                    197:        }
                    198: 
                    199:       *p = 0;
                    200:       ungetc(c, finput);
                    201:       symval = getsym(token_buffer);
                    202:       return (IDENTIFIER);
                    203: 
                    204:     case '0':  case '1':  case '2':  case '3':  case '4':
                    205:     case '5':  case '6':  case '7':  case '8':  case '9':
                    206:       {
                    207:        numval = 0;
                    208: 
                    209:        while (isdigit(c))
                    210:          {
                    211:            numval = numval*10 + c - '0';
                    212:            c = getc(finput);
                    213:          }
                    214:        ungetc(c, finput);
                    215:        return (NUMBER);
                    216:       }
                    217: 
                    218:     case '\'':
                    219:       translations = -1;
                    220: 
                    221:       /* parse the literal token and compute character code in  code  */
                    222: 
                    223:       c = getc(finput);
                    224:       {
                    225:        register int code = 0;
                    226: 
                    227:        if (c == '\\')
                    228:          {
                    229:            c = getc(finput);
                    230: 
                    231:            if (c <= '7' && c >= '0')
                    232:              {
                    233:                while (c <= '7' && c >= '0')
                    234:                  {
                    235:                    code = (code * 8) + (c - '0');
                    236:                    c = getc(finput);
                    237:                    if (code >= 256 || code < 0)
                    238:                      fatals("malformatted literal token `\\%03o'", code);
                    239:                  }
                    240:              }
                    241:            else
                    242:              {
                    243:                if (c == 't')
                    244:                  code = '\t';
                    245:                else if (c == 'n')
                    246:                  code = '\n';
                    247:                else if (c == 'a')
                    248:                  code = '\007';
                    249:                else if (c == 'r')
                    250:                  code = '\r';
                    251:                else if (c == 'f')
                    252:                  code = '\f';
                    253:                else if (c == 'b')
                    254:                  code = '\b';
                    255:                else if (c == 'v')
                    256:                  code = 013;
                    257:                else if (c == 'x')
                    258:                  {
                    259:                    c = getc(finput);
                    260:                    while ((c <= '9' && c >= '0')
                    261:                           || (c >= 'a' && c <= 'z')
                    262:                           || (c >= 'A' && c <= 'Z'))
                    263:                      {
                    264:                        code *= 16;
                    265:                        if (c <= '9' && c >= '0')
                    266:                          code += c - '0';
                    267:                        else if (c >= 'a' && c <= 'z')
                    268:                          code += c - 'a' + 10;
                    269:                        else if (c >= 'A' && c <= 'Z')
                    270:                          code += c - 'A' + 10;
                    271:                        if (code >= 256 || code<0)/* JF this said if(c>=128) */
                    272:                          fatals("malformatted literal token `\\x%x'",code);
                    273:                        c = getc(finput);
                    274:                      }
                    275:                    ungetc(c, finput);
                    276:                  }
                    277:                else if (c == '\\')
                    278:                  code = '\\';
                    279:                else if (c == '\'')
                    280:                  code = '\'';
                    281:                else if (c == '\"')     /* JF this is a good idea */
                    282:                  code = '\"';
                    283:                else
                    284:                  {
                    285:                    if (c >= 040 && c <= 0177)
                    286:                      fatals ("unknown escape sequence `\\%c'", c);
                    287:                    else
                    288:                      fatals ("unknown escape sequence: `\\' followed by char code 0x%x", c);
                    289:                  }
                    290: 
                    291:                c = getc(finput);
                    292:              }
                    293:          }
                    294:        else
                    295:          {
                    296:            code = c;
                    297:            c = getc(finput);
                    298:          }
                    299:        if (c != '\'')
                    300:          fatal("multicharacter literal tokens not supported");
                    301: 
                    302:        /* now fill token_buffer with the canonical name for this character
                    303:           as a literal token.  Do not use what the user typed,
                    304:           so that '\012' and '\n' can be interchangeable.  */
                    305: 
                    306:        p = token_buffer;
                    307:        *p++ = '\'';
                    308:        if (code == '\\')
                    309:          {
                    310:            *p++ = '\\';
                    311:            *p++ = '\\';
                    312:          }
                    313:        else if (code == '\'')
                    314:          {
                    315:            *p++ = '\\';
                    316:            *p++ = '\'';
                    317:          }
                    318:        else if (code >= 040 && code != 0177)
                    319:          *p++ = code;
                    320:        else if (code == '\t')
                    321:          {
                    322:            *p++ = '\\';
                    323:            *p++ = 't';
                    324:          }
                    325:        else if (code == '\n')
                    326:          {
                    327:            *p++ = '\\';
                    328:            *p++ = 'n';
                    329:          }
                    330:        else if (code == '\r')
                    331:          {
                    332:            *p++ = '\\';
                    333:            *p++ = 'r';
                    334:          }
                    335:        else if (code == '\v')
                    336:          {
                    337:            *p++ = '\\';
                    338:            *p++ = 'v';
                    339:          }
                    340:        else if (code == '\b')
                    341:          {
                    342:            *p++ = '\\';
                    343:            *p++ = 'b';
                    344:          }
                    345:        else if (code == '\f')
                    346:          {
                    347:            *p++ = '\\';
                    348:            *p++ = 'f';
                    349:          }
                    350:         else
                    351:          {
                    352:            *p++ = code / 0100 + '0';
                    353:            *p++ = ((code / 010) & 07) + '0';
                    354:            *p++ = (code & 07) + '0';
                    355:          }
                    356:        *p++ = '\'';
                    357:        *p = 0;
                    358:        symval = getsym(token_buffer);
                    359:        symval->class = STOKEN;
                    360:        if (! symval->user_token_number)
                    361:          symval->user_token_number = code;
                    362:        return (IDENTIFIER);
                    363:       }
                    364: 
                    365:     case ',':
                    366:       return (COMMA);
                    367: 
                    368:     case ':':
                    369:       return (COLON);
                    370: 
                    371:     case ';':
                    372:       return (SEMICOLON);
                    373: 
                    374:     case '|':
                    375:       return (BAR);
                    376: 
                    377:     case '{':
                    378:       return (LEFT_CURLY);
                    379: 
                    380:     case '=':
                    381:       do
                    382:        {
                    383:          c = getc(finput);
                    384:          if (c == '\n') lineno++;
                    385:        }
                    386:       while(c==' ' || c=='\n' || c=='\t');
                    387: 
                    388:       if (c == '{')
                    389:        return(LEFT_CURLY);
                    390:       else
                    391:        {
                    392:          ungetc(c, finput);
                    393:          return(ILLEGAL);
                    394:        }
                    395: 
                    396:     case '<':
                    397:       p = token_buffer;
                    398:       c = getc(finput);
                    399:       while (c != '>')
                    400:        {
                    401:          if (c == '\n' || c == EOF)
                    402:            fatal("unterminated type name");
                    403: 
                    404:          if (p == token_buffer + maxtoken)
                    405:            p = grow_token_buffer(p);
                    406: 
                    407:          *p++ = c;
                    408:          c = getc(finput);
                    409:        }
                    410:       *p = 0;
                    411:       return (TYPENAME);
                    412:            
                    413: 
                    414:     case '%':
                    415:       return (parse_percent_token());
                    416: 
                    417:     default:
                    418:       return (ILLEGAL);
                    419:     }
                    420: }
                    421: 
                    422: 
                    423: /* parse a token which starts with %.  Assumes the % has already been read and discarded.  */
                    424: 
                    425: int
                    426: parse_percent_token ()
                    427: {
                    428:   register int c;
                    429:   register char *p;
                    430: 
                    431:   p = token_buffer;
                    432:   c = getc(finput);
                    433: 
                    434:   switch (c)
                    435:     {
                    436:     case '%':
                    437:       return (TWO_PERCENTS);
                    438: 
                    439:     case '{':
                    440:       return (PERCENT_LEFT_CURLY);
                    441: 
                    442:     case '<':
                    443:       return (LEFT);
                    444: 
                    445:     case '>':
                    446:       return (RIGHT);
                    447: 
                    448:     case '2':
                    449:       return (NONASSOC);
                    450: 
                    451:     case '0':
                    452:       return (TOKEN);
                    453: 
                    454:     case '=':
                    455:       return (PREC);
                    456:     }
                    457:   if (!isalpha(c))
                    458:     return (ILLEGAL);
                    459: 
                    460:   while (isalpha(c) || c == '_')
                    461:     {
                    462:       if (p == token_buffer + maxtoken)
                    463:        p = grow_token_buffer(p);
                    464: 
                    465:       *p++ = c;
                    466:       c = getc(finput);
                    467:     }
                    468: 
                    469:   ungetc(c, finput);
                    470: 
                    471:   *p = 0;
                    472: 
                    473:   if (strcmp(token_buffer, "token") == 0
                    474:       ||
                    475:       strcmp(token_buffer, "term") == 0)
                    476:     return (TOKEN);
                    477:   else if (strcmp(token_buffer, "nterm") == 0)
                    478:     return (NTERM);
                    479:   else if (strcmp(token_buffer, "type") == 0)
                    480:     return (TYPE);
                    481:   else if (strcmp(token_buffer, "guard") == 0)
                    482:     return (GUARD);
                    483:   else if (strcmp(token_buffer, "union") == 0)
                    484:     return (UNION);
                    485:   else if (strcmp(token_buffer, "expect") == 0)
                    486:     return (EXPECT);
                    487:   else if (strcmp(token_buffer, "start") == 0)
                    488:     return (START);
                    489:   else if (strcmp(token_buffer, "left") == 0)
                    490:     return (LEFT);
                    491:   else if (strcmp(token_buffer, "right") == 0)
                    492:     return (RIGHT);
                    493:   else if (strcmp(token_buffer, "nonassoc") == 0
                    494:           ||
                    495:           strcmp(token_buffer, "binary") == 0)
                    496:     return (NONASSOC);
                    497:   else if (strcmp(token_buffer, "semantic_parser") == 0)
                    498:     return (SEMANTIC_PARSER);
                    499:   else if (strcmp(token_buffer, "pure_parser") == 0)
                    500:     return (PURE_PARSER);
                    501:   else if (strcmp(token_buffer, "prec") == 0)
                    502:     return (PREC);
                    503:   else return (ILLEGAL);
                    504: }
unix.superglobalmegacorp.com
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.