Annotation of 43BSD/contrib/icon/tran/lex.c, revision 1.1.1.1

1.1       root        1: /*
                      2:  * The lexical analyzer.
                      3:  */
                      4: 
                      5: #include "itran.h"
                      6: #include "token.h"
                      7: #include "lex.h"
                      8: #include "char.h"
                      9: #include "tree.h"
                     10: 
                     /*
                      * Source position reported for the token most recently produced by
                      *  yylex.  yylex copies the input line and column into these when it
                      *  reaches a token; for an inserted semicolon it reports the line of
                      *  the previous token and column 0.
                      */
                     int tline;      /* line of the current token */
                     int tcol;       /* column of the current token */
                     13: 
                     14: /*
                     15:  * yylex - find the next token in the input stream, and return its token
                     16:  *  type and value to the parser.
                     17:  *
                     18:  * Variables of interest:
                     19:  *
                     20:  *  cc - character following last token.
                     21:  *  comflag - set if in a comment.
                     22:  *  nlflag - set if a newline was between the last token and the current token
                     23:  *  lastend - set if the last token was an ENDER.
                     24:  *  lastval - when a semicolon is inserted and returned, lastval gets the
                     25:  *   token value that would have been returned if the semicolon hadn't
                     26:  *   been inserted.
                     27:  */
                     28: 
                     29: yylex()
                     30:    {
                     31:    register struct toktab *t;
                     32:    register int c;
                     33:    int nlflag;
                     34:    int comflag;
                     35:    static struct toktab *lasttok = NULL;
                     36:    static nodeptr lastval;
                     37:    static int lastend = 0;
                     38:    static int eofflag = 0;
                     39:    static int lastline = 0;
                     40:    static int cc = '\n';
                     41:    extern struct toktab *getident(), *getnum(), *getstring(), *getop();
                     42: 
                     43:    if (lasttok != NULL) {
                     44:       /*
                     45:        * A semicolon was inserted and returned on the last call to yylex,
                     46:        *  instead of going to the input, return lasttok and set the
                     47:        *  appropriate variables.
                     48:        */
                     49:       yylval = lastval;
                     50:       tline = LINE(lastval);
                     51:       tcol = COL(lastval);
                     52:       t = lasttok;
                     53:       goto ret;
                     54:       }
                     55:    nlflag = 0;
                     56:    comflag = 0;
                     57: loop:
                     58:    c = cc;
                     59:    /*
                     60:     * Skip whitespace and comments.
                     61:     */
                     62:    while (c != EOF && (comflag || c == COMMENT || isspace(c))) {
                     63:       if (c == '\n') {
                     64:          nlflag++;
                     65:          comflag = 0;
                     66:          }
                     67:       else if (c == COMMENT)
                     68:          comflag++;
                     69:       c = NEXTCHAR;
                     70:       }
                     71:    /*
                     72:     * A token is the next thing in the input.  Record the last line number
                     73:     *  and set tline and tcol to the current line and column.
                     74:     */
                     75:    lastline = tline;
                     76:    tline = inline;
                     77:    tcol = incol;
                     78: 
                     79:    if (c == EOF) {
                     80:       /*
                     81:        * End of file has been reached.  Set eofflag, return T_EOF, and
                     82:        *  set cc to EOF so that any subsequent scans also return T_EOF.
                     83:        */
                     84:       if (eofflag++) {
                     85:          eofflag = 0;
                     86:          cc = '\n';
                     87:          return (int) (yylval = 0);
                     88:          }
                     89:       cc = EOF;
                     90:       t = T_EOF;
                     91:       yylval = 0;
                     92:       goto ret;
                     93:       }
                     94: 
                     95:    /*
                     96:     * Look at current input character to determine what class of token
                     97:     *  is next and take the appropriate action.  Note that the various
                     98:     *  token gathering routines write a value into cc.
                     99:     */
                    100:    c = ctran[c];
                    101:    if (isalpha(c)) {                    /* gather ident or reserved word */
                    102:       if ((t = getident(c, &cc)) == NULL)
                    103:          goto loop;
                    104:       }
                    105:    else if (isdigit(c)) {               /* gather numeric literal */
                    106:       if ((t = getnum(c, &cc)) == NULL)
                    107:          goto loop;
                    108:       }
                    109:    else if (c == '"' || c == '\'') {    /* gather string or cset literal */
                    110:       if ((t = getstring(c, &cc)) == NULL)
                    111:          goto loop;
                    112:       }
                    113:    else {                      /* gather longest legal operator */
                    114:       if ((t = getop(c, &cc)) == NULL)
                    115:          goto loop;
                    116:       yylval = OPNODE(t->t_type);
                    117:       }
                    118:    if (nlflag && lastend && (t->t_flags & BEGINNER)) {
                    119:       /*
                    120:        * A newline was encountered between the current token and the last,
                    121:        *  the last token was an ENDER, and the current token is a BEGINNER.
                    122:        *  Return a semicolon and save the current token in lastval.
                    123:        */
                    124:       lastval = yylval;
                    125:       lasttok = t;
                    126:       tline = lastline;
                    127:       tcol = 0;
                    128:       yylval = OPNODE(SEMICOL);
                    129:       return (SEMICOL);
                    130:       }
                    131: ret:
                    132:    /*
                    133:     * Clear lasttok, set lastend if the token being returned is an
                    134:     *  ENDER, and return the token.
                    135:     */
                    136:    lasttok = 0;
                    137:    lastend = t->t_flags & ENDER;
                    138:    return (t->t_type);
                    139:    }
                    140: 
                    141: /*
                    142:  * getident - gather an identifier beginning with ac.  The character
                    143:  *  following identifier goes in cc.
                    144:  */
                    145: 
                    146: struct toktab *getident(ac, cc)
                    147: char ac;
                    148: int *cc;
                    149:    {
                    150:    register c;
                    151:    register char *p;
                    152:    register struct toktab *t;
                    153:    extern char *putident();
                    154:    extern struct toktab *findres();
                    155: 
                    156:    c = ac;
                    157:    p = sfree;
                    158:    /*
                    159:     * Copy characters into string space until a non-alphanumeric character
                    160:     *  is found.
                    161:     */
                    162:    do {
                    163:       if (p >= send)
                    164:          syserr("out of string space");
                    165:       *p++ = c;
                    166:       c = ctran[NEXTCHAR];
                    167:       } while (isalnum(c));
                    168:    if (p >= send)
                    169:       syserr("out of string space");
                    170:    *p++ = 0;
                    171:    *cc = c;
                    172:    /*
                    173:     * If the identifier is a reserved word, make a RESNODE for it and return
                    174:     *  the token value.  Otherwise, install it with putident, make an
                    175:     *  IDNODE for it, and return.
                    176:     */
                    177:    if ((t = findres()) != NULL) {
                    178:       yylval = RESNODE(t->t_type);
                    179:       return (t);
                    180:       }
                    181:    else {
                    182:       yylval = IDNODE((int)putident(p-sfree));
                    183:       return (T_IDENT);
                    184:       }
                    185:    }
                    186: 
                    187: /*
                    188:  * findres - if the string just copied into the string space by getident
                    189:  *  is a reserved word, return a pointer to its entry in the token table.
                    190:  *  Return NULL if the string isn't a reserved word.
                    191:  */
                    192: 
                    193: struct toktab *findres()
                    194:    {
                    195:    register struct toktab *t;
                    196:    register char c, *p;
                    197: 
                    198:    p = sfree;
                    199:    c = *p;
                    200:    if (!islower(c))
                    201:       return (NULL);
                    202:    /*
                    203:     * Point t at first reserved word that starts with c (if any).
                    204:     */
                    205:    if ((t = restab[c - '_']) == NULL)
                    206:       return (NULL);
                    207:    /*
                    208:     * Search through reserved words, stopping when a match is found
                    209:     *  or when the current reserved word doesn't start with c.
                    210:     */
                    211:    while (t->t_word[0] == c) {
                    212:       if (strcmp(t->t_word, p) == 0)
                    213:          return (t);
                    214:       t++;
                    215:       }
                    216:    return (NULL);
                    217:    }
                    218: 
                    219: /*
                    220:  * getnum - gather a numeric literal starting with ac and put the
                    221:  *  character following the literal into *cc.
                    222:  */
                    223: 
                    224: struct toktab *getnum(ac, cc)
                    225: char ac;
                    226: int *cc;
                    227:    {
                    228:    register c;
                    229:    register r;
                    230:    register state;
                    231:    char *p;
                    232:    int realflag;
                    233:    extern char *putident();
                    234: 
                    235:    c = ac;
                    236:    r = tonum(c);
                    237:    p = sfree;
                    238:    state = 0;
                    239:    realflag = 0;
                    240:    for (;;) {
                    241:       if (p >= send)
                    242:          syserr("out of string space");
                    243:       *p++ = c;
                    244:       c = ctran[NEXTCHAR];
                    245:       switch (state) {
                    246:          case 0:               /* integer part */
                    247:             if (isdigit(c))         { r = r * 10 + tonum(c); continue; }
                    248:             if (c == '.')           { state = 1; realflag++; continue; }
                    249:             if (tolower(c) == 'e')  { state = 2; realflag++; continue; }
                    250:             if (tolower(c) == 'r')  {
                    251:                state = 5;
                    252:                if (r < 2 || r > 36)
                    253:                   err("invalid radix for integer literal", 0);
                    254:                continue;
                    255:                }
                    256:             break;
                    257:          case 1:               /* fractional part */
                    258:             if (isdigit(c))   continue;
                    259:             if (tolower(c) == 'e')   { state = 2; continue; }
                    260:             break;
                    261:          case 2:               /* optional exponent sign */
                    262:             if (c == '+' || c == '-') { state = 3; continue; }
                    263:          case 3:               /* first digit after e, e+, or e- */
                    264:             if (isdigit(c)) { state = 4; continue; }
                    265:             err("invalid real literal", 0);
                    266:             break;
                    267:          case 4:               /* remaining digits after e */
                    268:             if (isdigit(c))   continue;
                    269:             break;
                    270:          case 5:               /* first digit after r */
                    271:             if ((isdigit(c) || isletter(c)) && tonum(c) < r)
                    272:                { state = 6; continue; }
                    273:             err("invalid integer literal", 0);
                    274:             break;
                    275:          case 6:               /* remaining digits after r */
                    276:             if (isdigit(c) || isletter(c)) {
                    277:                if (tonum(c) >= r) {    /* illegal digit for radix r */
                    278:                   err("invalid digit in integer literal", 0);
                    279:                   r = tonum('z');      /* prevent more messages */
                    280:                   }
                    281:                continue;
                    282:                }
                    283:             break;
                    284:          }
                    285:       break;
                    286:       }
                    287:    if (p >= send)
                    288:       syserr("out of string space");
                    289:    *p++ = 0;
                    290:    *cc = c;
                    291:    if (realflag) {
                    292:       yylval = REALNODE((int)putident(p-sfree));
                    293:       return (T_REAL);
                    294:       }
                    295:    yylval = INTNODE((int)putident(p-sfree));
                    296:    return (T_INT);
                    297:    }
                    298: 
                    299: /*
                    300:  * getstring - gather a string literal starting with ac and place the
                    301:  *  character following the literal in *cc.
                    302:  */
                    303: 
                    304: struct toktab *getstring(ac, cc)
                    305: char ac;
                    306: int *cc;
                    307:    {
                    308:    register c, sc;
                    309:    register char *p;
                    310:    char *lc;
                    311:    extern char *putident();
                    312: 
                    313:    sc = c = ac;
                    314:    p = sfree;
                    315:    lc = 0;
                    316:    while ((c = NEXTCHAR) != sc && c != '\n' && c != EOF) {
                    317:    contin:
                    318:       if (c == '_')
                    319:          lc = p;
                    320:       else if (!isspace(c))
                    321:          lc = 0;
                    322:       if (ctran[c] == ESCAPE) {
                    323:          c = NEXTCHAR;
                    324:          if (isoctal(c))
                    325:             c = octesc(c);
                    326:          else if (ctran[c] == 'x')
                    327:             c = hexesc();
                    328:          else if (ctran[c] == '^')
                    329:             c = ctlesc();
                    330:          else
                    331:             c = esctab[c];
                    332:          if (c == EOF)
                    333:             goto noquote;
                    334:          }
                    335:       if (p >= send)
                    336:          syserr("out of string space");
                    337:       *p++ = c;
                    338:       }
                    339:    if (p >= send)
                    340:       syserr("out of string space");
                    341:    *p++ = 0;
                    342:    if (c == sc)
                    343:       *cc = ' ';
                    344:    else {
                    345:       if (c == '\n' && lc) {
                    346:          p = lc;
                    347:          while ((c = NEXTCHAR) != EOF && isspace(c)) ;
                    348:          if (c != EOF)
                    349:             goto contin;
                    350:          }
                    351: noquote:
                    352:       err("unclosed quote", 0);
                    353:       *cc = c;
                    354:       }
                    355:    if (ac == '"') {    /* a string literal */
                    356:       yylval = STRNODE((int)putident(p-sfree), p-sfree);
                    357:       return (T_STRING);
                    358:       }
                    359:    else {              /* a cset literal */
                    360:       yylval = CSETNODE((int)putident(p-sfree), p-sfree);
                    361:       return (T_CSET);
                    362:       }
                    363:    }
                    364: 
                    365: /*
                    366:  * ctlesc - translate a control escape -- backslash followed by
                    367:  *  caret and one character.
                    368:  */
                    369: 
                    370: ctlesc()
                    371:    {
                    372:    register c;
                    373: 
                    374:    c = NEXTCHAR;
                    375:    if (c == EOF)
                    376:       return (EOF);
                    377:    return (c & 037);
                    378:    }
                    379: 
                    380: /*
                    381:  * octesc - translate an octal escape -- backslash followed by
                    382:  *  one, two, or three octal digits.
                    383:  */
                    384: 
                    385: octesc(ac)
                    386: char ac;
                    387:    {
                    388:    register c, nc, i;
                    389: 
                    390:    c = 0;
                    391:    nc = ac;
                    392:    i = 1;
                    393:    do {
                    394:       c = (c << 3) | (nc - '0');
                    395:       nc = NEXTCHAR;
                    396:       if (nc == EOF)
                    397:          return (EOF);
                    398:       } while (isoctal(nc) && i++ < 3);
                    399:    PUSHCHAR(nc);
                    400:    return (c & 0377);
                    401:    }
                    402: 
                    403: /*
                    404:  * hexesc - translate a hexadecimal escape -- backslash-x
                    405:  *  followed by one or two hexadecimal digits.
                    406:  */
                    407: 
                    408: hexesc()
                    409:    {
                    410:    register c, nc, i;
                    411: 
                    412:    c = 0;
                    413:    i = 0;
                    414:    while (i++ < 2) {
                    415:       nc = NEXTCHAR;
                    416:       if (nc == EOF)
                    417:          return (EOF);
                    418:       if (nc >= 'a' && nc <= 'f')
                    419:          nc -= 'a' - 10;
                    420:       else if (nc >= 'A' && nc <= 'F')
                    421:          nc -= 'A' - 10;
                    422:       else if (isdigit(nc))
                    423:          nc -= '0';
                    424:       else {
                    425:          PUSHCHAR(nc);
                    426:          break;
                    427:          }
                    428:       c = (c << 4) | nc;
                    429:       }
                    430:    return (c);
                    431:    }
                    432: 
                    433: /*
                    434:  * getop - find the longest legal operator and return a pointer
                    435:  *  to its entry in the token table.  The tour describes the
                    436:  *  operator recognition process in detail.
                    437:  */
                    438: 
                    439: struct toktab *getop(ac, cc)
                    440: char ac;
                    441: int *cc;
                    442:    {
                    443:    register struct optab *state;
                    444:    register char c, i;
                    445: 
                    446:    state = state0;
                    447:    c = ac;
                    448:    for (;;) {
                    449:       while ((i = state->o_input) && c != i)
                    450:          state++;
                    451:       switch (state->o_action) {
                    452:          case A_GOTO:
                    453:             state = (struct optab *) state->o_val;
                    454:             c = ctran[NEXTCHAR];
                    455:             continue;
                    456:          case A_ERROR:
                    457:             err("invalid character", 0);
                    458:             *cc = ' ';
                    459:             return (NULL);
                    460:          case A_RETURN:
                    461:             *cc = c;
                    462:             return (struct toktab *) (state->o_val);
                    463:          case A_IMMRET:
                    464:             *cc = ' ';
                    465:             return (struct toktab *) (state->o_val);
                    466:          }
                    467:       }
                    468:    }
                    469: 
                    470: /*
                    471:  * nextchar - return the next character in the input.
                    472:  */
                    473: 
                    474: nextchar()
                    475:    {
                    476:    register char c;
                    477: 
                    478:    if (c = peekc) {
                    479:       peekc = 0;
                    480:       return (c);
                    481:       }
                    482:    c = getc(infile);
                    483:    switch (c) {
                    484:       case EOF:
                    485:          inline = 0;
                    486:          incol = 0;
                    487:          break;
                    488:       case '\n':
                    489:          inline++;
                    490:          incol = 0;
                    491:          break;
                    492:       case '\t':
                    493:          incol = (incol | 7) + 1;
                    494:          break;
                    495:       case '\b':
                    496:          if (incol)
                    497:             incol--;
                    498:          break;
                    499:       default:
                    500:          incol++;
                    501:       }
                    502:    return (c);
                    503:    }

unix.superglobalmegacorp.com

This archive runs on limited infrastructure, preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.