File:  [Qemu by Fabrice Bellard] / qemu / json-lexer.c
Revision 1.1.1.5 (vendor branch): download - view: text, annotated - select for diffs
Tue Apr 24 19:34:07 2018 UTC (23 months, 1 week ago) by root
Branches: qemu, MAIN
CVS tags: qemu1101, HEAD
qemu 1.1.1

    1: /*
    2:  * JSON lexer
    3:  *
    4:  * Copyright IBM, Corp. 2009
    5:  *
    6:  * Authors:
    7:  *  Anthony Liguori   <aliguori@us.ibm.com>
    8:  *
    9:  * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
   10:  * See the COPYING.LIB file in the top-level directory.
   11:  *
   12:  */
   13: 
   14: #include "qstring.h"
   15: #include "qlist.h"
   16: #include "qdict.h"
   17: #include "qint.h"
   18: #include "qemu-common.h"
   19: #include "json-lexer.h"
   20: 
   21: #define MAX_TOKEN_SIZE (64ULL << 20)
   22: 
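/*
 * Tokens recognized by the lexer, written as informal regular expressions:
 *
 * double-quoted string: "([^\\"]|\\["'\\/bfnrt]|\\u[0-9a-fA-F]{4})*"
 * single-quoted string: '([^\\']|\\["'\\/bfnrt]|\\u[0-9a-fA-F]{4})*'
 * number:               -?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?
 * operator:             [{}\[\],:]
 * keyword:              [a-z]+
 * interpolation escape: %(d|i|p|s|f|ld|lld|I64d)
 *
 * An exponent is not accepted directly after a bare 0 (a fraction must
 * come first).  Inside strings the bytes 0x00, 0xC0, 0xC1 and 0xF5..0xFF
 * are rejected.  Runs of whitespace ([ \t\r\n]+) between tokens are
 * skipped.
 */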
   31: 
   32: enum json_lexer_state {
   33:     IN_ERROR = 0,
   34:     IN_DQ_UCODE3,
   35:     IN_DQ_UCODE2,
   36:     IN_DQ_UCODE1,
   37:     IN_DQ_UCODE0,
   38:     IN_DQ_STRING_ESCAPE,
   39:     IN_DQ_STRING,
   40:     IN_SQ_UCODE3,
   41:     IN_SQ_UCODE2,
   42:     IN_SQ_UCODE1,
   43:     IN_SQ_UCODE0,
   44:     IN_SQ_STRING_ESCAPE,
   45:     IN_SQ_STRING,
   46:     IN_ZERO,
   47:     IN_DIGITS,
   48:     IN_DIGIT,
   49:     IN_EXP_E,
   50:     IN_MANTISSA,
   51:     IN_MANTISSA_DIGITS,
   52:     IN_NONZERO_NUMBER,
   53:     IN_NEG_NONZERO_NUMBER,
   54:     IN_KEYWORD,
   55:     IN_ESCAPE,
   56:     IN_ESCAPE_L,
   57:     IN_ESCAPE_LL,
   58:     IN_ESCAPE_I,
   59:     IN_ESCAPE_I6,
   60:     IN_ESCAPE_I64,
   61:     IN_WHITESPACE,
   62:     IN_START,
   63: };
   64: 
   65: #define TERMINAL(state) [0 ... 0x7F] = (state)
   66: 
   67: /* Return whether TERMINAL is a terminal state and the transition to it
   68:    from OLD_STATE required lookahead.  This happens whenever the table
   69:    below uses the TERMINAL macro.  */
   70: #define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
   71:             (json_lexer[(old_state)][0] == (terminal))
   72: 
   73: static const uint8_t json_lexer[][256] =  {
   74:     /* double quote string */
   75:     [IN_DQ_UCODE3] = {
   76:         ['0' ... '9'] = IN_DQ_STRING,
   77:         ['a' ... 'f'] = IN_DQ_STRING,
   78:         ['A' ... 'F'] = IN_DQ_STRING,
   79:     },
   80:     [IN_DQ_UCODE2] = {
   81:         ['0' ... '9'] = IN_DQ_UCODE3,
   82:         ['a' ... 'f'] = IN_DQ_UCODE3,
   83:         ['A' ... 'F'] = IN_DQ_UCODE3,
   84:     },
   85:     [IN_DQ_UCODE1] = {
   86:         ['0' ... '9'] = IN_DQ_UCODE2,
   87:         ['a' ... 'f'] = IN_DQ_UCODE2,
   88:         ['A' ... 'F'] = IN_DQ_UCODE2,
   89:     },
   90:     [IN_DQ_UCODE0] = {
   91:         ['0' ... '9'] = IN_DQ_UCODE1,
   92:         ['a' ... 'f'] = IN_DQ_UCODE1,
   93:         ['A' ... 'F'] = IN_DQ_UCODE1,
   94:     },
   95:     [IN_DQ_STRING_ESCAPE] = {
   96:         ['b'] = IN_DQ_STRING,
   97:         ['f'] =  IN_DQ_STRING,
   98:         ['n'] =  IN_DQ_STRING,
   99:         ['r'] =  IN_DQ_STRING,
  100:         ['t'] =  IN_DQ_STRING,
  101:         ['/'] = IN_DQ_STRING,
  102:         ['\\'] = IN_DQ_STRING,
  103:         ['\''] = IN_DQ_STRING,
  104:         ['\"'] = IN_DQ_STRING,
  105:         ['u'] = IN_DQ_UCODE0,
  106:     },
  107:     [IN_DQ_STRING] = {
  108:         [1 ... 0xBF] = IN_DQ_STRING,
  109:         [0xC2 ... 0xF4] = IN_DQ_STRING,
  110:         ['\\'] = IN_DQ_STRING_ESCAPE,
  111:         ['"'] = JSON_STRING,
  112:     },
  113: 
  114:     /* single quote string */
  115:     [IN_SQ_UCODE3] = {
  116:         ['0' ... '9'] = IN_SQ_STRING,
  117:         ['a' ... 'f'] = IN_SQ_STRING,
  118:         ['A' ... 'F'] = IN_SQ_STRING,
  119:     },
  120:     [IN_SQ_UCODE2] = {
  121:         ['0' ... '9'] = IN_SQ_UCODE3,
  122:         ['a' ... 'f'] = IN_SQ_UCODE3,
  123:         ['A' ... 'F'] = IN_SQ_UCODE3,
  124:     },
  125:     [IN_SQ_UCODE1] = {
  126:         ['0' ... '9'] = IN_SQ_UCODE2,
  127:         ['a' ... 'f'] = IN_SQ_UCODE2,
  128:         ['A' ... 'F'] = IN_SQ_UCODE2,
  129:     },
  130:     [IN_SQ_UCODE0] = {
  131:         ['0' ... '9'] = IN_SQ_UCODE1,
  132:         ['a' ... 'f'] = IN_SQ_UCODE1,
  133:         ['A' ... 'F'] = IN_SQ_UCODE1,
  134:     },
  135:     [IN_SQ_STRING_ESCAPE] = {
  136:         ['b'] = IN_SQ_STRING,
  137:         ['f'] =  IN_SQ_STRING,
  138:         ['n'] =  IN_SQ_STRING,
  139:         ['r'] =  IN_SQ_STRING,
  140:         ['t'] =  IN_SQ_STRING,
  141:         ['/'] = IN_DQ_STRING,
  142:         ['\\'] = IN_DQ_STRING,
  143:         ['\''] = IN_SQ_STRING,
  144:         ['\"'] = IN_SQ_STRING,
  145:         ['u'] = IN_SQ_UCODE0,
  146:     },
  147:     [IN_SQ_STRING] = {
  148:         [1 ... 0xBF] = IN_SQ_STRING,
  149:         [0xC2 ... 0xF4] = IN_SQ_STRING,
  150:         ['\\'] = IN_SQ_STRING_ESCAPE,
  151:         ['\''] = JSON_STRING,
  152:     },
  153: 
  154:     /* Zero */
  155:     [IN_ZERO] = {
  156:         TERMINAL(JSON_INTEGER),
  157:         ['0' ... '9'] = IN_ERROR,
  158:         ['.'] = IN_MANTISSA,
  159:     },
  160: 
  161:     /* Float */
  162:     [IN_DIGITS] = {
  163:         TERMINAL(JSON_FLOAT),
  164:         ['0' ... '9'] = IN_DIGITS,
  165:     },
  166: 
  167:     [IN_DIGIT] = {
  168:         ['0' ... '9'] = IN_DIGITS,
  169:     },
  170: 
  171:     [IN_EXP_E] = {
  172:         ['-'] = IN_DIGIT,
  173:         ['+'] = IN_DIGIT,
  174:         ['0' ... '9'] = IN_DIGITS,
  175:     },
  176: 
  177:     [IN_MANTISSA_DIGITS] = {
  178:         TERMINAL(JSON_FLOAT),
  179:         ['0' ... '9'] = IN_MANTISSA_DIGITS,
  180:         ['e'] = IN_EXP_E,
  181:         ['E'] = IN_EXP_E,
  182:     },
  183: 
  184:     [IN_MANTISSA] = {
  185:         ['0' ... '9'] = IN_MANTISSA_DIGITS,
  186:     },
  187: 
  188:     /* Number */
  189:     [IN_NONZERO_NUMBER] = {
  190:         TERMINAL(JSON_INTEGER),
  191:         ['0' ... '9'] = IN_NONZERO_NUMBER,
  192:         ['e'] = IN_EXP_E,
  193:         ['E'] = IN_EXP_E,
  194:         ['.'] = IN_MANTISSA,
  195:     },
  196: 
  197:     [IN_NEG_NONZERO_NUMBER] = {
  198:         ['0'] = IN_ZERO,
  199:         ['1' ... '9'] = IN_NONZERO_NUMBER,
  200:     },
  201: 
  202:     /* keywords */
  203:     [IN_KEYWORD] = {
  204:         TERMINAL(JSON_KEYWORD),
  205:         ['a' ... 'z'] = IN_KEYWORD,
  206:     },
  207: 
  208:     /* whitespace */
  209:     [IN_WHITESPACE] = {
  210:         TERMINAL(JSON_SKIP),
  211:         [' '] = IN_WHITESPACE,
  212:         ['\t'] = IN_WHITESPACE,
  213:         ['\r'] = IN_WHITESPACE,
  214:         ['\n'] = IN_WHITESPACE,
  215:     },        
  216: 
  217:     /* escape */
  218:     [IN_ESCAPE_LL] = {
  219:         ['d'] = JSON_ESCAPE,
  220:     },
  221: 
  222:     [IN_ESCAPE_L] = {
  223:         ['d'] = JSON_ESCAPE,
  224:         ['l'] = IN_ESCAPE_LL,
  225:     },
  226: 
  227:     [IN_ESCAPE_I64] = {
  228:         ['d'] = JSON_ESCAPE,
  229:     },
  230: 
  231:     [IN_ESCAPE_I6] = {
  232:         ['4'] = IN_ESCAPE_I64,
  233:     },
  234: 
  235:     [IN_ESCAPE_I] = {
  236:         ['6'] = IN_ESCAPE_I6,
  237:     },
  238: 
  239:     [IN_ESCAPE] = {
  240:         ['d'] = JSON_ESCAPE,
  241:         ['i'] = JSON_ESCAPE,
  242:         ['p'] = JSON_ESCAPE,
  243:         ['s'] = JSON_ESCAPE,
  244:         ['f'] = JSON_ESCAPE,
  245:         ['l'] = IN_ESCAPE_L,
  246:         ['I'] = IN_ESCAPE_I,
  247:     },
  248: 
  249:     /* top level rule */
  250:     [IN_START] = {
  251:         ['"'] = IN_DQ_STRING,
  252:         ['\''] = IN_SQ_STRING,
  253:         ['0'] = IN_ZERO,
  254:         ['1' ... '9'] = IN_NONZERO_NUMBER,
  255:         ['-'] = IN_NEG_NONZERO_NUMBER,
  256:         ['{'] = JSON_OPERATOR,
  257:         ['}'] = JSON_OPERATOR,
  258:         ['['] = JSON_OPERATOR,
  259:         [']'] = JSON_OPERATOR,
  260:         [','] = JSON_OPERATOR,
  261:         [':'] = JSON_OPERATOR,
  262:         ['a' ... 'z'] = IN_KEYWORD,
  263:         ['%'] = IN_ESCAPE,
  264:         [' '] = IN_WHITESPACE,
  265:         ['\t'] = IN_WHITESPACE,
  266:         ['\r'] = IN_WHITESPACE,
  267:         ['\n'] = IN_WHITESPACE,
  268:     },
  269: };
  270: 
  271: void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
  272: {
  273:     lexer->emit = func;
  274:     lexer->state = IN_START;
  275:     lexer->token = qstring_new();
  276:     lexer->x = lexer->y = 0;
  277: }
  278: 
  279: static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
  280: {
  281:     int char_consumed, new_state;
  282: 
  283:     lexer->x++;
  284:     if (ch == '\n') {
  285:         lexer->x = 0;
  286:         lexer->y++;
  287:     }
  288: 
  289:     do {
  290:         new_state = json_lexer[lexer->state][(uint8_t)ch];
  291:         char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
  292:         if (char_consumed) {
  293:             qstring_append_chr(lexer->token, ch);
  294:         }
  295: 
  296:         switch (new_state) {
  297:         case JSON_OPERATOR:
  298:         case JSON_ESCAPE:
  299:         case JSON_INTEGER:
  300:         case JSON_FLOAT:
  301:         case JSON_KEYWORD:
  302:         case JSON_STRING:
  303:             lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
  304:             /* fall through */
  305:         case JSON_SKIP:
  306:             QDECREF(lexer->token);
  307:             lexer->token = qstring_new();
  308:             new_state = IN_START;
  309:             break;
  310:         case IN_ERROR:
  311:             /* XXX: To avoid having previous bad input leaving the parser in an
  312:              * unresponsive state where we consume unpredictable amounts of
  313:              * subsequent "good" input, percolate this error state up to the
  314:              * tokenizer/parser by forcing a NULL object to be emitted, then
  315:              * reset state.
  316:              *
  317:              * Also note that this handling is required for reliable channel
  318:              * negotiation between QMP and the guest agent, since chr(0xFF)
  319:              * is placed at the beginning of certain events to ensure proper
  320:              * delivery when the channel is in an unknown state. chr(0xFF) is
  321:              * never a valid ASCII/UTF-8 sequence, so this should reliably
  322:              * induce an error/flush state.
  323:              */
  324:             lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
  325:             QDECREF(lexer->token);
  326:             lexer->token = qstring_new();
  327:             new_state = IN_START;
  328:             lexer->state = new_state;
  329:             return 0;
  330:         default:
  331:             break;
  332:         }
  333:         lexer->state = new_state;
  334:     } while (!char_consumed && !flush);
  335: 
  336:     /* Do not let a single token grow to an arbitrarily large size,
  337:      * this is a security consideration.
  338:      */
  339:     if (lexer->token->length > MAX_TOKEN_SIZE) {
  340:         lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
  341:         QDECREF(lexer->token);
  342:         lexer->token = qstring_new();
  343:         lexer->state = IN_START;
  344:     }
  345: 
  346:     return 0;
  347: }
  348: 
  349: int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
  350: {
  351:     size_t i;
  352: 
  353:     for (i = 0; i < size; i++) {
  354:         int err;
  355: 
  356:         err = json_lexer_feed_char(lexer, buffer[i], false);
  357:         if (err < 0) {
  358:             return err;
  359:         }
  360:     }
  361: 
  362:     return 0;
  363: }
  364: 
  365: int json_lexer_flush(JSONLexer *lexer)
  366: {
  367:     return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true);
  368: }
  369: 
  370: void json_lexer_destroy(JSONLexer *lexer)
  371: {
  372:     QDECREF(lexer->token);
  373: }

unix.superglobalmegacorp.com