/* * This file is part of LibCSS. * Licensed under the MIT License, * http://www.opensource.org/licenses/mit-license.php * Copyright 2008 John-Mark Bell */ /** \file CSS lexer * * See docs/Tokens for the production rules used by this lexer. * * See docs/Lexer for the inferred first characters for each token. * * See also CSS3 Syntax module and CSS2.1 $4.1.1 + errata * * The lexer assumes that all invalid Unicode codepoints have been converted * to U+FFFD by the input stream. * * The lexer comprises a state machine, the top-level of which is derived from * the First sets in docs/Lexer. Each top-level state may contain a number of * sub states. These enable restarting of the parser. */ #include #include #include #include #include #include #include #include #include "lex/lex.h" #include "utils/parserutilserror.h" #include "utils/utils.h" /** \todo Optimisation -- we're currently revisiting a bunch of input * characters (Currently, we're calling parserutils_inputstream_peek * about 1.5x the number of characters in the input stream). Ideally, * we'll visit each character in the input exactly once. In reality, * the upper bound is twice, due to the need, in some cases, to read * one character beyond the end of a token's input to detect the end * of the token. Resumability adds a little overhead here, unless * we're somewhat more clever when it comes to having support for * restarting mid-escape sequence. Currently, we rewind back to the * start of the sequence and process the whole thing again. 
*/

/**
 * Top-level lexer states.
 *
 * The current value is stored in the 4-bit css_lexer::state field, so every
 * enumerator has to fit in the range 0..15.
 */
enum {
	sSTART		=  0,
	sATKEYWORD	=  1,
	sSTRING		=  2,
	sHASH		=  3,
	sNUMBER		=  4,
	sCDO		=  5,
	sCDC		=  6,
	sS		=  7,
	sCOMMENT	=  8,
	sMATCH		=  9,
	sURI		= 10,
	sIDENT		= 11,
	sESCAPEDIDENT	= 12,
	sURL		= 13,
	sUCR		= 14
};

/**
 * CSS lexer object
 */
struct css_lexer
{
	parserutils_inputstream *input;	/**< Inputstream containing CSS */

	size_t bytesReadForToken;	/**< Total bytes read from the
					 * inputstream for the current token */

	css_token token;		/**< The current token */

	bool escapeSeen;		/**< Whether an escape sequence has
					 * been seen while processing the input
					 * for the current token */
	parserutils_buffer *unescapedTokenData;	/**< Buffer containing
						 * unescaped token data
						 * (used iff escapeSeen == true)
						 */

	uint32_t state    : 4,		/**< Current state (one of the
					 * s* enumerators above) */
		 substate : 4;		/**< Current substate; its meaning is
					 * private to each state handler */

	struct {
		uint8_t first;		/**< First character read for token */
		size_t origBytes;	/**< Storage of current number of
					 * bytes read, for rewinding */
		bool lastWasStar;	/**< Whether the previous character
					 * was an asterisk */
		bool lastWasCR;		/**< Whether the previous character
					 * was CR */
		size_t bytesForURL;	/**< Input bytes read for "url(", for
					 * rewinding */
		size_t dataLenForURL;	/**< Output length for "url(", for
					 * rewinding */
		int hexCount;		/**< Counter for reading hex digits */
	} context;			/**< Context for the current state */

	bool emit_comments;		/**< Whether to emit comment tokens */

	uint32_t currentCol;		/**< Current column in source */
	uint32_t currentLine;		/**< Current line in source */

	css_allocator_fn alloc;		/**< Memory (de)allocation function */
	void *pw;			/**< Pointer to client-specific data */
};

/*
 * Append len bytes at data to the current token and account for them in
 * both bytesReadForToken and currentCol.
 *
 * On failure of appendToTokenData() this macro executes `return error;` in
 * the function that invoked it, so it may only be used inside functions
 * returning css_error. The len argument is expanded three times; pass an
 * expression without side effects.
 *
 * The definition ends with a line continuation, so the line directly after
 * it must remain blank.
 */
#define APPEND(lexer, data, len)					\
do {									\
	css_error error;						\
	error = appendToTokenData((lexer), 				\
			(const uint8_t *) (data), (len));		\
	if (error != CSS_OK)						\
		return error;						\
	(lexer)->bytesReadForToken += (len);				\
	(lexer)->currentCol += (len);					\
} while(0)								\

/* Utility routines */
static css_error appendToTokenData(css_lexer *lexer,
		const uint8_t *data, size_t len);
static css_error emitToken(css_lexer *lexer, css_token_type type,
		css_token **token);

/* State machine components */
static css_error AtKeyword(css_lexer *lexer, css_token **token);
static css_error CDCOrIdentOrFunctionOrNPD(css_lexer *lexer,
		css_token **token);
static css_error CDO(css_lexer *lexer, css_token **token);
static css_error Comment(css_lexer *lexer, css_token **token);
static css_error EscapedIdentOrFunction(css_lexer *lexer,
		css_token **token);
static css_error Hash(css_lexer *lexer, css_token **token);
static css_error IdentOrFunction(css_lexer *lexer,
		css_token **token);
static css_error Match(css_lexer *lexer, css_token **token);
static css_error NumberOrPercentageOrDimension(css_lexer *lexer,
		css_token **token);
static css_error S(css_lexer *lexer, css_token **token);
static css_error Start(css_lexer *lexer, css_token **token);
static css_error String(css_lexer *lexer, css_token **token);
static css_error URIOrUnicodeRangeOrIdentOrFunction(
		css_lexer *lexer, css_token **token);
static css_error URI(css_lexer *lexer, css_token **token);
static css_error UnicodeRange(css_lexer *lexer, css_token **token);

/* Helpers called by the state machine components */
static css_error consumeDigits(css_lexer *lexer);
static css_error consumeEscape(css_lexer *lexer, bool nl);
static css_error consumeNMChars(css_lexer *lexer);
static css_error consumeString(css_lexer *lexer);
static css_error consumeStringChars(css_lexer *lexer);
static css_error consumeUnicode(css_lexer *lexer, uint32_t ucs);
static css_error consumeURLChars(css_lexer *lexer);
static css_error consumeWChars(css_lexer *lexer);

static inline bool startNMChar(uint8_t c);
static inline bool startNMStart(uint8_t c);
static inline bool startStringChar(uint8_t c);
static inline bool startURLChar(uint8_t c);
static inline bool isSpace(uint8_t c);

/**
 * Create a lexer instance
 *
 * \param input  The inputstream to read from
 * \param alloc  Memory (de)allocation function
 * \param pw     Pointer to client-specific private data
 * \param lexer  Pointer to location to receive lexer instance
 * \return CSS_OK on success,
 *         CSS_BADPARM on bad parameters,
 *         CSS_NOMEM
on memory exhaustion */ css_error css_lexer_create(parserutils_inputstream *input, css_allocator_fn alloc, void *pw, css_lexer **lexer) { css_lexer *lex; if (input == NULL || alloc == NULL || lexer == NULL) return CSS_BADPARM; lex = alloc(NULL, sizeof(css_lexer), pw); if (lex == NULL) return CSS_NOMEM; lex->input = input; lex->bytesReadForToken = 0; lex->token.type = CSS_TOKEN_EOF; lex->token.data.data = NULL; lex->token.data.len = 0; lex->escapeSeen = false; lex->unescapedTokenData = NULL; lex->state = sSTART; lex->substate = 0; lex->emit_comments = false; lex->currentCol = 1; lex->currentLine = 1; lex->alloc = alloc; lex->pw = pw; *lexer = lex; return CSS_OK; } /** * Destroy a lexer instance * * \param lexer The instance to destroy * \return CSS_OK on success, appropriate error otherwise */ css_error css_lexer_destroy(css_lexer *lexer) { if (lexer == NULL) return CSS_BADPARM; if (lexer->unescapedTokenData != NULL) parserutils_buffer_destroy(lexer->unescapedTokenData); lexer->alloc(lexer, 0, lexer->pw); return CSS_OK; } /** * Configure a lexer instance * * \param lexer The lexer to configure * \param type The option type to modify * \param params Option-specific parameters * \return CSS_OK on success, appropriate error otherwise */ css_error css_lexer_setopt(css_lexer *lexer, css_lexer_opttype type, css_lexer_optparams *params) { if (lexer == NULL || params == NULL) return CSS_BADPARM; switch (type) { case CSS_LEXER_EMIT_COMMENTS: lexer->emit_comments = params->emit_comments; break; default: return CSS_BADPARM; } return CSS_OK; } /** * Retrieve a token from a lexer * * \param lexer The lexer instance to read from * \param token Pointer to location to receive pointer to token * \return CSS_OK on success, appropriate error otherwise * * The returned token object is owned by the lexer. However, the client is * permitted to modify the data members of the token. 
The token must not be
 * freed by the client (it may not have been allocated in the first place),
 * nor may any of the pointers contained within it. The client may, if they
 * wish, overwrite any data member of the returned token object -- the lexer
 * does not depend on these remaining constant. This allows the client code
 * to efficiently implement a push-back buffer with interned string data.
 */
css_error css_lexer_get_token(css_lexer *lexer, css_token **token)
{
	css_error status;

	if (lexer == NULL || token == NULL)
		return CSS_BADPARM;

	/* Re-enter whichever state handler was active when the previous
	 * call returned. */
	switch (lexer->state) {
	case sSTART:
		return Start(lexer, token);

	case sATKEYWORD:
		return AtKeyword(lexer, token);

	case sSTRING:
		return String(lexer, token);

	case sHASH:
		return Hash(lexer, token);

	case sNUMBER:
		return NumberOrPercentageOrDimension(lexer, token);

	case sCDO:
		return CDO(lexer, token);

	case sCDC:
		return CDCOrIdentOrFunctionOrNPD(lexer, token);

	case sS:
		return S(lexer, token);

	case sCOMMENT:
		status = Comment(lexer, token);

		/* Hand the result straight back unless a complete comment
		 * was lexed while comment emission is switched off. The
		 * token is only inspected once the first two tests fail. */
		if (lexer->emit_comments || status != CSS_OK ||
				(*token)->type != CSS_TOKEN_COMMENT)
			return status;

		/* Swallow the comment and lex whatever follows it */
		return Start(lexer, token);

	case sMATCH:
		return Match(lexer, token);

	case sURI:
	case sURL:
		/* Both states share a single handler */
		return URI(lexer, token);

	case sIDENT:
		return IdentOrFunction(lexer, token);

	case sESCAPEDIDENT:
		return EscapedIdentOrFunction(lexer, token);

	case sUCR:
		return UnicodeRange(lexer, token);
	}

	/* Should never be reached */
	assert(0);

	return CSS_OK;
}

/******************************************************************************
 * Utility routines                                                           *
 ******************************************************************************/

/**
 * Append some data to the current token
 *
 * \param lexer  The lexer instance
 * \param data   Pointer to data to append
 * \param len    Length, in bytes, of data
 * \return CSS_OK on success, appropriate error otherwise
 *
 * This should not be called directly without good reason. Use the APPEND()
 * macro instead.
*/
css_error appendToTokenData(css_lexer *lexer, const uint8_t *data, size_t len)
{
	css_token *token = &lexer->token;

	/* Once an escape has been seen, the token's text is accumulated in
	 * the unescaped buffer, so the bytes have to be copied there. */
	if (lexer->escapeSeen) {
		css_error error = css_error_from_parserutils_error(
				parserutils_buffer_append(
					lexer->unescapedTokenData, data, len));
		if (error != CSS_OK)
			return error;
	}

	/* The token length grows whether or not the bytes were copied */
	token->data.len += len;

	return CSS_OK;
}

/**
 * Prepare a token for consumption and emit it to the client
 *
 * \param lexer  The lexer instance
 * \param type   The type of token to emit
 * \param token  Pointer to location to receive pointer to token
 * \return CSS_OK on success, appropriate error otherwise
 *
 * The token's data pointer is set to the unescaped buffer when an escape
 * was seen, otherwise to the start of the inputstream's current data (NULL
 * for CSS_TOKEN_EOF). Type-specific delimiters are then trimmed from the
 * data, and the lexer is returned to its start state.
 *
 * For CSS_TOKEN_STRING, a final character is removed only if it is the same
 * quote character that opened the string. A string cut short by the end of
 * the input therefore keeps a trailing quote of the other kind as content.
 */
css_error emitToken(css_lexer *lexer, css_token_type type,
		css_token **token)
{
	css_token *t = &lexer->token;

	t->type = type;

	/* Calculate token data start pointer. We have to do this here as
	 * the inputstream's buffer may have moved under us. */
	if (lexer->escapeSeen) {
		t->data.data = lexer->unescapedTokenData->data;
	} else {
		size_t clen;
		const uint8_t *data;
		parserutils_error error;

		error = parserutils_inputstream_peek(lexer->input, 0,
				&data, &clen);

		assert(type == CSS_TOKEN_EOF || error == PARSERUTILS_OK);

		t->data.data = (type == CSS_TOKEN_EOF) ? NULL : (uint8_t *) data;
	}

	switch (type) {
	case CSS_TOKEN_ATKEYWORD:
		/* Strip the '@' from the front */
		t->data.data += 1;
		t->data.len -= 1;
		break;
	case CSS_TOKEN_STRING:
	{
		/* Remember which quote opened the string before dropping it */
		const uint8_t quote = t->data.data[0];

		/* Strip the leading quote */
		t->data.data += 1;
		t->data.len -= 1;

		/* Strip the trailing quote, iff it exists (may have hit EOF).
		 * Only the opening quote character can close the string; the
		 * other quote character is ordinary string content.
		 * NOTE(review): when escapes were seen, a final quote produced
		 * by an escape cannot be told apart from a real closing quote
		 * here -- confirm whether that case needs handling upstream. */
		if (t->data.len > 0 &&
				t->data.data[t->data.len - 1] == quote) {
			t->data.len -= 1;
		}
		break;
	}
	case CSS_TOKEN_INVALID_STRING:
		/* Strip the leading quote */
		t->data.data += 1;
		t->data.len -= 1;
		break;
	case CSS_TOKEN_HASH:
		/* Strip the '#' from the front */
		t->data.data += 1;
		t->data.len -= 1;
		break;
	case CSS_TOKEN_PERCENTAGE:
		/* Strip the '%' from the end */
		t->data.len -= 1;
		break;
	case CSS_TOKEN_DIMENSION:
		break;
	case CSS_TOKEN_URI:
		/* Strip the "url(" from the start */
		t->data.data += SLEN("url(");
		t->data.len -= SLEN("url(");

		/* Strip any leading whitespace */
		while (isSpace(t->data.data[0])) {
			t->data.data++;
			t->data.len--;
		}

		/* Strip any leading quote */
		if (t->data.data[0] == '"' || t->data.data[0] == '\'') {
			t->data.data += 1;
			t->data.len -= 1;
		}

		/* Strip the trailing ')' */
		t->data.len -= 1;

		/* Strip any trailing whitespace */
		while (t->data.len > 0 &&
				isSpace(t->data.data[t->data.len - 1])) {
			t->data.len--;
		}

		/* Strip any trailing quote */
		if (t->data.len > 0 && (t->data.data[t->data.len - 1] == '"' ||
				t->data.data[t->data.len - 1] == '\'')) {
			t->data.len -= 1;
		}
		break;
	case CSS_TOKEN_UNICODE_RANGE:
		/* Remove "U+" from the start */
		t->data.data += SLEN("U+");
		t->data.len -= SLEN("U+");
		break;
	case CSS_TOKEN_COMMENT:
		/* Strip the leading '/' and '*' */
		t->data.data += SLEN("/*");
		t->data.len -= SLEN("/*");

		/* Strip the trailing '*' and '/' */
		t->data.len -= SLEN("*/");
		break;
	case CSS_TOKEN_FUNCTION:
		/* Strip the trailing '(' */
		t->data.len -= 1;
		break;
	default:
		break;
	}

	*token = t;

	/* Reset the lexer's state */
	lexer->state = sSTART;
	lexer->substate = 0;

	return CSS_OK;
}
/******************************************************************************
 * State machine components                                                   *
 ******************************************************************************/

/**
 * Lex the remainder of an at-keyword, the '@' having been consumed already.
 *
 * \param lexer  The lexer instance
 * \param token  Pointer to location to receive pointer to token
 * \return CSS_OK on success, appropriate error otherwise
 *
 * Emits CSS_TOKEN_ATKEYWORD when an identifier follows the '@'. Emits
 * CSS_TOKEN_CHAR when the input ends after the '@', when startNMStart()
 * rejects the next character, or when a leading escape is reported as
 * CSS_EOF or CSS_INVALID.
 *
 * lexer->substate is set before each consume step, so a later call
 * re-enters the function at that step.
 */
css_error AtKeyword(css_lexer *lexer, css_token **token)
{
	const uint8_t *cptr;
	uint8_t c;
	size_t clen;
	css_error error;
	parserutils_error perror;
	enum { Initial = 0, Escape = 1, NMChar = 2 };

	/* ATKEYWORD = '@' ident
	 *
	 * The '@' has been consumed.
	 */

	switch (lexer->substate) {
	case Initial:
		/* Look at the character following what has been read so far */
		perror = parserutils_inputstream_peek(lexer->input,
				lexer->bytesReadForToken, &cptr, &clen);
		if (perror != PARSERUTILS_OK && perror != PARSERUTILS_EOF)
			return css_error_from_parserutils_error(perror);

		/* Nothing after the '@': it stands alone as a CHAR */
		if (perror == PARSERUTILS_EOF)
			return emitToken(lexer, CSS_TOKEN_CHAR, token);

		c = *cptr;

		if (!startNMStart(c))
			return emitToken(lexer, CSS_TOKEN_CHAR, token);

		if (c != '\\') {
			APPEND(lexer, cptr, clen);
		} else {
			/* The backslash is not appended as token data; only
			 * the read offset moves past it before the escape
			 * is consumed. */
			lexer->bytesReadForToken += clen;
			goto escape;
		}

		/* Fall through */
	case NMChar:
	nmchar:
		lexer->substate = NMChar;
		error = consumeNMChars(lexer);
		if (error != CSS_OK)
			return error;
		break;

	case Escape:
	escape:
		lexer->substate = Escape;
		error = consumeEscape(lexer, false);
		if (error != CSS_OK) {
			if (error == CSS_EOF || error == CSS_INVALID) {
				/* Rewind the '\\' */
				lexer->bytesReadForToken -= 1;

				return emitToken(lexer, CSS_TOKEN_CHAR, token);
			}

			return error;
		}

		/* Escape accepted: continue with the rest of the name */
		goto nmchar;
	}

	return emitToken(lexer, CSS_TOKEN_ATKEYWORD, token);
}

/**
 * Lex a token whose first character was a '-', already consumed.
 *
 * \param lexer  The lexer instance
 * \param token  Pointer to location to receive pointer to token
 * \return CSS_OK on success, appropriate error otherwise
 *
 * Depending on the next character this function:
 *  - hands over to NumberOrPercentageOrDimension() (digit or '.'),
 *  - hands over to IdentOrFunction() (character accepted by startNMStart(),
 *    including a backslash whose escape is consumed successfully),
 *  - looks for the '>' that completes CDC (second '-'), or
 *  - emits CSS_TOKEN_CHAR.
 *
 * lexer->substate is set before the Gt and Escape steps, so a later call
 * re-enters the function at that step.
 */
css_error CDCOrIdentOrFunctionOrNPD(css_lexer *lexer, css_token **token)
{
	css_token *t = &lexer->token;
	const uint8_t *cptr;
	uint8_t c;
	size_t clen;
	css_error error;
	parserutils_error perror;
	enum { Initial = 0, Escape = 1, Gt = 2 };

	/* CDC = "-->"
	 * IDENT = [-]? nmstart nmchar*
	 * FUNCTION = [-]? nmstart nmchar* '('
	 * NUMBER = num = [-+]? ([0-9]+ | [0-9]* '.' [0-9]+)
	 * PERCENTAGE = num '%'
	 * DIMENSION = num ident
	 *
	 * The first dash has been consumed. Thus, we must consume the next
	 * character in the stream. If it's a dash, then we're dealing with
	 * CDC. If it's a digit or dot, then we're dealing with NPD.
	 * Otherwise, we're dealing with IDENT/FUNCTION.
	 */

	switch (lexer->substate) {
	case Initial:
		perror = parserutils_inputstream_peek(lexer->input,
				lexer->bytesReadForToken, &cptr, &clen);
		if (perror != PARSERUTILS_OK && perror != PARSERUTILS_EOF)
			return css_error_from_parserutils_error(perror);

		if (perror == PARSERUTILS_EOF) {
			/* We can only match char with what we've read so far */
			return emitToken(lexer, CSS_TOKEN_CHAR, token);
		}

		c = *cptr;

		if (isDigit(c) || c == '.') {
			/* NPD */
			APPEND(lexer, cptr, clen);
			lexer->state = sNUMBER;
			lexer->substate = 0;
			/* Abuse "first" to store first non-sign character */
			lexer->context.first = c;
			return NumberOrPercentageOrDimension(lexer, token);
		}

		if (c != '-' && !startNMStart(c)) {
			/* Can only be CHAR */
			return emitToken(lexer, CSS_TOKEN_CHAR, token);
		}

		/* A backslash is not token data; it is handled by the
		 * escape step below. Anything else is appended now. */
		if (c != '\\') {
			APPEND(lexer, cptr, clen);
		}

		if (c != '-') {
			if (c == '\\') {
				/* Step over the backslash, then consume
				 * the escape */
				lexer->bytesReadForToken += clen;
				goto escape;
			}

			lexer->state = sIDENT;
			lexer->substate = 0;
			return IdentOrFunction(lexer, token);
		}

		/* Fall through */
	case Gt:
		lexer->substate = Gt;

		/* Ok, so we're dealing with CDC. Expect a '>' */
		perror = parserutils_inputstream_peek(lexer->input,
				lexer->bytesReadForToken, &cptr, &clen);
		if (perror != PARSERUTILS_OK && perror != PARSERUTILS_EOF)
			return css_error_from_parserutils_error(perror);

		if (perror == PARSERUTILS_EOF) {
			/* CHAR is the only match here */
			/* Remove the '-' we read above */
			/* NOTE(review): APPEND() also advanced
			 * lexer->currentCol for that '-' and it is not
			 * rewound here -- confirm this is intended. */
			lexer->bytesReadForToken -= 1;
			t->data.len -= 1;
			return emitToken(lexer, CSS_TOKEN_CHAR, token);
		}

		c = *cptr;

		if (c == '>') {
			APPEND(lexer, cptr, clen);

			t->type = CSS_TOKEN_CDC;
		} else {
			/* Remove the '-' we read above */
			/* NOTE(review): as above, lexer->currentCol is left
			 * as advanced by APPEND() -- confirm. */
			lexer->bytesReadForToken -= 1;
			t->data.len -= 1;
			t->type = CSS_TOKEN_CHAR;
		}
		break;

	case Escape:
	escape:
		lexer->substate = Escape;
		error = consumeEscape(lexer, false);
		if (error != CSS_OK) {
			if (error == CSS_EOF || error == CSS_INVALID) {
				/* Rewind the '\\' */
				lexer->bytesReadForToken -= 1;

				return emitToken(lexer, CSS_TOKEN_CHAR, token);
			}

			return error;
		}

		/* Escape accepted: the rest is lexed as IDENT/FUNCTION */
		lexer->state = sIDENT;
		lexer->substate = 0;
		return IdentOrFunction(lexer, token);
	}

	/* Only the Gt step reaches this point; it stored the type to emit */
	return emitToken(lexer, t->type, token);
}

css_error CDO(css_lexer *lexer, css_token **token)
{
	css_token *t = &lexer->token;
	const uint8_t *cptr;
	uint8_t c;
	size_t clen;
	parserutils_error perror;
	enum { Initial = 0, Dash1 = 1, Dash2 = 2 };

	/* CDO = "