# Child makefile fragment
#
# This fragment is not a standalone makefile: it is pulled in by a parent
# makefile and only contributes to variables that the top level consumes.
#
# Toolchain is provided by top-level makefile
#
# Variables provided by top-level makefile
#
# COMPONENT		The name of the component
# EXPORT		The location of the export directory
# TOP			The location of the source tree root
# RELEASEDIR		The place to put release objects
# DEBUGDIR		The place to put debug objects
#
# do_include		Canned command sequence to include a child makefile
#
# Variables provided by parent makefile:
#
# DIR			The name of the directory we're in, relative to $(TOP)
#
# Variables we can manipulate:
#
# ITEMS_CLEAN		The list of items to remove for "make clean"
# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
# TARGET_TESTS		The list of target names to run for "make test"
#
# SOURCES		The list of sources to build for $(COMPONENT)
#
# Plus anything from the toolchain

# Push parent directory onto the directory stack.
# $(sp) is the stack pointer: each nesting level appends one ".x" suffix, so
# dirstack_$(sp) names a distinct variable per level in which the parent's
# $(d) is saved before $(d) is repointed at this directory.
sp := $(sp).x
dirstack_$(sp) := $(d)
d := $(DIR)

# Manipulate include paths: make headers in this directory visible
CFLAGS := $(CFLAGS) -I$(d)

# Sources (relative to this directory)
SRCS_$(d) := lex.c

# Append to sources for component.
# NOTE(review): $(d) is used as a raw prefix here and in the wildcard below,
# so $(DIR) presumably ends with a '/' -- confirm against the parent makefile.
SOURCES += $(addprefix $(d), $(SRCS_$(d)))

# Now include any children we may have (every immediate subdirectory that
# contains a Makefile), using the do_include sequence from the top level
MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))

# Finally, pop off the directory stack: restore the parent's $(d), then drop
# the last ".x" suffix from the stack pointer ($(basename) strips one suffix)
d := $(dirstack_$(sp))
sp := $(basename $(sp))
+ */ + +enum { + sSTART = 0, + sATKEYWORD = 1, + sSTRING = 2, + sHASH = 3, + sNUMBER = 4, + sCDO = 5, + sCDC = 6, + sS = 7, + sCOMMENT = 8, + sMATCH = 9, + sURI = 10, + sIDENT = 11, + sESCAPEDIDENT = 12, + sURL = 13, + sUCR = 14 +}; + +/** + * CSS lexer object + */ +struct css_lexer +{ + parserutils_inputstream *input; /**< Inputstream containing CSS */ + + size_t bytesReadForToken; /**< Total bytes read from the + * inputstream for the current token */ + + css_token token; /**< The current token */ + + bool escapeSeen; /**< Whether an escape sequence has + * been seen while processing the input + * for the current token */ + parserutils_buffer *unescapedTokenData; /**< Buffer containing + * unescaped token data + * (used iff escapeSeen == true) + */ + + uint32_t state : 4, /**< Current state */ + substate : 4; /**< Current substate */ + + struct { + uint8_t first; /**< First character read for token */ + size_t origBytes; /**< Storage of current number of + * bytes read, for rewinding */ + bool lastWasStar; /**< Whether the previous character + * was an asterisk */ + bool lastWasCR; /**< Whether the previous character + * was CR */ + size_t bytesForURL; /**< Input bytes read for "url(", for + * rewinding */ + size_t dataLenForURL; /**< Output length for "url(", for + * rewinding */ + int hexCount; /**< Counter for reading hex digits */ + } context; /**< Context for the current state */ + + bool emit_comments; /**< Whether to emit comment tokens */ + + uint32_t currentCol; /**< Current column in source */ + uint32_t currentLine; /**< Current line in source */ + + css_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Pointer to client-specific data */ +}; + +#define APPEND(lexer, data, len) \ +do { \ + css_error error; \ + error = appendToTokenData((lexer), \ + (const uint8_t*) (data), (len)); \ + if (error != CSS_OK) \ + return error; \ + (lexer)->bytesReadForToken += (len); \ + (lexer)->currentCol += (len); \ +} while(0) \ + +static inline 
css_error appendToTokenData(css_lexer *lexer, + const uint8_t *data, size_t len); +static inline css_error emitToken(css_lexer *lexer, css_token_type type, + const css_token **token); + +static inline css_error AtKeyword(css_lexer *lexer, const css_token **token); +static inline css_error CDCOrIdentOrFunction(css_lexer *lexer, + const css_token **token); +static inline css_error CDO(css_lexer *lexer, const css_token **token); +static inline css_error Comment(css_lexer *lexer, const css_token **token); +static inline css_error EscapedIdentOrFunction(css_lexer *lexer, + const css_token **token); +static inline css_error Hash(css_lexer *lexer, const css_token **token); +static inline css_error IdentOrFunction(css_lexer *lexer, + const css_token **token); +static inline css_error Match(css_lexer *lexer, const css_token **token); +static inline css_error NumberOrPercentageOrDimension(css_lexer *lexer, + const css_token **token); +static inline css_error S(css_lexer *lexer, const css_token **token); +static inline css_error Start(css_lexer *lexer, const css_token **token); +static inline css_error String(css_lexer *lexer, const css_token **token); +static inline css_error URIOrUnicodeRangeOrIdentOrFunction( + css_lexer *lexer, const css_token **token); +static inline css_error URI(css_lexer *lexer, const css_token **token); +static inline css_error UnicodeRange(css_lexer *lexer, const css_token **token); + +static inline css_error consumeDigits(css_lexer *lexer); +static inline css_error consumeEscape(css_lexer *lexer, bool nl); +static inline css_error consumeNMChars(css_lexer *lexer); +static inline css_error consumeString(css_lexer *lexer); +static inline css_error consumeStringChars(css_lexer *lexer); +static inline css_error consumeUnicode(css_lexer *lexer, uint32_t ucs); +static inline css_error consumeURLChars(css_lexer *lexer); +static inline css_error consumeWChars(css_lexer *lexer); + +static inline uint32_t charToHex(uint8_t c); +static inline bool 
startNMChar(uint8_t c); +static inline bool startNMStart(uint8_t c); +static inline bool startStringChar(uint8_t c); +static inline bool startURLChar(uint8_t c); +static inline bool isDigit(uint8_t c); +static inline bool isHex(uint8_t c); +static inline bool isSpace(uint8_t c); + +/** + * Create a lexer instance + * + * \param input The inputstream to read from + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data + * \return Pointer to instance, or NULL on memory exhaustion + */ +css_lexer *css_lexer_create(parserutils_inputstream *input, + css_alloc alloc, void *pw) +{ + css_lexer *lex; + + if (input == NULL || alloc == NULL) + return NULL; + + lex = alloc(NULL, sizeof(css_lexer), pw); + if (lex == NULL) + return NULL; + + lex->input = input; + lex->bytesReadForToken = 0; + lex->token.type = CSS_TOKEN_EOF; + lex->token.data.ptr = NULL; + lex->token.data.len = 0; + lex->escapeSeen = false; + lex->unescapedTokenData = NULL; + lex->state = sSTART; + lex->substate = 0; + lex->emit_comments = false; + lex->currentCol = 1; + lex->currentLine = 1; + lex->alloc = alloc; + lex->pw = pw; + + return lex; +} + +/** + * Destroy a lexer instance + * + * \param lexer The instance to destroy + */ +void css_lexer_destroy(css_lexer *lexer) +{ + if (lexer == NULL) + return; + + if (lexer->unescapedTokenData != NULL) + parserutils_buffer_destroy(lexer->unescapedTokenData); + + lexer->alloc(lexer, 0, lexer->pw); +} + +/** + * Configure a lexer instance + * + * \param lexer The lexer to configure + * \param type The option type to modify + * \param params Option-specific parameters + * \return CSS_OK on success, appropriate error otherwise + */ +css_error css_lexer_setopt(css_lexer *lexer, css_lexer_opttype type, + css_lexer_optparams *params) +{ + if (lexer == NULL || params == NULL) + return CSS_BADPARM; + + switch (type) { + case CSS_LEXER_EMIT_COMMENTS: + lexer->emit_comments = params->emit_comments; + break; + default: + return 
CSS_BADPARM; + } + + return CSS_OK; +} + +/** + * Retrieve a token from a lexer + * + * \param lexer The lexer instance to read from + * \param token Pointer to location to receive pointer to token + * \return CSS_OK on success, appropriate error otherwise + */ +css_error css_lexer_get_token(css_lexer *lexer, const css_token **token) +{ + css_error error; + + if (lexer == NULL || token == NULL) + return CSS_BADPARM; + + switch (lexer->state) + { + case sSTART: + start: + return Start(lexer, token); + case sATKEYWORD: + return AtKeyword(lexer, token); + case sSTRING: + return String(lexer, token); + case sHASH: + return Hash(lexer, token); + case sNUMBER: + return NumberOrPercentageOrDimension(lexer, token); + case sCDO: + return CDO(lexer, token); + case sCDC: + return CDCOrIdentOrFunction(lexer, token); + case sS: + return S(lexer, token); + case sCOMMENT: + error = Comment(lexer, token); + if (!lexer->emit_comments && error == CSS_OK) + goto start; + return error; + case sMATCH: + return Match(lexer, token); + case sURI: + return URI(lexer, token); + case sIDENT: + return IdentOrFunction(lexer, token); + case sESCAPEDIDENT: + return EscapedIdentOrFunction(lexer, token); + case sURL: + return URI(lexer, token); + case sUCR: + return UnicodeRange(lexer, token); + } + + /* Should never be reached */ + assert(0); + + return CSS_OK; +} + +/****************************************************************************** + * Utility routines * + ******************************************************************************/ + +/** + * Append some data to the current token + * + * \param lexer The lexer instance + * \param data Pointer to data to append + * \param len Length, in bytes, of data + * \return CSS_OK on success, appropriate error otherwise + * + * This should not be called directly without good reason. Use the APPEND() + * macro instead. 
+ */ +css_error appendToTokenData(css_lexer *lexer, const uint8_t *data, size_t len) +{ + css_token *token = &lexer->token; + + if (lexer->escapeSeen) { + css_error error = css_error_from_parserutils_error( + parserutils_buffer_append( + lexer->unescapedTokenData, data, len)); + if (error != CSS_OK) + return error; + } + + token->data.len += len; + + return CSS_OK; +} + +/** + * Prepare a token for consumption and emit it to the client + * + * \param lexer The lexer instance + * \param type The type of token to emit + * \param token Pointer to location to receive pointer to token + * \return CSS_OK on success, appropriate error otherwise + */ +css_error emitToken(css_lexer *lexer, css_token_type type, + const css_token **token) +{ + css_token *t = &lexer->token; + + t->type = type; + + /* Calculate token data start pointer. We have to do this here as + * the inputstream's buffer may have moved under us. */ + if (lexer->escapeSeen) { + t->data.ptr = lexer->unescapedTokenData->data; + } else { + size_t clen; + uintptr_t ptr = parserutils_inputstream_peek( + lexer->input, 0, &clen); + + assert(type == CSS_TOKEN_EOF || + (ptr != PARSERUTILS_INPUTSTREAM_EOF && + ptr != PARSERUTILS_INPUTSTREAM_OOD)); + + t->data.ptr = (type == CSS_TOKEN_EOF) ? NULL : (uint8_t *) ptr; + } + + switch (type) { + case CSS_TOKEN_ATKEYWORD: + /* Strip the '@' from the front */ + t->data.ptr += 1; + t->data.len -= 1; + break; + case CSS_TOKEN_STRING: + /* Strip the leading quote */ + t->data.ptr += 1; + t->data.len -= 1; + + /* Strip the trailing quote */ + t->data.len -= 1; + break; + case CSS_TOKEN_HASH: + /* Strip the '#' from the front */ + t->data.ptr += 1; + t->data.len -= 1; + break; + case CSS_TOKEN_PERCENTAGE: + /* Strip the '%' from the end */ + t->data.len -= 1; + break; + case CSS_TOKEN_DIMENSION: + /** \todo Do we want to separate the value from the units? 
*/ + break; + case CSS_TOKEN_URI: + /* Strip the "url(" from the start */ + t->data.ptr += sizeof("url(") - 1; + t->data.len -= sizeof("url(") - 1; + + /* Strip any leading whitespace */ + while (isSpace(t->data.ptr[0])) { + t->data.ptr++; + t->data.len--; + } + + /* Strip any leading quote */ + if (t->data.ptr[0] == '"' || t->data.ptr[0] == '\'') { + t->data.ptr += 1; + t->data.len -= 1; + } + + /* Strip the trailing ')' */ + t->data.len -= 1; + + /* Strip any trailing whitespace */ + while (isSpace(t->data.ptr[t->data.len - 1])) { + t->data.len--; + } + + /* Strip any trailing quote */ + if (t->data.ptr[t->data.len - 1] == '"' || + t->data.ptr[t->data.len - 1] == '\'') { + t->data.len -= 1; + } + break; + case CSS_TOKEN_UNICODE_RANGE: + /* Remove "U+" from the start */ + t->data.ptr += sizeof("U+") - 1; + t->data.len -= sizeof("U+") - 1; + break; + case CSS_TOKEN_COMMENT: + /* Strip the leading '/' and '*' */ + t->data.ptr += sizeof("/*") - 1; + t->data.len -= sizeof("/*") - 1; + + /* Strip the trailing '*' and '/' */ + t->data.len -= sizeof("*/") - 1; + break; + case CSS_TOKEN_FUNCTION: + /* Strip the trailing '(' */ + t->data.len -= 1; + break; + default: + break; + } + + *token = t; + + /* Reset the lexer's state */ + lexer->state = sSTART; + lexer->substate = 0; + + return CSS_OK; +} + +/****************************************************************************** + * State machine components * + ******************************************************************************/ + +css_error AtKeyword(css_lexer *lexer, const css_token **token) +{ + uintptr_t cptr; + uint8_t c; + size_t clen; + css_error error; + enum { Initial = 0, Escape = 1, NMChar = 2 }; + + /* ATKEYWORD = '@' ident + * + * The '@' has been consumed. 
/**
 * Lex the remainder of a token that began with '-'.
 *
 * Emits CDC ("-->"), hands over to IdentOrFunction() for an identifier or
 * function name, or emits the lone '-' as a CHAR token.
 *
 * The function is resumable: before any read that may run out of data the
 * current substate is stored in the lexer, and CSS_NEEDDATA is returned
 * without emitting a token.
 *
 * \param lexer  The lexer instance
 * \param token  Pointer to location to receive pointer to token
 * \return CSS_OK on success, CSS_NEEDDATA if more input is required,
 *         appropriate error otherwise
 */
css_error CDCOrIdentOrFunction(css_lexer *lexer, const css_token **token)
{
	css_token *t = &lexer->token;
	uintptr_t cptr;
	uint8_t c;
	size_t clen;
	css_error error;
	/* Substates; Initial must be 0, the value a fresh state starts with */
	enum { Initial = 0, Escape = 1, Gt = 2 };

	/* CDC = "-->"
	 * IDENT = [-]? nmstart nmchar*
	 * FUNCTION = [-]? nmstart nmchar* '('
	 *
	 * The first dash has been consumed. Thus, we must consume the next
	 * character in the stream. If it's a dash, then we're dealing with
	 * CDC. Otherwise, we're dealing with IDENT/FUNCTION.
	 */

	switch (lexer->substate) {
	case Initial:
		/* Look at the character following what has been read so far */
		cptr = parserutils_inputstream_peek(lexer->input,
				lexer->bytesReadForToken, &clen);
		if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
			return CSS_NEEDDATA;

		if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
			/* We can only match char with what we've read so far */
			return emitToken(lexer, CSS_TOKEN_CHAR, token);
		}

		c = *((uint8_t *) cptr);

		if (c != '-' && !startNMStart(c)) {
			/* Can only be CHAR */
			return emitToken(lexer, CSS_TOKEN_CHAR, token);
		}


		/* A backslash is not appended to the token data here; it is
		 * only counted as read (below) before the escape is consumed */
		if (c != '\\') {
			APPEND(lexer, cptr, clen);
		}

		if (c != '-') {
			if (c == '\\') {
				lexer->bytesReadForToken += clen;
				goto escape;
			}

			/* Plain nmstart: the rest is an IDENT or FUNCTION */
			lexer->state = sIDENT;
			lexer->substate = 0;
			return IdentOrFunction(lexer, token);
		}

		/* Fall through */
	case Gt:
		/* Record the substate first so that a CSS_NEEDDATA return
		 * below resumes here rather than at Initial */
		lexer->substate = Gt;

		/* Ok, so we're dealing with CDC. Expect a '>' */
		cptr = parserutils_inputstream_peek(lexer->input,
				lexer->bytesReadForToken, &clen);
		if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
			return CSS_NEEDDATA;

		if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
			/* CHAR is the only match here */
			/* Remove the '-' we read above */
			lexer->bytesReadForToken -= 1;
			t->data.len -= 1;
			return emitToken(lexer, CSS_TOKEN_CHAR, token);
		}

		c = *((uint8_t *) cptr);

		if (c == '>') {
			APPEND(lexer, cptr, clen);

			t->type = CSS_TOKEN_CDC;
		} else {
			/* Remove the '-' we read above */
			lexer->bytesReadForToken -= 1;
			t->data.len -= 1;
			t->type = CSS_TOKEN_CHAR;
		}
		/* The type chosen above is emitted after the switch */
		break;

	case Escape:
	escape:
		/* Record the substate so that an interrupted escape is
		 * retried on the next call */
		lexer->substate = Escape;
		error = consumeEscape(lexer, false);
		if (error != CSS_OK) {
			if (error == CSS_EOF || error == CSS_INVALID) {
				/* Rewind the '\\' */
				lexer->bytesReadForToken -= 1;

				return emitToken(lexer, CSS_TOKEN_CHAR, token);
			}

			return error;
		}

		/* Valid escape: continue as an IDENT or FUNCTION */
		lexer->state = sIDENT;
		lexer->substate = 0;
		return IdentOrFunction(lexer, token);
	}

	return emitToken(lexer, t->type, token);
}
**token) +{ + css_token *t = &lexer->token; + uintptr_t cptr; + uint8_t c; + size_t clen; + enum { Initial = 0, Dash1 = 1, Dash2 = 2 }; + + /* CDO = "