author    John Mark Bell <jmb@netsurf-browser.org>    2008-05-01 16:36:27 +0000
committer John Mark Bell <jmb@netsurf-browser.org>    2008-05-01 16:36:27 +0000
commit    72c39e3522c5781d1e7dc8abad77d96141c5d49b (patch)
tree      e16497caaa0bf20771ef34787de02fc95e5993bf /src
Import beginnings of a CSS parsing library.

Currently comprises a lexer.

svn path=/trunk/libcss/; revision=4112
Diffstat (limited to 'src')
-rw-r--r--  src/Makefile                    49
-rw-r--r--  src/charset/Makefile            46
-rw-r--r--  src/charset/detect.c           161
-rw-r--r--  src/charset/detect.h            24
-rw-r--r--  src/lex/Makefile                49
-rw-r--r--  src/lex/lex.c                 2116
-rw-r--r--  src/lex/lex.h                   67
-rw-r--r--  src/libcss.c                    50
-rw-r--r--  src/utils/Makefile              46
-rw-r--r--  src/utils/errors.c              80
-rw-r--r--  src/utils/parserutilserror.h    29
-rw-r--r--  src/utils/utils.h               28
12 files changed, 2745 insertions(+), 0 deletions(-)
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..f56a87f
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,49 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Manipulate include paths
+CFLAGS := $(CFLAGS) -I$(d)
+
+# Sources
+SRCS_$(d) := libcss.c
+
+# Append to sources for component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/Makefile b/src/charset/Makefile
new file mode 100644
index 0000000..dda58c1
--- /dev/null
+++ b/src/charset/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Sources
+SRCS_$(d) := detect.c
+
+# Append to sources for component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/detect.c b/src/charset/detect.c
new file mode 100644
index 0000000..11ee699
--- /dev/null
+++ b/src/charset/detect.c
@@ -0,0 +1,161 @@
+/*
+ * This file is part of LibCSS.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include <parserutils/charset/mibenum.h>
+
+#include "charset/detect.h"
+#include "utils/utils.h"
+
+static parserutils_error css_charset_read_bom_or_charset(const uint8_t *data,
+ size_t len, uint16_t *mibenum);
+
+/**
+ * Extract a charset from a chunk of data
+ *
+ * \param data Pointer to buffer containing data
+ * \param len Buffer length
+ * \param mibenum Pointer to location containing current MIB enum
+ * \param source Pointer to location containing current charset source
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ *
+ * ::mibenum and ::source will be updated on exit
+ *
+ * CSS 2.1 §4.4
+ */
+parserutils_error css_charset_extract(const uint8_t *data, size_t len,
+ uint16_t *mibenum, uint32_t *source)
+{
+	parserutils_error error;
+ uint16_t charset = 0;
+
+ if (data == NULL || mibenum == NULL || source == NULL)
+ return PARSERUTILS_BADPARM;
+
+ /* If the charset was dictated by the client, we've nothing to detect */
+ if (*source == CSS_CHARSET_DICTATED)
+ return PARSERUTILS_OK;
+
+ /* We need at least 4 bytes of data */
+ if (len < 4)
+ goto default_encoding;
+
+ /* Look for a BOM and/or @charset */
+ error = css_charset_read_bom_or_charset(data, len, &charset);
+ if (error != PARSERUTILS_OK)
+ return error;
+
+ if (charset != 0) {
+ *mibenum = charset;
+ *source = CSS_CHARSET_DOCUMENT;
+
+ return PARSERUTILS_OK;
+ }
+
+ /* If we've already got a charset from the linking mechanism or
+ * referring document, then we've nothing further to do */
+ if (*source != CSS_CHARSET_DEFAULT)
+ return PARSERUTILS_OK;
+
+ /* We've not yet found a charset, so use the default fallback */
+default_encoding:
+
+ charset = parserutils_charset_mibenum_from_name("UTF-8", SLEN("UTF-8"));
+
+ *mibenum = charset;
+ *source = CSS_CHARSET_DEFAULT;
+
+ return PARSERUTILS_OK;
+}
+
+
+/**
+ * Inspect the beginning of a buffer of data for the presence of a
+ * UTF Byte Order Mark and/or an @charset rule
+ *
+ * \param data Pointer to buffer containing data
+ * \param len Buffer length
+ * \param mibenum Pointer to location to receive MIB enum
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error css_charset_read_bom_or_charset(const uint8_t *data,
+ size_t len, uint16_t *mibenum)
+{
+ uint16_t charset = 0;
+
+ if (data == NULL)
+ return PARSERUTILS_BADPARM;
+
+ /* We require at least 4 bytes of data */
+ if (len < 4)
+ return PARSERUTILS_NEEDDATA;
+
+
+ /* Look for BOM */
+ if (data[0] == 0x00 && data[1] == 0x00 &&
+ data[2] == 0xFE && data[3] == 0xFF) {
+ charset = parserutils_charset_mibenum_from_name("UTF-32BE",
+ SLEN("UTF-32BE"));
+ } else if (data[0] == 0xFF && data[1] == 0xFE &&
+ data[2] == 0x00 && data[3] == 0x00) {
+ charset = parserutils_charset_mibenum_from_name("UTF-32LE",
+ SLEN("UTF-32LE"));
+ } else if (data[0] == 0xFE && data[1] == 0xFF) {
+ charset = parserutils_charset_mibenum_from_name("UTF-16BE",
+ SLEN("UTF-16BE"));
+ } else if (data[0] == 0xFF && data[1] == 0xFE) {
+ charset = parserutils_charset_mibenum_from_name("UTF-16LE",
+ SLEN("UTF-16LE"));
+ } else if (data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
+ charset = parserutils_charset_mibenum_from_name("UTF-8",
+ SLEN("UTF-8"));
+ }
+
+	/* BOM beats @charset.
+	 * UAs differ here, but none appear to match the spec.
+	 * The spec indicates that any @charset present in conjunction with a
+	 * BOM should match the BOM. In reality, UAs appear to take the BOM
+	 * as gospel and ignore any @charset rule. The W3C CSS validator
+	 * appears to do the same (at least, it doesn't complain about a
+	 * mismatch).
+	 */
+ if (charset != 0) {
+ *mibenum = charset;
+ return PARSERUTILS_OK;
+ }
+
+ /** \todo UTF-32 and UTF-16 @charset support */
+
+ /* Look for @charset, assuming ASCII-compatible source data */
+ if (len > 10 && strncmp((const char *) data, "@charset \"",
+ SLEN("@charset \"")) == 0) {
+ const uint8_t *end;
+
+ /* Look for "; at end of charset declaration */
+ for (end = data + 10; end < data + len; end++) {
+ if (*end == '"' && end < data + len - 1 &&
+ *(end + 1) == ';')
+ break;
+ }
+
+ if (end == data + len) {
+ /* Ran out of input */
+ return PARSERUTILS_NEEDDATA;
+ }
+
+ /* Convert to MIB enum */
+ charset = parserutils_charset_mibenum_from_name(
+ (char *) data + 10, end - data - 10);
+ }
+
+ *mibenum = charset;
+
+ return PARSERUTILS_OK;
+}
+
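
For illustration only (not part of this commit), css_charset_extract() might
be driven as below. The CSS_CHARSET_* source constants and PARSERUTILS_*
error values are the ones referenced by detect.c and are assumed to be
available via the headers it includes; the reporting function itself is
hypothetical.

#include <stdio.h>

#include "charset/detect.h"

/* Report the charset detected for a buffer of (possibly partial) CSS,
 * starting from "no prior charset information" (CSS_CHARSET_DEFAULT). */
static void report_charset(const uint8_t *css, size_t len)
{
	uint16_t mibenum = 0;
	uint32_t source = CSS_CHARSET_DEFAULT;
	parserutils_error err;

	err = css_charset_extract(css, len, &mibenum, &source);
	if (err != PARSERUTILS_OK) {
		printf("detection failed: %d\n", (int) err);
		return;
	}

	printf("MIBenum %u (%s)\n", (unsigned) mibenum,
			source == CSS_CHARSET_DOCUMENT ?
					"from BOM/@charset" :
					"default fallback");
}
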
diff --git a/src/charset/detect.h b/src/charset/detect.h
new file mode 100644
index 0000000..d907921
--- /dev/null
+++ b/src/charset/detect.h
@@ -0,0 +1,24 @@
+/*
+ * This file is part of LibCSS.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef css_charset_detect_h_
+#define css_charset_detect_h_
+
+#include <inttypes.h>
+
+#include <libcss/errors.h>
+#include <libcss/functypes.h>
+#include <libcss/types.h>
+
+#include <parserutils/errors.h>
+
+/* Extract a charset from a chunk of data */
+parserutils_error css_charset_extract(const uint8_t *data, size_t len,
+ uint16_t *mibenum, uint32_t *source);
+
+#endif
+
diff --git a/src/lex/Makefile b/src/lex/Makefile
new file mode 100644
index 0000000..73f8ecf
--- /dev/null
+++ b/src/lex/Makefile
@@ -0,0 +1,49 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Manipulate include paths
+CFLAGS := $(CFLAGS) -I$(d)
+
+# Sources
+SRCS_$(d) := lex.c
+
+# Append to sources for component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
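
Before the lexer implementation itself, a minimal sketch (not part of this
commit) of how it might be driven. The css_lexer_* calls, error codes and
css_token fields are those introduced below in src/lex/lex.c; the
realloc-style css_alloc wrapper matches the allocator contract inferred from
that file, while the creation and feeding of the parserutils inputstream are
assumed to happen elsewhere and are not shown.

#include <stdio.h>
#include <stdlib.h>

#include <parserutils/input/inputstream.h>

#include <libcss/errors.h>

#include "lex/lex.h"

/* Allocator matching the inferred css_alloc contract:
 * alloc(NULL, size, pw) allocates, alloc(ptr, 0, pw) frees. */
static void *my_alloc(void *ptr, size_t size, void *pw)
{
	(void) pw;

	if (size == 0) {
		free(ptr);
		return NULL;
	}

	return realloc(ptr, size);
}

/* Pull tokens until EOF; "input" must already contain stylesheet data */
static css_error dump_tokens(parserutils_inputstream *input)
{
	const css_token *token;
	css_lexer *lexer;
	css_error error;

	lexer = css_lexer_create(input, my_alloc, NULL);
	if (lexer == NULL)
		return CSS_NOMEM;

	do {
		error = css_lexer_get_token(lexer, &token);
		if (error == CSS_NEEDDATA) {
			/* A real client would feed more data into the
			 * inputstream and call again; omitted here. */
			break;
		}
		if (error != CSS_OK)
			break;

		printf("token type %d, %u bytes at line %u col %u\n",
				(int) token->type,
				(unsigned) token->data.len,
				(unsigned) token->line,
				(unsigned) token->col);
	} while (token->type != CSS_TOKEN_EOF);

	css_lexer_destroy(lexer);

	return error;
}
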
diff --git a/src/lex/lex.c b/src/lex/lex.c
new file mode 100644
index 0000000..4df6cea
--- /dev/null
+++ b/src/lex/lex.c
@@ -0,0 +1,2116 @@
+/*
+ * This file is part of LibCSS.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file CSS lexer
+ *
+ * See docs/Tokens for the production rules used by this lexer.
+ *
+ * See docs/Lexer for the inferred first characters for each token.
+ *
+ * See also the CSS3 Syntax module and CSS 2.1 §4.1.1 plus its errata
+ *
+ * The lexer assumes that all invalid Unicode codepoints have been converted
+ * to U+FFFD by the input stream.
+ *
+ * The lexer comprises a state machine, the top-level of which is derived from
+ * the First sets in docs/Lexer. Each top-level state may contain a number of
+ * sub states. These enable restarting of the parser.
+ */
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <parserutils/charset/utf8.h>
+#include <parserutils/input/inputstream.h>
+#include <parserutils/utils/buffer.h>
+
+#include <libcss/errors.h>
+
+#include "lex/lex.h"
+#include "utils/parserutilserror.h"
+
+/** \todo Optimisation -- we currently revisit a number of input characters
+ *	   (we call parserutils_inputstream_peek roughly 1.5 times per
+ *	   character in the input stream). Ideally, we would visit each
+ *	   character in the input exactly once. In reality, the upper bound
+ *	   is twice, as in some cases we must read one character beyond the
+ *	   end of a token's input to detect the end of the token.
+ *	   Resumability adds a little overhead here unless we become cleverer
+ *	   about restarting mid-escape-sequence: currently, we rewind to the
+ *	   start of the sequence and process the whole thing again.
+ */
+
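+/* Top-level lexer states. css_lexer_get_token() dispatches on these so that
+ * lexing can resume in the correct state machine function after a
+ * CSS_NEEDDATA return. */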
+enum {
+ sSTART = 0,
+ sATKEYWORD = 1,
+ sSTRING = 2,
+ sHASH = 3,
+ sNUMBER = 4,
+ sCDO = 5,
+ sCDC = 6,
+ sS = 7,
+ sCOMMENT = 8,
+ sMATCH = 9,
+ sURI = 10,
+ sIDENT = 11,
+ sESCAPEDIDENT = 12,
+ sURL = 13,
+ sUCR = 14
+};
+
+/**
+ * CSS lexer object
+ */
+struct css_lexer
+{
+ parserutils_inputstream *input; /**< Inputstream containing CSS */
+
+ size_t bytesReadForToken; /**< Total bytes read from the
+ * inputstream for the current token */
+
+ css_token token; /**< The current token */
+
+ bool escapeSeen; /**< Whether an escape sequence has
+ * been seen while processing the input
+ * for the current token */
+ parserutils_buffer *unescapedTokenData; /**< Buffer containing
+ * unescaped token data
+ * (used iff escapeSeen == true)
+ */
+
+ uint32_t state : 4, /**< Current state */
+ substate : 4; /**< Current substate */
+
+ struct {
+ uint8_t first; /**< First character read for token */
+ size_t origBytes; /**< Storage of current number of
+ * bytes read, for rewinding */
+ bool lastWasStar; /**< Whether the previous character
+ * was an asterisk */
+ bool lastWasCR; /**< Whether the previous character
+ * was CR */
+ size_t bytesForURL; /**< Input bytes read for "url(", for
+ * rewinding */
+ size_t dataLenForURL; /**< Output length for "url(", for
+ * rewinding */
+ int hexCount; /**< Counter for reading hex digits */
+ } context; /**< Context for the current state */
+
+ bool emit_comments; /**< Whether to emit comment tokens */
+
+ uint32_t currentCol; /**< Current column in source */
+ uint32_t currentLine; /**< Current line in source */
+
+ css_alloc alloc; /**< Memory (de)allocation function */
+ void *pw; /**< Pointer to client-specific data */
+};
+
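+/* Append a run of bytes to the current token, returning from the enclosing
+ * function on failure. Also advances the per-token byte count and the
+ * current column. */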
+#define APPEND(lexer, data, len) \
+do { \
+ css_error error; \
+ error = appendToTokenData((lexer), \
+ (const uint8_t*) (data), (len)); \
+ if (error != CSS_OK) \
+ return error; \
+ (lexer)->bytesReadForToken += (len); \
+ (lexer)->currentCol += (len); \
+} while (0)
+
+static inline css_error appendToTokenData(css_lexer *lexer,
+ const uint8_t *data, size_t len);
+static inline css_error emitToken(css_lexer *lexer, css_token_type type,
+ const css_token **token);
+
+static inline css_error AtKeyword(css_lexer *lexer, const css_token **token);
+static inline css_error CDCOrIdentOrFunction(css_lexer *lexer,
+ const css_token **token);
+static inline css_error CDO(css_lexer *lexer, const css_token **token);
+static inline css_error Comment(css_lexer *lexer, const css_token **token);
+static inline css_error EscapedIdentOrFunction(css_lexer *lexer,
+ const css_token **token);
+static inline css_error Hash(css_lexer *lexer, const css_token **token);
+static inline css_error IdentOrFunction(css_lexer *lexer,
+ const css_token **token);
+static inline css_error Match(css_lexer *lexer, const css_token **token);
+static inline css_error NumberOrPercentageOrDimension(css_lexer *lexer,
+ const css_token **token);
+static inline css_error S(css_lexer *lexer, const css_token **token);
+static inline css_error Start(css_lexer *lexer, const css_token **token);
+static inline css_error String(css_lexer *lexer, const css_token **token);
+static inline css_error URIOrUnicodeRangeOrIdentOrFunction(
+ css_lexer *lexer, const css_token **token);
+static inline css_error URI(css_lexer *lexer, const css_token **token);
+static inline css_error UnicodeRange(css_lexer *lexer, const css_token **token);
+
+static inline css_error consumeDigits(css_lexer *lexer);
+static inline css_error consumeEscape(css_lexer *lexer, bool nl);
+static inline css_error consumeNMChars(css_lexer *lexer);
+static inline css_error consumeString(css_lexer *lexer);
+static inline css_error consumeStringChars(css_lexer *lexer);
+static inline css_error consumeUnicode(css_lexer *lexer, uint32_t ucs);
+static inline css_error consumeURLChars(css_lexer *lexer);
+static inline css_error consumeWChars(css_lexer *lexer);
+
+static inline uint32_t charToHex(uint8_t c);
+static inline bool startNMChar(uint8_t c);
+static inline bool startNMStart(uint8_t c);
+static inline bool startStringChar(uint8_t c);
+static inline bool startURLChar(uint8_t c);
+static inline bool isDigit(uint8_t c);
+static inline bool isHex(uint8_t c);
+static inline bool isSpace(uint8_t c);
+
+/**
+ * Create a lexer instance
+ *
+ * \param input The inputstream to read from
+ * \param alloc Memory (de)allocation function
+ * \param pw Pointer to client-specific private data
+ * \return Pointer to instance, or NULL on memory exhaustion
+ */
+css_lexer *css_lexer_create(parserutils_inputstream *input,
+ css_alloc alloc, void *pw)
+{
+ css_lexer *lex;
+
+ if (input == NULL || alloc == NULL)
+ return NULL;
+
+ lex = alloc(NULL, sizeof(css_lexer), pw);
+ if (lex == NULL)
+ return NULL;
+
+ lex->input = input;
+ lex->bytesReadForToken = 0;
+ lex->token.type = CSS_TOKEN_EOF;
+ lex->token.data.ptr = NULL;
+ lex->token.data.len = 0;
+ lex->escapeSeen = false;
+ lex->unescapedTokenData = NULL;
+ lex->state = sSTART;
+ lex->substate = 0;
+ lex->emit_comments = false;
+ lex->currentCol = 1;
+ lex->currentLine = 1;
+ lex->alloc = alloc;
+ lex->pw = pw;
+
+ return lex;
+}
+
+/**
+ * Destroy a lexer instance
+ *
+ * \param lexer The instance to destroy
+ */
+void css_lexer_destroy(css_lexer *lexer)
+{
+ if (lexer == NULL)
+ return;
+
+ if (lexer->unescapedTokenData != NULL)
+ parserutils_buffer_destroy(lexer->unescapedTokenData);
+
+ lexer->alloc(lexer, 0, lexer->pw);
+}
+
+/**
+ * Configure a lexer instance
+ *
+ * \param lexer The lexer to configure
+ * \param type The option type to modify
+ * \param params Option-specific parameters
+ * \return CSS_OK on success, appropriate error otherwise
+ */
+css_error css_lexer_setopt(css_lexer *lexer, css_lexer_opttype type,
+ css_lexer_optparams *params)
+{
+ if (lexer == NULL || params == NULL)
+ return CSS_BADPARM;
+
+ switch (type) {
+ case CSS_LEXER_EMIT_COMMENTS:
+ lexer->emit_comments = params->emit_comments;
+ break;
+ default:
+ return CSS_BADPARM;
+ }
+
+ return CSS_OK;
+}
+
+/**
+ * Retrieve a token from a lexer
+ *
+ * \param lexer The lexer instance to read from
+ * \param token Pointer to location to receive pointer to token
+ * \return CSS_OK on success, appropriate error otherwise
+ */
+css_error css_lexer_get_token(css_lexer *lexer, const css_token **token)
+{
+ css_error error;
+
+ if (lexer == NULL || token == NULL)
+ return CSS_BADPARM;
+
+ switch (lexer->state)
+ {
+ case sSTART:
+ start:
+ return Start(lexer, token);
+ case sATKEYWORD:
+ return AtKeyword(lexer, token);
+ case sSTRING:
+ return String(lexer, token);
+ case sHASH:
+ return Hash(lexer, token);
+ case sNUMBER:
+ return NumberOrPercentageOrDimension(lexer, token);
+ case sCDO:
+ return CDO(lexer, token);
+ case sCDC:
+ return CDCOrIdentOrFunction(lexer, token);
+ case sS:
+ return S(lexer, token);
+ case sCOMMENT:
+ error = Comment(lexer, token);
+ if (!lexer->emit_comments && error == CSS_OK)
+ goto start;
+ return error;
+ case sMATCH:
+ return Match(lexer, token);
+ case sURI:
+ return URI(lexer, token);
+ case sIDENT:
+ return IdentOrFunction(lexer, token);
+ case sESCAPEDIDENT:
+ return EscapedIdentOrFunction(lexer, token);
+ case sURL:
+ return URI(lexer, token);
+ case sUCR:
+ return UnicodeRange(lexer, token);
+ }
+
+ /* Should never be reached */
+ assert(0);
+
+ return CSS_OK;
+}
+
+/******************************************************************************
+ * Utility routines *
+ ******************************************************************************/
+
+/**
+ * Append some data to the current token
+ *
+ * \param lexer The lexer instance
+ * \param data Pointer to data to append
+ * \param len Length, in bytes, of data
+ * \return CSS_OK on success, appropriate error otherwise
+ *
+ * This should not be called directly without good reason. Use the APPEND()
+ * macro instead.
+ */
+css_error appendToTokenData(css_lexer *lexer, const uint8_t *data, size_t len)
+{
+ css_token *token = &lexer->token;
+
+ if (lexer->escapeSeen) {
+ css_error error = css_error_from_parserutils_error(
+ parserutils_buffer_append(
+ lexer->unescapedTokenData, data, len));
+ if (error != CSS_OK)
+ return error;
+ }
+
+ token->data.len += len;
+
+ return CSS_OK;
+}
+
+/**
+ * Prepare a token for consumption and emit it to the client
+ *
+ * \param lexer The lexer instance
+ * \param type The type of token to emit
+ * \param token Pointer to location to receive pointer to token
+ * \return CSS_OK on success, appropriate error otherwise
+ */
+css_error emitToken(css_lexer *lexer, css_token_type type,
+ const css_token **token)
+{
+ css_token *t = &lexer->token;
+
+ t->type = type;
+
+ /* Calculate token data start pointer. We have to do this here as
+ * the inputstream's buffer may have moved under us. */
+ if (lexer->escapeSeen) {
+ t->data.ptr = lexer->unescapedTokenData->data;
+ } else {
+ size_t clen;
+ uintptr_t ptr = parserutils_inputstream_peek(
+ lexer->input, 0, &clen);
+
+ assert(type == CSS_TOKEN_EOF ||
+ (ptr != PARSERUTILS_INPUTSTREAM_EOF &&
+ ptr != PARSERUTILS_INPUTSTREAM_OOD));
+
+ t->data.ptr = (type == CSS_TOKEN_EOF) ? NULL : (uint8_t *) ptr;
+ }
+
+ switch (type) {
+ case CSS_TOKEN_ATKEYWORD:
+ /* Strip the '@' from the front */
+ t->data.ptr += 1;
+ t->data.len -= 1;
+ break;
+ case CSS_TOKEN_STRING:
+ /* Strip the leading quote */
+ t->data.ptr += 1;
+ t->data.len -= 1;
+
+ /* Strip the trailing quote */
+ t->data.len -= 1;
+ break;
+ case CSS_TOKEN_HASH:
+ /* Strip the '#' from the front */
+ t->data.ptr += 1;
+ t->data.len -= 1;
+ break;
+ case CSS_TOKEN_PERCENTAGE:
+ /* Strip the '%' from the end */
+ t->data.len -= 1;
+ break;
+ case CSS_TOKEN_DIMENSION:
+ /** \todo Do we want to separate the value from the units? */
+ break;
+ case CSS_TOKEN_URI:
+ /* Strip the "url(" from the start */
+ t->data.ptr += sizeof("url(") - 1;
+ t->data.len -= sizeof("url(") - 1;
+
+ /* Strip any leading whitespace */
+ while (isSpace(t->data.ptr[0])) {
+ t->data.ptr++;
+ t->data.len--;
+ }
+
+ /* Strip any leading quote */
+ if (t->data.ptr[0] == '"' || t->data.ptr[0] == '\'') {
+ t->data.ptr += 1;
+ t->data.len -= 1;
+ }
+
+ /* Strip the trailing ')' */
+ t->data.len -= 1;
+
+ /* Strip any trailing whitespace */
+ while (isSpace(t->data.ptr[t->data.len - 1])) {
+ t->data.len--;
+ }
+
+ /* Strip any trailing quote */
+ if (t->data.ptr[t->data.len - 1] == '"' ||
+ t->data.ptr[t->data.len - 1] == '\'') {
+ t->data.len -= 1;
+ }
+ break;
+ case CSS_TOKEN_UNICODE_RANGE:
+ /* Remove "U+" from the start */
+ t->data.ptr += sizeof("U+") - 1;
+ t->data.len -= sizeof("U+") - 1;
+ break;
+ case CSS_TOKEN_COMMENT:
+ /* Strip the leading '/' and '*' */
+ t->data.ptr += sizeof("/*") - 1;
+ t->data.len -= sizeof("/*") - 1;
+
+ /* Strip the trailing '*' and '/' */
+ t->data.len -= sizeof("*/") - 1;
+ break;
+ case CSS_TOKEN_FUNCTION:
+ /* Strip the trailing '(' */
+ t->data.len -= 1;
+ break;
+ default:
+ break;
+ }
+
+ *token = t;
+
+ /* Reset the lexer's state */
+ lexer->state = sSTART;
+ lexer->substate = 0;
+
+ return CSS_OK;
+}
+
+/******************************************************************************
+ * State machine components *
+ ******************************************************************************/
+
+css_error AtKeyword(css_lexer *lexer, const css_token **token)
+{
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+ css_error error;
+ enum { Initial = 0, Escape = 1, NMChar = 2 };
+
+ /* ATKEYWORD = '@' ident
+ *
+ * The '@' has been consumed.
+ */
+
+ switch (lexer->substate) {
+ case Initial:
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+
+ c = *((uint8_t *) cptr);
+
+ if (!startNMStart(c))
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+
+ if (c != '\\') {
+ APPEND(lexer, cptr, clen);
+ } else {
+ lexer->bytesReadForToken += clen;
+ goto escape;
+ }
+
+ /* Fall through */
+ case NMChar:
+ nmchar:
+ lexer->substate = NMChar;
+ error = consumeNMChars(lexer);
+ if (error != CSS_OK)
+ return error;
+ break;
+
+ case Escape:
+ escape:
+ lexer->substate = Escape;
+ error = consumeEscape(lexer, false);
+ if (error != CSS_OK) {
+ if (error == CSS_EOF || error == CSS_INVALID) {
+ /* Rewind the '\\' */
+ lexer->bytesReadForToken -= 1;
+
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ }
+
+ return error;
+ }
+
+ goto nmchar;
+ }
+
+ return emitToken(lexer, CSS_TOKEN_ATKEYWORD, token);
+}
+
+css_error CDCOrIdentOrFunction(css_lexer *lexer, const css_token **token)
+{
+ css_token *t = &lexer->token;
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+ css_error error;
+ enum { Initial = 0, Escape = 1, Gt = 2 };
+
+ /* CDC = "-->"
+ * IDENT = [-]? nmstart nmchar*
+ * FUNCTION = [-]? nmstart nmchar* '('
+ *
+ * The first dash has been consumed. Thus, we must consume the next
+ * character in the stream. If it's a dash, then we're dealing with
+ * CDC. Otherwise, we're dealing with IDENT/FUNCTION.
+ */
+
+ switch (lexer->substate) {
+ case Initial:
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /* We can only match char with what we've read so far */
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ if (c != '-' && !startNMStart(c)) {
+ /* Can only be CHAR */
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ }
+
+
+ if (c != '\\') {
+ APPEND(lexer, cptr, clen);
+ }
+
+ if (c != '-') {
+ if (c == '\\') {
+ lexer->bytesReadForToken += clen;
+ goto escape;
+ }
+
+ lexer->state = sIDENT;
+ lexer->substate = 0;
+ return IdentOrFunction(lexer, token);
+ }
+
+ /* Fall through */
+ case Gt:
+ lexer->substate = Gt;
+
+ /* Ok, so we're dealing with CDC. Expect a '>' */
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /* CHAR is the only match here */
+ /* Remove the '-' we read above */
+ lexer->bytesReadForToken -= 1;
+ t->data.len -= 1;
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ if (c == '>') {
+ APPEND(lexer, cptr, clen);
+
+ t->type = CSS_TOKEN_CDC;
+ } else {
+ /* Remove the '-' we read above */
+ lexer->bytesReadForToken -= 1;
+ t->data.len -= 1;
+ t->type = CSS_TOKEN_CHAR;
+ }
+ break;
+
+ case Escape:
+ escape:
+ lexer->substate = Escape;
+ error = consumeEscape(lexer, false);
+ if (error != CSS_OK) {
+ if (error == CSS_EOF || error == CSS_INVALID) {
+ /* Rewind the '\\' */
+ lexer->bytesReadForToken -= 1;
+
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ }
+
+ return error;
+ }
+
+ lexer->state = sIDENT;
+ lexer->substate = 0;
+ return IdentOrFunction(lexer, token);
+ }
+
+ return emitToken(lexer, t->type, token);
+}
+
+css_error CDO(css_lexer *lexer, const css_token **token)
+{
+ css_token *t = &lexer->token;
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+ enum { Initial = 0, Dash1 = 1, Dash2 = 2 };
+
+ /* CDO = "<!--"
+ *
+ * The '<' has been consumed
+ */
+
+ switch (lexer->substate) {
+ case Initial:
+ /* Expect '!' */
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /* CHAR is the only match here */
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ if (c == '!') {
+ APPEND(lexer, cptr, clen);
+ } else {
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ }
+
+ /* Fall Through */
+ case Dash1:
+ lexer->substate = Dash1;
+
+ /* Expect '-' */
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /* CHAR is the only match here */
+ /* Remove the '!' we read above */
+ lexer->bytesReadForToken -= 1;
+ t->data.len -= 1;
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ if (c == '-') {
+ APPEND(lexer, cptr, clen);
+ } else {
+ /* Remove the '!' we read above */
+ lexer->bytesReadForToken -= 1;
+ t->data.len -= 1;
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ }
+
+ /* Fall through */
+ case Dash2:
+ lexer->substate = Dash2;
+
+ /* Expect '-' */
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /* CHAR is the only match here */
+ /* Remove the '-' and the '!' we read above */
+ lexer->bytesReadForToken -= 2;
+ t->data.len -= 2;
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ if (c == '-') {
+ APPEND(lexer, cptr, clen);
+ } else {
+ /* Remove the '-' and the '!' we read above */
+ lexer->bytesReadForToken -= 2;
+ t->data.len -= 2;
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ }
+ }
+
+ return emitToken(lexer, CSS_TOKEN_CDO, token);
+}
+
+css_error Comment(css_lexer *lexer, const css_token **token)
+{
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+ enum { Initial = 0, InComment = 1 };
+
+ /* COMMENT = '/' '*' [^*]* '*'+ ([^/] [^*]* '*'+)* '/'
+ *
+ * The '/' has been consumed.
+ */
+ switch (lexer->substate) {
+ case Initial:
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+
+ c = *((uint8_t *) cptr);
+
+ if (c != '*')
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+
+ APPEND(lexer, cptr, clen);
+
+ /* Fall through */
+ case InComment:
+ lexer->substate = InComment;
+
+ while (1) {
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /* As per unterminated strings,
+ * we ignore unterminated comments. */
+ return emitToken(lexer, CSS_TOKEN_EOF, token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ APPEND(lexer, cptr, clen);
+
+ if (lexer->context.lastWasStar && c == '/')
+ break;
+
+ lexer->context.lastWasStar = (c == '*');
+
+ if (c == '\n' || c == '\f') {
+ lexer->currentCol = 1;
+ lexer->currentLine++;
+ }
+
+ if (lexer->context.lastWasCR && c != '\n') {
+ lexer->currentCol = 1;
+ lexer->currentLine++;
+ }
+ lexer->context.lastWasCR = (c == '\r');
+ }
+ }
+
+ return emitToken(lexer, CSS_TOKEN_COMMENT, token);
+}
+
+css_error EscapedIdentOrFunction(css_lexer *lexer, const css_token **token)
+{
+ css_error error;
+
+ /* IDENT = ident = [-]? nmstart nmchar*
+ * FUNCTION = ident '(' = [-]? nmstart nmchar* '('
+ *
+ * In this case, nmstart is an escape sequence and no '-' is present.
+ *
+ * The '\\' has been consumed.
+ */
+
+ error = consumeEscape(lexer, false);
+ if (error != CSS_OK) {
+ if (error == CSS_EOF || error == CSS_INVALID) {
+ /* The '\\' is a token of its own */
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ }
+
+ return error;
+ }
+
+ lexer->state = sIDENT;
+ lexer->substate = 0;
+ return IdentOrFunction(lexer, token);
+}
+
+css_error Hash(css_lexer *lexer, const css_token **token)
+{
+ css_error error;
+
+ /* HASH = '#' name = '#' nmchar+
+ *
+ * The '#' has been consumed.
+ */
+
+ error = consumeNMChars(lexer);
+ if (error != CSS_OK)
+ return error;
+
+ /* Require at least one NMChar otherwise, we're just a raw '#' */
+ if (lexer->bytesReadForToken - lexer->context.origBytes > 0)
+ return emitToken(lexer, CSS_TOKEN_HASH, token);
+
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+}
+
+css_error IdentOrFunction(css_lexer *lexer, const css_token **token)
+{
+ css_token *t = &lexer->token;
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+ css_error error;
+ enum { Initial = 0, Bracket = 1 };
+
+ /* IDENT = ident = [-]? nmstart nmchar*
+ * FUNCTION = ident '(' = [-]? nmstart nmchar* '('
+ *
+ * The optional dash and nmstart are already consumed
+ */
+
+ switch (lexer->substate) {
+ case Initial:
+ /* Consume all subsequent nmchars (if any exist) */
+ error = consumeNMChars(lexer);
+ if (error != CSS_OK)
+ return error;
+
+ /* Fall through */
+ case Bracket:
+ lexer->substate = Bracket;
+
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /* IDENT, rather than CHAR */
+ return emitToken(lexer, CSS_TOKEN_IDENT, token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ if (c == '(') {
+ APPEND(lexer, cptr, clen);
+
+ t->type = CSS_TOKEN_FUNCTION;
+ } else {
+ t->type = CSS_TOKEN_IDENT;
+ }
+ }
+
+ return emitToken(lexer, t->type, token);
+}
+
+css_error Match(css_lexer *lexer, const css_token **token)
+{
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+ css_token_type type = CSS_TOKEN_EOF; /* GCC's braindead */
+
+ /* INCLUDES = "~="
+ * DASHMATCH = "|="
+ * PREFIXMATCH = "^="
+ * SUFFIXMATCH = "$="
+ * SUBSTRINGMATCH = "*="
+ *
+ * The first character has been consumed.
+ */
+
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+
+ c = *((uint8_t *) cptr);
+
+ if (c != '=')
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+
+ APPEND(lexer, cptr, clen);
+
+ switch (lexer->context.first) {
+ case '~':
+ type = CSS_TOKEN_INCLUDES;
+ break;
+ case '|':
+ type = CSS_TOKEN_DASHMATCH;
+ break;
+ case '^':
+ type = CSS_TOKEN_PREFIXMATCH;
+ break;
+ case '$':
+ type = CSS_TOKEN_SUFFIXMATCH;
+ break;
+ case '*':
+ type = CSS_TOKEN_SUBSTRINGMATCH;
+ break;
+ default:
+ assert(0);
+ }
+
+ return emitToken(lexer, type, token);
+}
+
+css_error NumberOrPercentageOrDimension(css_lexer *lexer,
+ const css_token **token)
+{
+ css_token *t = &lexer->token;
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+ css_error error;
+ enum { Initial = 0, Dot = 1, MoreDigits = 2,
+ Suffix = 3, NMChars = 4, Escape = 5 };
+
+ /* NUMBER = num = [0-9]+ | [0-9]* '.' [0-9]+
+ * PERCENTAGE = num '%'
+ * DIMENSION = num ident
+ *
+ * The first digit, or '.' has been consumed.
+ */
+
+ switch (lexer->substate) {
+ case Initial:
+ error = consumeDigits(lexer);
+ if (error != CSS_OK)
+ return error;
+
+ /* Fall through */
+ case Dot:
+ lexer->substate = Dot;
+
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ if (t->data.len == 1 && lexer->context.first == '.')
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ else
+ return emitToken(lexer, CSS_TOKEN_NUMBER,
+ token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ /* Bail if we've not got a '.' or we've seen one already */
+ if (c != '.' || lexer->context.first == '.')
+ goto suffix;
+
+ /* Save the token length up to the end of the digits */
+ lexer->context.origBytes = lexer->bytesReadForToken;
+
+ /* Append the '.' to the token */
+ APPEND(lexer, cptr, clen);
+
+ /* Fall through */
+ case MoreDigits:
+ lexer->substate = MoreDigits;
+
+ error = consumeDigits(lexer);
+ if (error != CSS_OK)
+ return error;
+
+ if (lexer->bytesReadForToken - lexer->context.origBytes == 1) {
+ /* No digits after dot => dot isn't part of number */
+ lexer->bytesReadForToken -= 1;
+ t->data.len -= 1;
+ }
+
+ /* Fall through */
+ case Suffix:
+ suffix:
+ lexer->substate = Suffix;
+
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ if (t->data.len == 1 && lexer->context.first == '.')
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ else
+ return emitToken(lexer, CSS_TOKEN_NUMBER,
+ token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ /* A solitary '.' is a CHAR, not numeric */
+ if (t->data.len == 1 && lexer->context.first == '.')
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+
+ if (c == '%') {
+ APPEND(lexer, cptr, clen);
+
+ return emitToken(lexer, CSS_TOKEN_PERCENTAGE, token);
+ } else if (!startNMStart(c)) {
+ return emitToken(lexer, CSS_TOKEN_NUMBER, token);
+ }
+
+ if (c != '\\') {
+ APPEND(lexer, cptr, clen);
+ } else {
+ lexer->bytesReadForToken += clen;
+ goto escape;
+ }
+
+ /* Fall through */
+ case NMChars:
+ nmchars:
+ lexer->substate = NMChars;
+
+ error = consumeNMChars(lexer);
+ if (error != CSS_OK)
+ return error;
+
+ break;
+ case Escape:
+ escape:
+ lexer->substate = Escape;
+
+ error = consumeEscape(lexer, false);
+ if (error != CSS_OK) {
+ if (error == CSS_EOF || error == CSS_INVALID) {
+ /* Rewind the '\\' */
+ lexer->bytesReadForToken -= 1;
+
+ /* This can only be a number */
+ return emitToken(lexer,
+ CSS_TOKEN_NUMBER, token);
+ }
+
+ return error;
+ }
+
+ goto nmchars;
+ }
+
+ return emitToken(lexer, CSS_TOKEN_DIMENSION, token);
+}
+
+css_error S(css_lexer *lexer, const css_token **token)
+{
+ css_error error;
+
+ /* S = wc*
+ *
+ * The first whitespace character has been consumed.
+ */
+
+ error = consumeWChars(lexer);
+ if (error != CSS_OK)
+ return error;
+
+ return emitToken(lexer, CSS_TOKEN_S, token);
+}
+
+css_error Start(css_lexer *lexer, const css_token **token)
+{
+ css_token *t = &lexer->token;
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+ css_error error;
+
+start:
+
+ /* Advance past the input read for the previous token */
+ if (lexer->bytesReadForToken > 0) {
+ parserutils_inputstream_advance(
+ lexer->input, lexer->bytesReadForToken);
+ lexer->bytesReadForToken = 0;
+ }
+
+ /* Reset in preparation for the next token */
+ t->type = CSS_TOKEN_EOF;
+ t->data.ptr = NULL;
+ t->data.len = 0;
+ t->col = lexer->currentCol;
+ t->line = lexer->currentLine;
+ lexer->escapeSeen = false;
+ if (lexer->unescapedTokenData != NULL)
+ lexer->unescapedTokenData->length = 0;
+
+ cptr = parserutils_inputstream_peek(lexer->input, 0, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return emitToken(lexer, CSS_TOKEN_EOF, token);
+
+ APPEND(lexer, cptr, clen);
+
+ c = *((uint8_t *) cptr);
+
+ if (clen > 1 || c >= 0x80) {
+ lexer->state = sIDENT;
+ lexer->substate = 0;
+ return IdentOrFunction(lexer, token);
+ }
+
+ switch (c) {
+ case '@':
+ lexer->state = sATKEYWORD;
+ lexer->substate = 0;
+ return AtKeyword(lexer, token);
+ case '"': case '\'':
+ lexer->state = sSTRING;
+ lexer->substate = 0;
+ lexer->context.first = c;
+ return String(lexer, token);
+ case '#':
+ lexer->state = sHASH;
+ lexer->substate = 0;
+ lexer->context.origBytes = lexer->bytesReadForToken;
+ return Hash(lexer, token);
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ case '.':
+ lexer->state = sNUMBER;
+ lexer->substate = 0;
+ lexer->context.first = c;
+ return NumberOrPercentageOrDimension(lexer, token);
+ case '<':
+ lexer->state = sCDO;
+ lexer->substate = 0;
+ return CDO(lexer, token);
+ case '-':
+ lexer->state = sCDC;
+ lexer->substate = 0;
+ return CDCOrIdentOrFunction(lexer, token);
+ case ' ': case '\t': case '\r': case '\n': case '\f':
+ lexer->state = sS;
+ lexer->substate = 0;
+ if (c == '\n' || c == '\f') {
+ lexer->currentCol = 1;
+ lexer->currentLine++;
+ }
+ lexer->context.lastWasCR = (c == '\r');
+ return S(lexer, token);
+ case '/':
+ lexer->state = sCOMMENT;
+ lexer->substate = 0;
+ lexer->context.lastWasStar = false;
+ lexer->context.lastWasCR = false;
+ error = Comment(lexer, token);
+ if (!lexer->emit_comments && error == CSS_OK)
+ goto start;
+ return error;
+ case '~': case '|': case '^': case '$': case '*':
+ lexer->state = sMATCH;
+ lexer->substate = 0;
+ lexer->context.first = c;
+ return Match(lexer, token);
+ case 'u': case 'U':
+ lexer->state = sURI;
+ lexer->substate = 0;
+ return URIOrUnicodeRangeOrIdentOrFunction(lexer, token);
+ case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+ case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
+ case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
+ case 's': case 't': /* 'u'*/ case 'v': case 'w': case 'x':
+ case 'y': case 'z':
+ case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+ case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
+ case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+ case 'S': case 'T': /* 'U'*/ case 'V': case 'W': case 'X':
+ case 'Y': case 'Z':
+ case '_':
+ lexer->state = sIDENT;
+ lexer->substate = 0;
+ return IdentOrFunction(lexer, token);
+ case '\\':
+ lexer->state = sESCAPEDIDENT;
+ lexer->substate = 0;
+ return EscapedIdentOrFunction(lexer, token);
+ default:
+ return emitToken(lexer, CSS_TOKEN_CHAR, token);
+ }
+}
+
+css_error String(css_lexer *lexer, const css_token **token)
+{
+ css_error error;
+
+ /* STRING = string
+ *
+ * The open quote has been consumed.
+ */
+
+ error = consumeString(lexer);
+ if (error != CSS_OK && error != CSS_EOF)
+ return error;
+
+ return emitToken(lexer,
+ error == CSS_EOF ? CSS_TOKEN_EOF : CSS_TOKEN_STRING,
+ token);
+}
+
+css_error URIOrUnicodeRangeOrIdentOrFunction(css_lexer *lexer,
+ const css_token **token)
+{
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+
+ /* URI = "url(" w (string | urlchar*) w ')'
+ * UNICODE-RANGE = [Uu] '+' [0-9a-fA-F?]{1,6}(-[0-9a-fA-F]{1,6})?
+ * IDENT = ident = [-]? nmstart nmchar*
+ * FUNCTION = ident '(' = [-]? nmstart nmchar* '('
+ *
+ * The 'u' (or 'U') has been consumed.
+ */
+
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /* IDENT, rather than CHAR */
+ return emitToken(lexer, CSS_TOKEN_IDENT, token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ if (c == 'r' || c == 'R') {
+ APPEND(lexer, cptr, clen);
+
+ lexer->state = sURL;
+ lexer->substate = 0;
+ return URI(lexer, token);
+ } else if (c == '+') {
+ APPEND(lexer, cptr, clen);
+
+ lexer->state = sUCR;
+ lexer->substate = 0;
+ lexer->context.hexCount = 0;
+ return UnicodeRange(lexer, token);
+ }
+
+ /* Can only be IDENT or FUNCTION. Reprocess current character */
+ lexer->state = sIDENT;
+ lexer->substate = 0;
+ return IdentOrFunction(lexer, token);
+}
+
+css_error URI(css_lexer *lexer, const css_token **token)
+{
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+ css_error error;
+ enum { Initial = 0, LParen = 1, W1 = 2, Quote = 3,
+ URL = 4, W2 = 5, RParen = 6, String = 7 };
+
+ /* URI = "url(" w (string | urlchar*) w ')'
+ *
+ * 'u' and 'r' have been consumed.
+ */
+
+ switch (lexer->substate) {
+ case Initial:
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /* IDENT */
+ return emitToken(lexer, CSS_TOKEN_IDENT, token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ if (c == 'l' || c == 'L') {
+ APPEND(lexer, cptr, clen);
+ } else {
+ lexer->state = sIDENT;
+ lexer->substate = 0;
+ return IdentOrFunction(lexer, token);
+ }
+
+ /* Fall through */
+ case LParen:
+ lexer->substate = LParen;
+
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return emitToken(lexer, CSS_TOKEN_IDENT, token);
+
+ c = *((uint8_t *) cptr);
+
+ if (c == '(') {
+ APPEND(lexer, cptr, clen);
+ } else {
+ lexer->state = sIDENT;
+ lexer->substate = 0;
+ return IdentOrFunction(lexer, token);
+ }
+
+ /* Save the number of input bytes read for "url(" */
+ lexer->context.bytesForURL = lexer->bytesReadForToken;
+ /* And the length of the token data at the same point */
+ lexer->context.dataLenForURL = lexer->token.data.len;
+
+ lexer->context.lastWasCR = false;
+
+ /* Fall through */
+ case W1:
+ lexer->substate = W1;
+
+ error = consumeWChars(lexer);
+ if (error != CSS_OK)
+ return error;
+
+ /* Fall through */
+ case Quote:
+ lexer->substate = Quote;
+
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /* Rewind to "url(" */
+ lexer->bytesReadForToken = lexer->context.bytesForURL;
+ lexer->token.data.len = lexer->context.dataLenForURL;
+ return emitToken(lexer, CSS_TOKEN_FUNCTION, token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ if (c == '"' || c == '\'') {
+ APPEND(lexer, cptr, clen);
+
+ lexer->context.first = c;
+
+ goto string;
+ }
+
+ /* Potential minor optimisation: If string is more common,
+ * then fall through to that state and branch for the URL
+ * state. Need to investigate a reasonably large corpus of
+ * real-world data to determine if this is worthwhile. */
+
+ /* Fall through */
+ case URL:
+ lexer->substate = URL;
+
+ error = consumeURLChars(lexer);
+ if (error != CSS_OK)
+ return error;
+
+ lexer->context.lastWasCR = false;
+
+ /* Fall through */
+ case W2:
+ w2:
+ lexer->substate = W2;
+
+ error = consumeWChars(lexer);
+ if (error != CSS_OK)
+ return error;
+
+ /* Fall through */
+ case RParen:
+ lexer->substate = RParen;
+
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /* Rewind to "url(" */
+ lexer->bytesReadForToken = lexer->context.bytesForURL;
+ lexer->token.data.len = lexer->context.dataLenForURL;
+ return emitToken(lexer, CSS_TOKEN_FUNCTION, token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ if (c != ')') {
+ /* Rewind to "url(" */
+ lexer->bytesReadForToken = lexer->context.bytesForURL;
+ lexer->token.data.len = lexer->context.dataLenForURL;
+ return emitToken(lexer, CSS_TOKEN_FUNCTION, token);
+ }
+
+ APPEND(lexer, cptr, clen);
+ break;
+ case String:
+ string:
+ lexer->substate = String;
+
+ error = consumeString(lexer);
+ if (error != CSS_OK && error != CSS_EOF)
+ return error;
+
+ /* EOF gets handled in RParen */
+
+ lexer->context.lastWasCR = false;
+
+ goto w2;
+ }
+
+ return emitToken(lexer, CSS_TOKEN_URI, token);
+}
+
+css_error UnicodeRange(css_lexer *lexer, const css_token **token)
+{
+ css_token *t = &lexer->token;
+ uintptr_t cptr = PARSERUTILS_INPUTSTREAM_OOD; /* GCC: shush */
+ uint8_t c = 0; /* GCC: shush */
+ size_t clen;
+ enum { Initial = 0, MoreDigits = 1 };
+
+ /* UNICODE-RANGE = [Uu] '+' [0-9a-fA-F?]{1,6}(-[0-9a-fA-F]{1,6})?
+ *
+ * "U+" has been consumed.
+ */
+
+ switch (lexer->substate) {
+ case Initial:
+ /* Attempt to consume 6 hex digits (or question marks) */
+ for (; lexer->context.hexCount < 6; lexer->context.hexCount++) {
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ if (lexer->context.hexCount == 0) {
+ /* Remove '+' */
+ lexer->bytesReadForToken -= 1;
+ t->data.len -= 1;
+ /* u == IDENT */
+ return emitToken(lexer,
+ CSS_TOKEN_IDENT, token);
+ } else {
+ return emitToken(lexer,
+ CSS_TOKEN_UNICODE_RANGE, token);
+ }
+ }
+
+ c = *((uint8_t *) cptr);
+
+ if (isHex(c) || c == '?') {
+ APPEND(lexer, cptr, clen);
+ } else {
+ break;
+ }
+ }
+
+ if (lexer->context.hexCount == 0) {
+ /* We didn't consume any valid Unicode Range digits */
+ /* Remove the '+' */
+ lexer->bytesReadForToken -= 1;
+ t->data.len -= 1;
+ /* 'u' == IDENT */
+ return emitToken(lexer, CSS_TOKEN_IDENT, token);
+ }
+
+ if (lexer->context.hexCount == 6) {
+ /* Consumed 6 valid characters. Look for '-' */
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return emitToken(lexer,
+ CSS_TOKEN_UNICODE_RANGE, token);
+
+ c = *((uint8_t *) cptr);
+ }
+
+ /* If we've got a '-', then we may have a
+ * second range component */
+ if (c != '-') {
+ /* Reached the end of the range */
+ return emitToken(lexer, CSS_TOKEN_UNICODE_RANGE, token);
+ }
+
+ APPEND(lexer, cptr, clen);
+
+ /* Reset count for next set of digits */
+ lexer->context.hexCount = 0;
+
+ /* Fall through */
+ case MoreDigits:
+ lexer->substate = MoreDigits;
+
+ /* Consume up to 6 hex digits */
+ for (; lexer->context.hexCount < 6; lexer->context.hexCount++) {
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ if (lexer->context.hexCount == 0) {
+ /* Remove '-' */
+ lexer->bytesReadForToken -= 1;
+ t->data.len -= 1;
+ }
+
+ return emitToken(lexer,
+ CSS_TOKEN_UNICODE_RANGE, token);
+ }
+
+ c = *((uint8_t *) cptr);
+
+ if (isHex(c)) {
+ APPEND(lexer, cptr, clen);
+ } else {
+ break;
+ }
+ }
+
+ if (lexer->context.hexCount == 0) {
+ /* No hex digits consumed. Remove '-' */
+ lexer->bytesReadForToken -= 1;
+ t->data.len -= 1;
+ }
+ }
+
+ return emitToken(lexer, CSS_TOKEN_UNICODE_RANGE, token);
+}
+
+/******************************************************************************
+ * Input consumers *
+ ******************************************************************************/
+
+css_error consumeDigits(css_lexer *lexer)
+{
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+
+ /* digit = [0-9] */
+
+ /* Consume all digits */
+ do {
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return CSS_OK;
+
+ c = *((uint8_t *) cptr);
+
+ if (isDigit(c)) {
+ APPEND(lexer, cptr, clen);
+ }
+ } while (isDigit(c));
+
+ return CSS_OK;
+}
+
+css_error consumeEscape(css_lexer *lexer, bool nl)
+{
+ uintptr_t cptr, sptr;
+ uint8_t c;
+ size_t clen, slen;
+ css_error error;
+
+ /* escape = unicode | '\' [^\n\r\f0-9a-fA-F]
+ *
+ * The '\' has been consumed.
+ */
+
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return CSS_EOF;
+
+ c = *((uint8_t *) cptr);
+
+ if (!nl && (c == '\n' || c == '\r' || c == '\f')) {
+ /* These are not permitted */
+ return CSS_INVALID;
+ }
+
+ /* Create unescaped buffer, if it doesn't already exist */
+ if (lexer->unescapedTokenData == NULL) {
+ lexer->unescapedTokenData =
+ parserutils_buffer_create(lexer->alloc, lexer->pw);
+ if (lexer->unescapedTokenData == NULL)
+ return CSS_NOMEM;
+ }
+
+ /* If this is the first escaped character we've seen for this token,
+ * we must copy the characters we've read to the unescaped buffer */
+ if (!lexer->escapeSeen) {
+ if (lexer->bytesReadForToken > 1) {
+ sptr = parserutils_inputstream_peek(
+ lexer->input, 0, &slen);
+
+ assert(sptr != PARSERUTILS_INPUTSTREAM_EOF &&
+ sptr != PARSERUTILS_INPUTSTREAM_OOD);
+
+ /* -1 to skip '\\' */
+ error = css_error_from_parserutils_error(
+ parserutils_buffer_append(
+ lexer->unescapedTokenData,
+ (const uint8_t *) sptr,
+ lexer->bytesReadForToken - 1));
+ if (error != CSS_OK)
+ return error;
+ }
+
+ lexer->token.data.len = lexer->bytesReadForToken - 1;
+ lexer->escapeSeen = true;
+ }
+
+ if (isHex(c)) {
+ lexer->bytesReadForToken += clen;
+
+ error = consumeUnicode(lexer, charToHex(c));
+ if (error != CSS_OK) {
+ /* Rewind for next time */
+ lexer->bytesReadForToken -= clen;
+ }
+
+ return error;
+ }
+
+ /* If we're handling escaped newlines, convert CR(LF)? to LF */
+ if (nl && c == '\r') {
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken + clen, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ c = '\n';
+ APPEND(lexer, &c, 1);
+
+ lexer->currentCol = 1;
+ lexer->currentLine++;
+
+ return CSS_OK;
+ }
+
+ c = *((uint8_t *) cptr);
+
+ if (c == '\n') {
+ APPEND(lexer, &c, 1);
+ /* And skip the '\r' in the input */
+ lexer->bytesReadForToken += clen;
+
+ lexer->currentCol = 1;
+ lexer->currentLine++;
+
+ return CSS_OK;
+ }
+ } else if (nl && (c == '\n' || c == '\f')) {
+ /* APPEND will increment this appropriately */
+ lexer->currentCol = 0;
+ lexer->currentLine++;
+ } else if (c != '\n' && c != '\r' && c != '\f') {
+ lexer->currentCol++;
+ }
+
+ /* Append the unescaped character */
+ APPEND(lexer, cptr, clen);
+
+ return CSS_OK;
+}
+
+css_error consumeNMChars(css_lexer *lexer)
+{
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+ css_error error;
+
+ /* nmchar = [a-zA-Z] | '-' | '_' | nonascii | escape */
+
+ do {
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return CSS_OK;
+
+ c = *((uint8_t *) cptr);
+
+ if (startNMChar(c) && c != '\\') {
+ APPEND(lexer, cptr, clen);
+ }
+
+ if (c == '\\') {
+ lexer->bytesReadForToken += clen;
+
+ error = consumeEscape(lexer, false);
+ if (error != CSS_OK) {
+ /* Rewind '\\', so we do the
+ * right thing next time */
+ lexer->bytesReadForToken -= clen;
+
+ /* Convert either EOF or INVALID into OK.
+ * This will cause the caller to believe that
+ * all NMChars in the sequence have been
+ * processed (and thus proceed to the next
+ * state). Eventually, the '\\' will be output
+ * as a CHAR. */
+ if (error == CSS_EOF || error == CSS_INVALID)
+ return CSS_OK;
+
+ return error;
+ }
+ }
+ } while (startNMChar(c));
+
+ return CSS_OK;
+}
+
+css_error consumeString(css_lexer *lexer)
+{
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+ uint8_t quote = lexer->context.first;
+ uint8_t permittedquote = (quote == '"') ? '\'' : '"';
+ css_error error;
+
+ /* string = '"' (stringchar | "'")* '"' | "'" (stringchar | '"')* "'"
+ *
+ * The open quote has been consumed.
+ */
+
+ do {
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return CSS_EOF;
+
+ c = *((uint8_t *) cptr);
+
+ if (c == permittedquote) {
+ APPEND(lexer, cptr, clen);
+ } else if (startStringChar(c)) {
+ error = consumeStringChars(lexer);
+ if (error != CSS_OK)
+ return error;
+ } else if (c != quote) {
+ /* Invalid character in string -- skip */
+ lexer->bytesReadForToken += clen;
+ }
+ } while(c != quote);
+
+ /* Append closing quote to token data */
+ APPEND(lexer, cptr, clen);
+
+ return CSS_OK;
+}
+
+css_error consumeStringChars(css_lexer *lexer)
+{
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+ css_error error;
+
+ /* stringchar = urlchar | ' ' | ')' | '\' nl */
+
+ do {
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return CSS_OK;
+
+ c = *((uint8_t *) cptr);
+
+ if (startStringChar(c) && c != '\\') {
+ APPEND(lexer, cptr, clen);
+ }
+
+ if (c == '\\') {
+ lexer->bytesReadForToken += clen;
+
+ error = consumeEscape(lexer, true);
+ if (error != CSS_OK) {
+ /* Rewind '\\', so we do the
+ * right thing next time. */
+ lexer->bytesReadForToken -= clen;
+
+ /* Convert EOF to OK. This causes the caller
+ * to believe that all StringChars have been
+ * processed. Eventually, the '\\' will be
+ * output as a CHAR. */
+ if (error == CSS_EOF)
+ return CSS_OK;
+
+ return error;
+ }
+ }
+ } while (startStringChar(c));
+
+ return CSS_OK;
+
+}
+
+css_error consumeUnicode(css_lexer *lexer, uint32_t ucs)
+{
+ uintptr_t cptr = PARSERUTILS_INPUTSTREAM_OOD; /* GCC: shush */
+ uint8_t c = 0;
+ size_t clen;
+ uint8_t utf8[6];
+ uint8_t *utf8ptr = utf8;
+ size_t utf8len = sizeof(utf8);
+ size_t bytesReadInit = lexer->bytesReadForToken;
+ int count;
+ parserutils_error error;
+
+ /* unicode = '\' [0-9a-fA-F]{1,6} wc?
+ *
+ * The '\' and the first digit have been consumed.
+ */
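+	/* For example, "\0026" (or "\26 " followed by a space) denotes
+	 * U+0026 AMPERSAND: up to six hex digits are read in total, then at
+	 * most one trailing whitespace character is skipped. */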
+
+ /* Attempt to consume a further five hex digits */
+ for (count = 0; count < 5; count++) {
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
+ /* Rewind what we've read */
+ lexer->bytesReadForToken = bytesReadInit;
+ return CSS_NEEDDATA;
+ }
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ break;
+
+ c = *((uint8_t *) cptr);
+
+ if (isHex(c)) {
+ lexer->bytesReadForToken += clen;
+
+ ucs = (ucs << 4) | charToHex(c);
+ } else {
+ break;
+ }
+ }
+
+ /* Convert our UCS4 character to UTF-8 */
+ error = parserutils_charset_utf8_from_ucs4(ucs, &utf8ptr, &utf8len);
+ assert(error == PARSERUTILS_OK);
+
+ /* Append it to the token data (unescaped buffer already set up) */
+ /* We can't use the APPEND() macro here as we want to rewind correctly
+ * on error. Additionally, lexer->bytesReadForToken has already been
+ * advanced */
+ error = appendToTokenData(lexer, (const uint8_t *) utf8,
+ sizeof(utf8) - utf8len);
+ if (error != CSS_OK) {
+ /* Rewind what we've read */
+ lexer->bytesReadForToken = bytesReadInit;
+ return error;
+ }
+
+ /* Finally, attempt to skip a whitespace character */
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return CSS_OK;
+
+ if (isSpace(c)) {
+ lexer->bytesReadForToken += clen;
+ }
+
+ /* +2 for '\' and first digit */
+ lexer->currentCol += lexer->bytesReadForToken - bytesReadInit + 2;
+
+ return CSS_OK;
+}
+
+css_error consumeURLChars(css_lexer *lexer)
+{
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+ css_error error;
+
+ /* urlchar = [\t!#-&(*-~] | nonascii | escape */
+
+ do {
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return CSS_OK;
+
+ c = *((uint8_t *) cptr);
+
+ if (startURLChar(c) && c != '\\') {
+ APPEND(lexer, cptr, clen);
+ }
+
+ if (c == '\\') {
+ lexer->bytesReadForToken += clen;
+
+ error = consumeEscape(lexer, false);
+ if (error != CSS_OK) {
+ /* Rewind '\\', so we do the
+ * right thing next time */
+ lexer->bytesReadForToken -= clen;
+
+ /* Convert either EOF or INVALID into OK.
+ * This will cause the caller to believe that
+ * all URLChars in the sequence have been
+ * processed (and thus proceed to the next
+ * state). Eventually, the '\\' will be output
+ * as a CHAR. */
+ if (error == CSS_EOF || error == CSS_INVALID)
+ return CSS_OK;
+
+ return error;
+ }
+ }
+ } while (startURLChar(c));
+
+ return CSS_OK;
+}
+
+css_error consumeWChars(css_lexer *lexer)
+{
+ uintptr_t cptr;
+ uint8_t c;
+ size_t clen;
+
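+	/* Newline accounting: '\n' and '\f' advance the line count directly,
+	 * while '\r' is remembered in lastWasCR so that a CRLF pair counts
+	 * as a single line break and a bare CR still counts as one. */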
+ do {
+ cptr = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &clen);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return CSS_NEEDDATA;
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF)
+ return CSS_OK;
+
+ c = *((uint8_t *) cptr);
+
+ if (isSpace(c)) {
+ APPEND(lexer, cptr, clen);
+ }
+
+ if (c == '\n' || c == '\f') {
+ lexer->currentCol = 1;
+ lexer->currentLine++;
+ }
+
+ if (lexer->context.lastWasCR && c != '\n') {
+ lexer->currentCol = 1;
+ lexer->currentLine++;
+ }
+ lexer->context.lastWasCR = (c == '\r');
+ } while (isSpace(c));
+
+ if (lexer->context.lastWasCR) {
+ lexer->currentCol = 1;
+ lexer->currentLine++;
+ }
+
+ return CSS_OK;
+}
+
+/******************************************************************************
+ * More utility routines *
+ ******************************************************************************/
+
+uint32_t charToHex(uint8_t c)
+{
+ switch (c) {
+ case 'a': case 'A':
+ return 0xa;
+ case 'b': case 'B':
+ return 0xb;
+ case 'c': case 'C':
+ return 0xc;
+ case 'd': case 'D':
+ return 0xd;
+ case 'e': case 'E':
+ return 0xe;
+ case 'f': case 'F':
+ return 0xf;
+ case '0':
+ return 0x0;
+ case '1':
+ return 0x1;
+ case '2':
+ return 0x2;
+ case '3':
+ return 0x3;
+ case '4':
+ return 0x4;
+ case '5':
+ return 0x5;
+ case '6':
+ return 0x6;
+ case '7':
+ return 0x7;
+ case '8':
+ return 0x8;
+ case '9':
+ return 0x9;
+ }
+
+ return 0;
+}
+
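+/* Note that the start* predicates below deliberately accept '\\' as well,
+ * so that the consume* loops above treat it as the start of an escape
+ * sequence and defer to consumeEscape(). */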
+bool startNMChar(uint8_t c)
+{
+ return c == '_' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
+ ('0' <= c && c <= '9') || c == '-' || c >= 0x80 || c == '\\';
+}
+
+bool startNMStart(uint8_t c)
+{
+ return c == '_' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
+ c >= 0x80 || c == '\\';
+}
+
+bool startStringChar(uint8_t c)
+{
+ return startURLChar(c) || c == ' ' || c == ')';
+}
+
+bool startURLChar(uint8_t c)
+{
+ return c == '\t' || c == '!' || ('#' <= c && c <= '&') || c == '(' ||
+ ('*' <= c && c <= '~') || c >= 0x80 || c == '\\';
+}
+
+bool isDigit(uint8_t c)
+{
+ return '0' <= c && c <= '9';
+}
+
+bool isHex(uint8_t c)
+{
+ return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
+}
+
+bool isSpace(uint8_t c)
+{
+ return c == ' ' || c == '\r' || c == '\n' || c == '\f' || c == '\t';
+}
+
diff --git a/src/lex/lex.h b/src/lex/lex.h
new file mode 100644
index 0000000..150823e
--- /dev/null
+++ b/src/lex/lex.h
@@ -0,0 +1,67 @@
+/*
+ * This file is part of LibCSS.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef css_lex_lex_h_
+#define css_lex_lex_h_
+
+#include <libcss/functypes.h>
+#include <libcss/types.h>
+
+#include <parserutils/input/inputstream.h>
+
+typedef struct css_lexer css_lexer;
+
+/**
+ * Lexer option types
+ */
+typedef enum css_lexer_opttype {
+ CSS_LEXER_EMIT_COMMENTS,
+} css_lexer_opttype;
+
+/**
+ * Lexer option parameters
+ */
+typedef union css_lexer_optparams {
+ bool emit_comments;
+} css_lexer_optparams;
+
+/**
+ * Token type
+ */
+typedef enum css_token_type {
+ CSS_TOKEN_IDENT, CSS_TOKEN_ATKEYWORD, CSS_TOKEN_STRING,
+ CSS_TOKEN_HASH, CSS_TOKEN_NUMBER, CSS_TOKEN_PERCENTAGE,
+ CSS_TOKEN_DIMENSION, CSS_TOKEN_URI, CSS_TOKEN_UNICODE_RANGE,
+ CSS_TOKEN_CDO, CSS_TOKEN_CDC, CSS_TOKEN_S, CSS_TOKEN_COMMENT,
+ CSS_TOKEN_FUNCTION, CSS_TOKEN_INCLUDES, CSS_TOKEN_DASHMATCH,
+ CSS_TOKEN_PREFIXMATCH, CSS_TOKEN_SUFFIXMATCH, CSS_TOKEN_SUBSTRINGMATCH,
+ CSS_TOKEN_CHAR, CSS_TOKEN_EOF
+} css_token_type;
+
+/**
+ * Token object
+ */
+typedef struct css_token {
+ css_token_type type;
+
+ css_string data;
+
+ uint32_t col;
+ uint32_t line;
+} css_token;
+
+css_lexer *css_lexer_create(parserutils_inputstream *input,
+ css_alloc alloc, void *pw);
+void css_lexer_destroy(css_lexer *lexer);
+
+css_error css_lexer_setopt(css_lexer *lexer, css_lexer_opttype type,
+ css_lexer_optparams *params);
+
+css_error css_lexer_get_token(css_lexer *lexer, const css_token **token);
+
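+/* Illustrative usage sketch (non-normative). "stream" and "myalloc" are
+ * placeholders for a client-provided parserutils inputstream and
+ * allocation function, and are not part of this API:
+ *
+ *	css_lexer *lexer = css_lexer_create(stream, myalloc, NULL);
+ *	const css_token *token;
+ *	css_error error;
+ *
+ *	for (;;) {
+ *		error = css_lexer_get_token(lexer, &token);
+ *		if (error == CSS_NEEDDATA) {
+ *			// feed more data into the inputstream, then retry
+ *			continue;
+ *		}
+ *		if (error != CSS_OK || token->type == CSS_TOKEN_EOF)
+ *			break;
+ *		// process token->type, token->data, token->line, token->col
+ *	}
+ *
+ *	css_lexer_destroy(lexer);
+ */
+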
+#endif
+
diff --git a/src/libcss.c b/src/libcss.c
new file mode 100644
index 0000000..b5b99c9
--- /dev/null
+++ b/src/libcss.c
@@ -0,0 +1,50 @@
+/*
+ * This file is part of LibCSS.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <parserutils/parserutils.h>
+
+#include <libcss/libcss.h>
+
+#include "utils/parserutilserror.h"
+
+/**
+ * Initialise the CSS library for use.
+ *
+ * This _must_ be called before using any LibCSS functions
+ *
+ * \param aliases_file Pointer to name of file containing encoding alias data
+ * \param alloc Pointer to (de)allocation function
+ * \param pw Pointer to client-specific private data (may be NULL)
+ * \return CSS_OK on success, applicable error otherwise.
+ */
+css_error css_initialise(const char *aliases_file,
+ css_alloc alloc, void *pw)
+{
+ if (aliases_file == NULL || alloc == NULL)
+ return CSS_BADPARM;
+
+ return css_error_from_parserutils_error(
+ parserutils_initialise(aliases_file, alloc, pw));
+}
+
+/**
+ * Clean up after LibCSS
+ *
+ * \param alloc Pointer to (de)allocation function
+ * \param pw Pointer to client-specific private data (may be NULL)
+ * \return CSS_OK on success, applicable error otherwise.
+ */
+css_error css_finalise(css_alloc alloc, void *pw)
+{
+ if (alloc == NULL)
+ return CSS_BADPARM;
+
+ return css_error_from_parserutils_error(
+ parserutils_finalise(alloc, pw));
+}
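+
+/* Illustrative usage sketch (non-normative). "Aliases" is a placeholder
+ * path to the parserutils encoding alias file and "myalloc" a
+ * client-provided allocation function; neither is defined by this library:
+ *
+ *	css_error error = css_initialise("Aliases", myalloc, NULL);
+ *	if (error == CSS_OK) {
+ *		// ... use LibCSS ...
+ *		css_finalise(myalloc, NULL);
+ *	}
+ */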
+
diff --git a/src/utils/Makefile b/src/utils/Makefile
new file mode 100644
index 0000000..912590c
--- /dev/null
+++ b/src/utils/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Sources
+SRCS_$(d) := errors.c
+
+# Append to sources for component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/utils/errors.c b/src/utils/errors.c
new file mode 100644
index 0000000..ac7dd42
--- /dev/null
+++ b/src/utils/errors.c
@@ -0,0 +1,80 @@
+/*
+ * This file is part of LibCSS.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include <libcss/errors.h>
+
+/**
+ * Convert a LibCSS error code to a string
+ *
+ * \param error The error code to convert
+ * \return Pointer to string representation of error, or NULL if unknown.
+ */
+const char *css_error_to_string(css_error error)
+{
+ const char *result = NULL;
+
+ switch (error) {
+ case CSS_OK:
+ result = "No error";
+ break;
+ case CSS_NOMEM:
+ result = "Insufficient memory";
+ break;
+ case CSS_BADPARM:
+ result = "Bad parameter";
+ break;
+ case CSS_INVALID:
+ result = "Invalid input";
+ break;
+ case CSS_FILENOTFOUND:
+ result = "File not found";
+ break;
+ case CSS_NEEDDATA:
+ result = "Insufficient data";
+ break;
+ case CSS_BADCHARSET:
+ result = "BOM and @charset mismatch";
+ break;
+ case CSS_EOF:
+ result = "EOF encountered";
+ break;
+ }
+
+ return result;
+}
+
+/**
+ * Convert a string representation of an error name to a LibCSS error code
+ *
+ * \param str String containing error name
+ * \param len Length of string (bytes)
+ * \return LibCSS error code, or CSS_OK if unknown
+ */
+css_error css_error_from_string(const char *str, size_t len)
+{
+ if (strncmp(str, "CSS_OK", len) == 0) {
+ return CSS_OK;
+ } else if (strncmp(str, "CSS_NOMEM", len) == 0) {
+ return CSS_NOMEM;
+ } else if (strncmp(str, "CSS_BADPARM", len) == 0) {
+ return CSS_BADPARM;
+ } else if (strncmp(str, "CSS_INVALID", len) == 0) {
+ return CSS_INVALID;
+ } else if (strncmp(str, "CSS_FILENOTFOUND", len) == 0) {
+ return CSS_FILENOTFOUND;
+ } else if (strncmp(str, "CSS_NEEDDATA", len) == 0) {
+ return CSS_NEEDDATA;
+ } else if (strncmp(str, "CSS_BADCHARSET", len) == 0) {
+ return CSS_BADCHARSET;
+ } else if (strncmp(str, "CSS_EOF", len) == 0) {
+ return CSS_EOF;
+ }
+
+ return CSS_OK;
+}
diff --git a/src/utils/parserutilserror.h b/src/utils/parserutilserror.h
new file mode 100644
index 0000000..d2cffb6
--- /dev/null
+++ b/src/utils/parserutilserror.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of LibCSS.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef css_utils_parserutilserror_h_
+#define css_utils_parserutilserror_h_
+
+#include <parserutils/errors.h>
+
+#include <libcss/errors.h>
+
+/**
+ * Convert a ParserUtils error into a LibCSS error
+ *
+ * \param error The ParserUtils error to convert
+ * \return The corresponding LibCSS error
+ */
+static inline css_error css_error_from_parserutils_error(
+ parserutils_error error)
+{
+ /* Currently, there's a 1:1 mapping, so we've nothing to do */
+ return (css_error) error;
+}
+
+#endif
+
diff --git a/src/utils/utils.h b/src/utils/utils.h
new file mode 100644
index 0000000..ac19e59
--- /dev/null
+++ b/src/utils/utils.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of LibCSS.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef css_utils_h_
+#define css_utils_h_
+
+#ifndef max
+#define max(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#ifndef min
+#define min(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef SLEN
+/* Calculate length of a string constant */
+#define SLEN(s) (sizeof((s)) - 1) /* -1 for '\0' */
+#endif
+
+#ifndef UNUSED
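+/* Mark a variable or function parameter as deliberately unused */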
+#define UNUSED(x) ((x)=(x))
+#endif
+
+#endif