From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sat, 23 Jun 2007 22:40:25 +0000 Subject: Import hubbub -- an HTML parsing library. Plenty of work still to do (like tree generation ;) svn path=/trunk/hubbub/; revision=3359 --- src/tokeniser/Makefile | 53 ++ src/tokeniser/entities.c | 363 +++++++ src/tokeniser/entities.h | 25 + src/tokeniser/tokeniser.c | 2282 +++++++++++++++++++++++++++++++++++++++++++++ src/tokeniser/tokeniser.h | 71 ++ 5 files changed, 2794 insertions(+) create mode 100644 src/tokeniser/Makefile create mode 100644 src/tokeniser/entities.c create mode 100644 src/tokeniser/entities.h create mode 100644 src/tokeniser/tokeniser.c create mode 100644 src/tokeniser/tokeniser.h (limited to 'src/tokeniser') diff --git a/src/tokeniser/Makefile b/src/tokeniser/Makefile new file mode 100644 index 0000000..539625f --- /dev/null +++ b/src/tokeniser/Makefile @@ -0,0 +1,53 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Objects +OBJS = entities tokeniser + +.PHONY: clean debug distclean export release setup test + +# Targets +release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS))) + +debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS))) + +clean: + -@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS})) + +distclean: + +setup: + +export: + +test: + +# Pattern rules +../Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +../Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/tokeniser/entities.c b/src/tokeniser/entities.c new file mode 100644 index 0000000..8a9acf5 --- /dev/null +++ b/src/tokeniser/entities.c @@ -0,0 +1,363 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include "utils/dict.h" +#include "utils/utils.h" +#include "tokeniser/entities.h" + +typedef struct hubbub_entity hubbub_entity; + +static const struct hubbub_entity { + const char *name; + uint32_t ucs4; +} entities[] = { + { "AElig", 0x00C6 }, + { "Aacute", 0x00C1 }, + { "Acirc", 0x00C2 }, + { "Agrave", 0x00C0 }, + { "Alpha", 0x0391 }, + { "Aring", 0x00C5 }, + { "Atilde", 0x00C3 }, + { "Auml", 0x00C4 }, + { "Beta", 0x0392 }, + { "Ccedil", 0x00C7 }, + { "Chi", 0x03A7 }, + { "Dagger", 0x2021 }, + { "Delta", 0x0394 }, + { "ETH", 0x00D0 }, + { "Eacute", 0x00C9 }, + { "Ecirc", 0x00CA }, + { "Egrave", 0x00C8 }, + { "Epsilon", 0x0395 }, + { "Eta", 0x0397 }, + { "Euml", 0x00CB }, + { "Gamma", 0x0393 }, + { "Iacute", 0x00CD }, + { "Icirc", 0x00CE }, + { "Igrave", 0x00CC }, + { "Iota", 0x0399 }, + { "Iuml", 0x00CF }, + { "Kappa", 0x039A }, + { "Lambda", 0x039B }, + { "Mu", 0x039C }, + { "Ntilde", 0x00D1 }, + { "Nu", 0x039D }, + { "OElig", 0x0152 }, + { "Oacute", 0x00D3 }, + { "Ocirc", 0x00D4 }, + { "Ograve", 0x00D2 }, + { "Omega", 0x03A9 }, + { "Omicron", 0x039F }, + { "Oslash", 0x00D8 }, + { "Otilde", 0x00D5 }, + { "Ouml", 0x00D6 }, + { "Phi", 0x03A6 }, + { "Pi", 0x03A0 }, + { "Prime", 0x2033 }, + { "Psi", 0x03A8 }, + { "Rho", 0x03A1 }, + { "Scaron", 0x0160 }, + { "Sigma", 0x03A3 }, + { "THORN", 0x00DE }, + { "Tau", 0x03A4 }, + { "Theta", 0x0398 }, + { "Uacute", 0x00DA }, + { "Ucirc", 0x00DB }, + { "Ugrave", 0x00D9 }, + { "Upsilon", 0x03A5 }, + { "Uuml", 0x00DC }, + { "Xi", 0x039E }, + { "Yacute", 0x00DD }, + { "Yuml", 0x0178 }, + { "Zeta", 0x0396 }, + { "aacute", 0x00E1 }, + { "acirc", 0x00E2 }, + { "acute", 0x00B4 }, + { "aelig", 0x00E6 }, + { "agrave", 0x00E0 }, + { "alefsym", 0x2135 }, + { "alpha", 0x03B1 }, + { "amp", 0x0026 }, + { "AMP", 0x0026 }, + { "and", 0x2227 }, + { "ang", 0x2220 }, + { "apos", 0x0027 }, + { "aring", 0x00E5 }, + { "asymp", 0x2248 }, + { "atilde", 0x00E3 }, + { "auml", 0x00E4 }, + { "bdquo", 0x201E }, + { "beta", 0x03B2 }, + { "brvbar", 0x00A6 }, + { "bull", 0x2022 }, + { "cap", 0x2229 }, + { "ccedil", 0x00E7 }, + { "cedil", 0x00B8 }, + { "cent", 0x00A2 }, + { "chi", 0x03C7 }, + { "circ", 0x02C6 }, + { "clubs", 0x2663 }, + { "cong", 0x2245 }, + { "copy", 0x00A9 }, + { "COPY", 0x00A9 }, + { "crarr", 0x21B5 }, + { "cup", 0x222A }, + { "curren", 0x00A4 }, + { "dArr", 0x21D3 }, + { "dagger", 0x2020 }, + { "darr", 0x2193 }, + { "deg", 0x00B0 }, + { "delta", 0x03B4 }, + { "diams", 0x2666 }, + { "divide", 0x00F7 }, + { "eacute", 0x00E9 }, + { "ecirc", 0x00EA }, + { "egrave", 0x00E8 }, + { "empty", 0x2205 }, + { "emsp", 0x2003 }, + { "ensp", 0x2002 }, + { "epsilon", 0x03B5 }, + { "equiv", 0x2261 }, + { "eta", 0x03B7 }, + { "eth", 0x00F0 }, + { "euml", 0x00EB }, + { "euro", 0x20AC }, + { "exist", 0x2203 }, + { "fnof", 0x0192 }, + { "forall", 0x2200 }, + { "frac12", 0x00BD }, + { "frac14", 0x00BC }, + { "frac34", 0x00BE }, + { "frasl", 0x2044 }, + { "gamma", 0x03B3 }, + { "ge", 0x2265 }, + { "gt", 0x003E }, + { "GT", 0x003E }, + { "hArr", 0x21D4 }, + { "harr", 0x2194 }, + { "hearts", 0x2665 }, + { "hellip", 0x2026 }, + { "iacute", 0x00ED }, + { "icirc", 0x00EE }, + { "iexcl", 0x00A1 }, + { "igrave", 0x00EC }, + { "image", 0x2111 }, + { "infin", 0x221E }, + { "int", 0x222B }, + { "iota", 0x03B9 }, + { "iquest", 0x00BF }, + { "isin", 0x2208 }, + { "iuml", 0x00EF }, + { "kappa", 0x03BA }, + { "lArr", 0x21D0 }, + { "lambda", 0x03BB }, + { "lang", 
0x2329 }, + { "laquo", 0x00AB }, + { "larr", 0x2190 }, + { "lceil", 0x2308 }, + { "ldquo", 0x201C }, + { "le", 0x2264 }, + { "lfloor", 0x230A }, + { "lowast", 0x2217 }, + { "loz", 0x25CA }, + { "lrm", 0x200E }, + { "lsaquo", 0x2039 }, + { "lsquo", 0x2018 }, + { "lt", 0x003C }, + { "LT", 0x003C }, + { "macr", 0x00AF }, + { "mdash", 0x2014 }, + { "micro", 0x00B5 }, + { "middot", 0x00B7 }, + { "minus", 0x2212 }, + { "mu", 0x03BC }, + { "nabla", 0x2207 }, + { "nbsp", 0x00A0 }, + { "ndash", 0x2013 }, + { "ne", 0x2260 }, + { "ni", 0x220B }, + { "not", 0x00AC }, + { "notin", 0x2209 }, + { "nsub", 0x2284 }, + { "ntilde", 0x00F1 }, + { "nu", 0x03BD }, + { "oacute", 0x00F3 }, + { "ocirc", 0x00F4 }, + { "oelig", 0x0153 }, + { "ograve", 0x00F2 }, + { "oline", 0x203E }, + { "omega", 0x03C9 }, + { "omicron", 0x03BF }, + { "oplus", 0x2295 }, + { "or", 0x2228 }, + { "ordf", 0x00AA }, + { "ordm", 0x00BA }, + { "oslash", 0x00F8 }, + { "otilde", 0x00F5 }, + { "otimes", 0x2297 }, + { "ouml", 0x00F6 }, + { "para", 0x00B6 }, + { "part", 0x2202 }, + { "permil", 0x2030 }, + { "perp", 0x22A5 }, + { "phi", 0x03C6 }, + { "pi", 0x03C0 }, + { "piv", 0x03D6 }, + { "plusmn", 0x00B1 }, + { "pound", 0x00A3 }, + { "prime", 0x2032 }, + { "prod", 0x220F }, + { "prop", 0x221D }, + { "psi", 0x03C8 }, + { "quot", 0x0022 }, + { "QUOT", 0x0022 }, + { "rArr", 0x21D2 }, + { "radic", 0x221A }, + { "rang", 0x232A }, + { "raquo", 0x00BB }, + { "rarr", 0x2192 }, + { "rceil", 0x2309 }, + { "rdquo", 0x201D }, + { "real", 0x211C }, + { "reg", 0x00AE }, + { "REG", 0x00AE }, + { "rfloor", 0x230B }, + { "rho", 0x03C1 }, + { "rlm", 0x200F }, + { "rsaquo", 0x203A }, + { "rsquo", 0x2019 }, + { "sbquo", 0x201A }, + { "scaron", 0x0161 }, + { "sdot", 0x22C5 }, + { "sect", 0x00A7 }, + { "shy", 0x00AD }, + { "sigma", 0x03C3 }, + { "sigmaf", 0x03C2 }, + { "sim", 0x223C }, + { "spades", 0x2660 }, + { "sub", 0x2282 }, + { "sube", 0x2286 }, + { "sum", 0x2211 }, + { "sup", 0x2283 }, + { "sup1", 0x00B9 }, + { "sup2", 0x00B2 }, + { "sup3", 0x00B3 }, + { "supe", 0x2287 }, + { "szlig", 0x00DF }, + { "tau", 0x03C4 }, + { "there4", 0x2234 }, + { "theta", 0x03B8 }, + { "thetasym", 0x03D1 }, + { "thinsp", 0x2009 }, + { "thorn", 0x00FE }, + { "tilde", 0x02DC }, + { "times", 0x00D7 }, + { "trade", 0x2122 }, + { "uArr", 0x21D1 }, + { "uacute", 0x00FA }, + { "uarr", 0x2191 }, + { "ucirc", 0x00FB }, + { "ugrave", 0x00F9 }, + { "uml", 0x00A8 }, + { "upsih", 0x03D2 }, + { "upsilon", 0x03C5 }, + { "uuml", 0x00FC }, + { "weierp", 0x2118 }, + { "xi", 0x03BE }, + { "yacute", 0x00FD }, + { "yen", 0x00A5 }, + { "yuml", 0x00FF }, + { "zeta", 0x03B6 }, + { "zwj", 0x200D }, + { "zwnj", 0x200C }, +}; + +static hubbub_dict *dict; + +/** + * Create the entities dictionary + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_entities_create(hubbub_alloc alloc, void *pw) +{ + hubbub_error error; + size_t i; + + if (alloc == NULL) + return HUBBUB_BADPARM; + + dict = hubbub_dict_create(alloc, pw); + if (dict == NULL) + return HUBBUB_NOMEM; + + for (i = 0; i < sizeof(entities) / sizeof(entities[0]); i++) { + error = hubbub_dict_insert(dict, entities[i].name, + &entities[i]); + if (error != HUBBUB_OK) { + hubbub_dict_destroy(dict); + return error; + } + } + + return HUBBUB_OK; +} + +/** + * Destroy the entities dictionary + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be 
NULL)
+ */
+void hubbub_entities_destroy(hubbub_alloc alloc, void *pw)
+{
+	UNUSED(alloc);
+	UNUSED(pw);
+
+	hubbub_dict_destroy(dict);
+}
+
+/**
+ * Step-wise search for an entity in the dictionary
+ *
+ * \param c        Character to look for
+ * \param result   Pointer to location for result
+ * \param context  Pointer to location for search context
+ * \return HUBBUB_OK if key found,
+ *         HUBBUB_NEEDDATA if more steps are required,
+ *         HUBBUB_INVALID if nothing matches
+ *
+ * The value pointed to by ::context should be NULL for the first call.
+ * Thereafter, pass in the same value as returned by the previous call.
+ * The context is opaque to the caller and should not be inspected.
+ *
+ * The location pointed to by ::result will be set to U+FFFD unless a match
+ * is found.
+ */
+hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result,
+		void **context)
+{
+	const hubbub_entity *e;
+	hubbub_error error;
+
+	if (result == NULL || context == NULL)
+		return HUBBUB_BADPARM;
+
+	error = hubbub_dict_search_step(dict, c,
+			(const void **) (const void *) &e,
+			context);
+	if (error != HUBBUB_OK) {
+		*result = 0xFFFD;
+		return error;
+	}
+
+	*result = e->ucs4;
+
+	return HUBBUB_OK;
+}
diff --git a/src/tokeniser/entities.h b/src/tokeniser/entities.h
new file mode 100644
index 0000000..efd1987
--- /dev/null
+++ b/src/tokeniser/entities.h
@@ -0,0 +1,25 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#ifndef hubbub_tokeniser_entities_h_
+#define hubbub_tokeniser_entities_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+
+/* Create the entities dictionary */
+hubbub_error hubbub_entities_create(hubbub_alloc alloc, void *pw);
+/* Destroy the entities dictionary */
+void hubbub_entities_destroy(hubbub_alloc alloc, void *pw);
+
+/* Step-wise search for an entity in the dictionary */
+hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result,
+		void **context);
+
+#endif
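For context, the step-wise interface above is driven one byte at a time, threading the opaque context pointer between calls. Below is a minimal sketch of a caller. It is illustrative only and not part of this commit: my_alloc and the driver are hypothetical, the realloc-style hubbub_alloc contract is the one hubbub_tokeniser_create later in this import relies on, and it assumes the dictionary reports a completed key with HUBBUB_OK exactly as the doc-comment describes.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "tokeniser/entities.h"

/* realloc-style allocator, matching how this import calls hubbub_alloc */
static void *my_alloc(void *ptr, size_t size, void *pw)
{
	(void) pw;

	if (size == 0) {
		free(ptr);
		return NULL;
	}

	return realloc(ptr, size);
}

int main(void)
{
	const char name[] = "eacute";
	uint32_t ucs4 = 0xFFFD;
	void *ctx = NULL;		/* NULL for the first step */
	hubbub_error err = HUBBUB_NEEDDATA;
	const char *p;

	if (hubbub_entities_create(my_alloc, NULL) != HUBBUB_OK)
		return 1;

	/* Feed the entity name one byte at a time, reusing ctx between steps */
	for (p = name; *p != '\0' && err != HUBBUB_INVALID; p++)
		err = hubbub_entities_search_step((uint8_t) *p, &ucs4, &ctx);

	if (err == HUBBUB_OK)
		printf("&%s; => U+%04X\n", name, (unsigned) ucs4);

	hubbub_entities_destroy(my_alloc, NULL);

	return 0;
}

Feeding bytes incrementally is what lets the tokeniser below match entities directly against the input stream without first buffering a complete name.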
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
new file mode 100644
index 0000000..f8b6bb3
--- /dev/null
+++ b/src/tokeniser/tokeniser.c
@@ -0,0 +1,2282 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "utils/utils.h"
+
+#include "tokeniser/entities.h"
+#include "tokeniser/tokeniser.h"
+
+/**
+ * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
+ */
+static const uint32_t cp1252Table[32] = {
+	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
+	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
+};
+
+/**
+ * Tokeniser states
+ */
+typedef enum hubbub_tokeniser_state {
+	HUBBUB_TOKENISER_STATE_DATA,
+	HUBBUB_TOKENISER_STATE_ENTITY_DATA,
+	HUBBUB_TOKENISER_STATE_TAG_OPEN,
+	HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN,
+	HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH,
+	HUBBUB_TOKENISER_STATE_TAG_NAME,
+	HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME,
+	HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME,
+	HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME,
+	HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE,
+	HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ,
+	HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ,
+	HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ,
+	HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE,
+	HUBBUB_TOKENISER_STATE_BOGUS_COMMENT,
+	HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN,
+	HUBBUB_TOKENISER_STATE_COMMENT_START,
+	HUBBUB_TOKENISER_STATE_COMMENT,
+	HUBBUB_TOKENISER_STATE_COMMENT_DASH,
+	HUBBUB_TOKENISER_STATE_COMMENT_END,
+	HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE,
+	HUBBUB_TOKENISER_STATE_DOCTYPE,
+	HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME,
+	HUBBUB_TOKENISER_STATE_DOCTYPE_NAME,
+	HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME,
+	HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE,
+	HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY,
+	HUBBUB_TOKENISER_STATE_NAMED_ENTITY
+} hubbub_tokeniser_state;
+
+/**
+ * Context for tokeniser
+ */
+typedef struct hubbub_tokeniser_context {
+	hubbub_token_type current_tag_type;	/**< Type of current_tag */
+	hubbub_tag current_tag;			/**< Current tag */
+
+	hubbub_string current_comment;		/**< Current comment */
+
+	hubbub_doctype current_doctype;		/**< Current doctype */
+
+	hubbub_string current_chars;		/**< Pending characters */
+
+	hubbub_tokeniser_state prev_state;	/**< Previous state */
+
+	struct {
+		hubbub_string tag;		/**< Pending close tag */
+	} close_tag_match;
+
+	struct {
+		uint32_t count;			/**< Index into "DOCTYPE" */
+	} match_doctype;
+
+	struct {
+		hubbub_string str;		/**< Pending string */
+		uint8_t base;			/**< Base for numeric
+						 * entities */
+		uint32_t codepoint;		/**< UCS4 codepoint */
+		bool had_data;			/**< Whether we read
+						 * anything after &#(x)?
*/ + hubbub_tokeniser_state return_state; /**< State we were + * called from */ + bool complete; /**< Flag that entity + * matching completed */ + bool done_setup; /**< Flag that match setup + * has completed */ + void *context; /**< Context for named + * entity search */ + size_t prev_len; /**< Previous byte length + * of str */ + } match_entity; + + struct { + uint32_t line; /**< Current line of input */ + uint32_t col; /**< Current character in + * line */ + } position; +} hubbub_tokeniser_context; + +/** + * Tokeniser data structure + */ +struct hubbub_tokeniser { + hubbub_tokeniser_state state; /**< Current tokeniser state */ + hubbub_content_model content_model; /**< Current content + * model flag */ + + hubbub_inputstream *input; /**< Input stream */ + + const uint8_t *input_buffer; /**< Start of input stream's buffer */ + size_t input_buffer_len; /**< Length of input buffer */ + + hubbub_tokeniser_context context; /**< Tokeniser context */ + + hubbub_token_handler token_handler; + void *token_pw; + + hubbub_buffer_handler buffer_handler; + void *buffer_pw; + + hubbub_error_handler error_handler; + void *error_pw; + + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *alloc_pw; /**< Client private data */ +}; + +static bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_close_tag_open( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_close_tag_match( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_before_attribute_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_after_attribute_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_before_attribute_value( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_value_dq( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_value_sq( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_value_uq( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_entity_in_attribute_value( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_bogus_comment( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_markup_declaration_open( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment_start( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment_dash( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_match_doctype( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_before_doctype_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_doctype_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_after_doctype_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_bogus_doctype( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser); +static bool 
hubbub_tokeniser_handle_numbered_entity( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_named_entity( + hubbub_tokeniser *tokeniser); +static void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer, + size_t len, void *pw); +static void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, + hubbub_token *token); + +/** + * Create a hubbub tokeniser + * + * \param input Input stream instance + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to tokeniser instance, or NULL on failure + */ +hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input, + hubbub_alloc alloc, void *pw) +{ + hubbub_tokeniser *tok; + + if (input == NULL || alloc == NULL) + return NULL; + + tok = alloc(NULL, sizeof(hubbub_tokeniser), pw); + if (tok == NULL) + return NULL; + + tok->state = HUBBUB_TOKENISER_STATE_DATA; + tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA; + + tok->input = input; + tok->input_buffer = NULL; + tok->input_buffer_len = 0; + + tok->token_handler = NULL; + tok->token_pw = NULL; + + tok->buffer_handler = NULL; + tok->buffer_pw = NULL; + + tok->error_handler = NULL; + tok->error_pw = NULL; + + tok->alloc = alloc; + tok->alloc_pw = pw; + + if (hubbub_inputstream_register_movehandler(input, + hubbub_tokeniser_buffer_moved_handler, tok) != + HUBBUB_OK) { + alloc(tok, 0, pw); + return NULL; + } + + memset(&tok->context, 0, sizeof(hubbub_tokeniser_context)); + + return tok; +} + +/** + * Destroy a hubbub tokeniser + * + * \param tokeniser The tokeniser instance to destroy + */ +void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser) +{ + if (tokeniser == NULL) + return; + + hubbub_inputstream_deregister_movehandler(tokeniser->input, + hubbub_tokeniser_buffer_moved_handler, tokeniser); + + if (tokeniser->context.current_tag.attributes != NULL) { + tokeniser->alloc(tokeniser->context.current_tag.attributes, + 0, tokeniser->alloc_pw); + } + + tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw); +} + +/** + * Configure a hubbub tokeniser + * + * \param tokeniser The tokeniser instance to configure + * \param type The option type to set + * \param params Option-specific parameters + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser, + hubbub_tokeniser_opttype type, + hubbub_tokeniser_optparams *params) +{ + if (tokeniser == NULL || params == NULL) + return HUBBUB_BADPARM; + + switch (type) { + case HUBBUB_TOKENISER_TOKEN_HANDLER: + tokeniser->token_handler = params->token_handler.handler; + tokeniser->token_pw = params->token_handler.pw; + break; + case HUBBUB_TOKENISER_BUFFER_HANDLER: + tokeniser->buffer_handler = params->buffer_handler.handler; + tokeniser->buffer_pw = params->buffer_handler.pw; + tokeniser->buffer_handler(tokeniser->input_buffer, + tokeniser->input_buffer_len, + tokeniser->buffer_pw); + break; + case HUBBUB_TOKENISER_ERROR_HANDLER: + tokeniser->error_handler = params->error_handler.handler; + tokeniser->error_pw = params->error_handler.pw; + break; + case HUBBUB_TOKENISER_CONTENT_MODEL: + tokeniser->content_model = params->content_model.model; + break; + } + + return HUBBUB_OK; +} + +/** + * Process remaining data in the input stream + * + * \param tokeniser The tokeniser instance to invoke + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser) +{ + bool cont = true; + + if (tokeniser == NULL) + return 
HUBBUB_BADPARM; + + while (cont) { + switch (tokeniser->state) { + case HUBBUB_TOKENISER_STATE_DATA: + cont = hubbub_tokeniser_handle_data(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ENTITY_DATA: + cont = hubbub_tokeniser_handle_entity_data( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_TAG_OPEN: + cont = hubbub_tokeniser_handle_tag_open(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN: + cont = hubbub_tokeniser_handle_close_tag_open( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH: + cont = hubbub_tokeniser_handle_close_tag_match( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_TAG_NAME: + cont = hubbub_tokeniser_handle_tag_name(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME: + cont = hubbub_tokeniser_handle_before_attribute_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME: + cont = hubbub_tokeniser_handle_attribute_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME: + cont = hubbub_tokeniser_handle_after_attribute_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE: + cont = hubbub_tokeniser_handle_before_attribute_value( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ: + cont = hubbub_tokeniser_handle_attribute_value_dq( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ: + cont = hubbub_tokeniser_handle_attribute_value_sq( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ: + cont = hubbub_tokeniser_handle_attribute_value_uq( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE: + cont = hubbub_tokeniser_handle_entity_in_attribute_value( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BOGUS_COMMENT: + cont = hubbub_tokeniser_handle_bogus_comment( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN: + cont = hubbub_tokeniser_handle_markup_declaration_open( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT_START: + cont = hubbub_tokeniser_handle_comment_start( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT: + cont = hubbub_tokeniser_handle_comment(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT_DASH: + cont = hubbub_tokeniser_handle_comment_dash( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT_END: + cont = hubbub_tokeniser_handle_comment_end( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE: + cont = hubbub_tokeniser_handle_match_doctype( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_DOCTYPE: + cont = hubbub_tokeniser_handle_doctype(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME: + cont = hubbub_tokeniser_handle_before_doctype_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_DOCTYPE_NAME: + cont = hubbub_tokeniser_handle_doctype_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME: + cont = hubbub_tokeniser_handle_after_doctype_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE: + cont = hubbub_tokeniser_handle_bogus_doctype( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY: + cont = hubbub_tokeniser_handle_numbered_entity( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_NAMED_ENTITY: + cont = hubbub_tokeniser_handle_named_entity( + tokeniser); + break; + } + } + + return HUBBUB_OK; +} + +bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) +{ + hubbub_token token; + uint32_t c; + + /* Clear current characters */ + 
tokeniser->context.current_chars.data_off = 0; + tokeniser->context.current_chars.len = 0; + + while ((c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + if (c == '&' && (tokeniser->content_model == + HUBBUB_CONTENT_MODEL_PCDATA || + tokeniser->content_model == + HUBBUB_CONTENT_MODEL_RCDATA)) { + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_DATA; + /* Don't eat the '&'; it'll be handled by + * entity consumption */ + break; + } else if (c == '<' && tokeniser->content_model != + HUBBUB_CONTENT_MODEL_PLAINTEXT) { + if (tokeniser->context.current_chars.len > 0) { + /* Emit any pending characters */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, + &token); + } + + /* Buffer '<' */ + tokeniser->context.current_chars.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &tokeniser->context.current_chars.len); + + tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_OPEN; + hubbub_inputstream_advance(tokeniser->input); + break; + } else { + uint32_t pos; + size_t len; + + /* Accumulate characters into buffer */ + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + if (tokeniser->context.current_chars.len == 0) { + tokeniser->context.current_chars.data_off = + pos; + } + tokeniser->context.current_chars.len++; + + hubbub_inputstream_advance(tokeniser->input); + } + } + + if (tokeniser->state != HUBBUB_TOKENISER_STATE_TAG_OPEN && + tokeniser->context.current_chars.len > 0) { + /* Emit any pending characters */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->context.current_chars.data_off = 0; + tokeniser->context.current_chars.len = 0; + } + + if (c == HUBBUB_INPUTSTREAM_EOF) { + token.type = HUBBUB_TOKEN_EOF; + + hubbub_tokeniser_emit_token(tokeniser, &token); + } + + return (c != HUBBUB_INPUTSTREAM_EOF && c != HUBBUB_INPUTSTREAM_OOD); +} + +bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser) +{ + if (tokeniser->context.match_entity.complete == false) { + return hubbub_tokeniser_consume_entity(tokeniser); + } else { + hubbub_token token; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD || + c == HUBBUB_INPUTSTREAM_EOF) { + /* Should never happen */ + abort(); + } + + /* Emit character */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &token.data.character.len); + + hubbub_tokeniser_emit_token(tokeniser, &token); + + /* Reset for next time */ + tokeniser->context.match_entity.complete = false; + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t pos; + size_t len; + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || + tokeniser->content_model == + HUBBUB_CONTENT_MODEL_CDATA) { + if (c == '/') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + tokeniser->context.current_chars.len += len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN; + + hubbub_inputstream_advance(tokeniser->input); + } else { + hubbub_token token; + 
+ /* Emit '<' */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = + HUBBUB_TOKENISER_STATE_DATA; + } + } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) { + if (c == '!') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_chars.len += len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_chars.len += len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + hubbub_inputstream_lowercase(tokeniser->input); + + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_START_TAG; + + ctag->name.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &ctag->name.len); + ctag->n_attributes = 0; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if ('a' <= c && c <= 'z') { + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_START_TAG; + + ctag->name.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &ctag->name.len); + ctag->n_attributes = 0; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + tokeniser->context.current_chars.len += len; + + /* Emit "<>" */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = + HUBBUB_TOKENISER_STATE_DATA; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '?') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + tokeniser->context.current_chars.len += len; + + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len = len; + tokeniser->state = + HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + hubbub_inputstream_advance(tokeniser->input); + } else { + hubbub_token token; + + /* Emit '<' */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = + HUBBUB_TOKENISER_STATE_DATA; + } + } + + return true; +} + +bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser) +{ + if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || + tokeniser->content_model == + HUBBUB_CONTENT_MODEL_CDATA) { + tokeniser->context.close_tag_match.tag.len = 0; + tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH; + } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) { + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + uint32_t pos; + size_t len; + + if ('A' <= c && c <= 'Z') { + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_END_TAG; + ctag->name.data_off = pos; + ctag->name.len = len; + ctag->n_attributes = 0; + + tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if ('a' <= c && c <= 'z') { + pos = 
hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+
+			tokeniser->context.current_tag_type =
+					HUBBUB_TOKEN_END_TAG;
+			ctag->name.data_off = pos;
+			ctag->name.len = len;
+			ctag->n_attributes = 0;
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (c == '>') {
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+			hubbub_token token;
+
+			/* Emit "</" */
+			token.type = HUBBUB_TOKEN_CHARACTER;
+			token.data.character =
+					tokeniser->context.current_chars;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		} else if (c != HUBBUB_INPUTSTREAM_OOD) {
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+
+			tokeniser->context.current_comment.data_off = pos;
+			tokeniser->context.current_comment.len = len;
+
+			tokeniser->state =
+					HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else {
+			/* Out of data */
+			return false;
+		}
+	}
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = 0;
+
+	while (ctx->close_tag_match.tag.len < ctag->name.len &&
+			(c = hubbub_inputstream_peek(tokeniser->input)) !=
+			HUBBUB_INPUTSTREAM_EOF &&
+			c != HUBBUB_INPUTSTREAM_OOD) {
+		/* Match last open tag */
+		uint32_t off;
+		size_t len;
+
+		off = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		if (ctx->close_tag_match.tag.len == 0) {
+			ctx->close_tag_match.tag.data_off = off;
+			ctx->close_tag_match.tag.len = len;
+		} else {
+			ctx->close_tag_match.tag.len += len;
+		}
+
+		hubbub_inputstream_advance(tokeniser->input);
+
+		if (ctx->close_tag_match.tag.len > ctag->name.len ||
+				(ctx->close_tag_match.tag.len == ctag->name.len &&
+				hubbub_inputstream_compare_range_ci(
+						tokeniser->input,
+						ctag->name.data_off,
+						ctx->close_tag_match.tag.data_off,
+						ctag->name.len) != 0)) {
+			hubbub_token token;
+
+			/* Rewind input stream to start of tag name */
+			if (hubbub_inputstream_rewind(tokeniser->input,
+					ctx->close_tag_match.tag.len) !=
+					HUBBUB_OK)
+				abort();
+
+			/* Emit "</" */
+			token.type = HUBBUB_TOKEN_CHARACTER;
+			token.data.character =
+					tokeniser->context.current_chars;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+			return true;
+		} else if (ctx->close_tag_match.tag.len == ctag->name.len &&
+				hubbub_inputstream_compare_range_ci(
+						tokeniser->input,
+						ctag->name.data_off,
+						ctx->close_tag_match.tag.data_off,
+						ctag->name.len) == 0) {
+			/* Matched => stop searching */
+			break;
+		}
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_OOD) {
+		/* Need more data */
+		return false;
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_EOF) {
+		/* Ran out of data - parse error */
+		hubbub_token token;
+
+		/* Rewind input stream to start of tag name */
+		if (hubbub_inputstream_rewind(tokeniser->input,
+				ctx->close_tag_match.tag.len) != HUBBUB_OK)
+			abort();
+
+		/* Emit "</" */
+		token.type = HUBBUB_TOKEN_CHARACTER;
+		token.data.character = tokeniser->context.current_chars;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		return true;
+	}
+
+	/* Match following char */
+	c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD) {
+		/* Need more data */
+		return false;
+	}
+
+	/* Rewind input stream to start of tag name */
+	if (hubbub_inputstream_rewind(tokeniser->input,
+			ctx->close_tag_match.tag.len) != HUBBUB_OK)
+		abort();
+
+	/* Check that following char was valid */
+	if (c != '\t' && c != '\n' && c != '\v' && c != '\f' &&
+			c != ' ' && c != '>' && c != '/' && c != '<' &&
+			c != HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit "</" */
+		token.type = HUBBUB_TOKEN_CHARACTER;
+		token.data.character = tokeniser->context.current_chars;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		return true;
+	}
+
+	/* Switch the content model back to PCDATA */
+	tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
+
+	/* Finally, transition back to close tag open state */
+	tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN;
+
+	return true;
+}
+
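The restartable bookkeeping above can obscure the rule being implemented: in CDATA/RCDATA, "</" only closes the element when it is followed by the current tag's name, compared case-insensitively, and then a delimiter; otherwise the buffered "</" is emitted as character data. A self-contained sketch of just that rule (illustrative only, not hubbub code; closes_current_tag is a hypothetical helper, with '\0' standing in for EOF):

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Does the text at 'input' close an open element named 'tag_name'? */
static bool closes_current_tag(const char *input, const char *tag_name)
{
	size_t i, n = strlen(tag_name);
	char c;

	if (strncmp(input, "</", 2) != 0)
		return false;

	/* Candidate name must match the open tag, case-insensitively */
	for (i = 0; i < n; i++) {
		if (tolower((unsigned char) input[2 + i]) !=
				tolower((unsigned char) tag_name[i]))
			return false;
	}

	/* ...and be followed by a delimiter (or end of input) */
	c = input[2 + n];
	return c == '\t' || c == '\n' || c == '\v' || c == '\f' ||
			c == ' ' || c == '>' || c == '/' || c == '<' ||
			c == '\0';
}

int main(void)
{
	printf("%d\n", closes_current_tag("</TITLE>", "title"));  /* 1 */
	printf("%d\n", closes_current_tag("</titles>", "title")); /* 0 */
	printf("%d\n", closes_current_tag("</em foo>", "em"));    /* 1 */
	return 0;
}

The streaming version above must additionally cope with running out of data mid-name, hence the close_tag_match context and the rewinds on mismatch.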
bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>') {
+		hubbub_token token;
+
+		/* Emit current tag */
+		token.type = tokeniser->context.current_tag_type;
+		token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if ('A' <= c && c <= 'Z') {
+		uint32_t pos;
+		size_t len;
+
+		hubbub_inputstream_lowercase(tokeniser->input);
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		ctag->name.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit current tag */
+		token.type = tokeniser->context.current_tag_type;
+		token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else if (c == '/') {
+		/** \todo permitted slash */
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else {
+		uint32_t pos;
+		size_t len;
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		ctag->name.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_before_attribute_name(
+		hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>') {
+		hubbub_token token;
+
+		/* Emit current tag */
+		token.type = tokeniser->context.current_tag_type;
+		token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if ('A' <= c && c <= 'Z') {
+		uint32_t pos;
+		size_t len;
+		hubbub_attribute *attr;
+
+		hubbub_inputstream_lowercase(tokeniser->input);
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		attr = tokeniser->alloc(ctag->attributes,
+				(ctag->n_attributes + 1) *
+					sizeof(hubbub_attribute),
+				tokeniser->alloc_pw);
+		if (attr == NULL) {
+			/** \todo handle memory exhaustion */
+		}
+
+		ctag->attributes = attr;
+
+		attr[ctag->n_attributes].name.data_off = pos;
+		attr[ctag->n_attributes].name.len = len;
+		attr[ctag->n_attributes].value.data_off = 0;
+		attr[ctag->n_attributes].value.len = 0;
+
+		ctag->n_attributes++;
+
tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + /** \todo permitted slash */ + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '=') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + uint32_t pos; + size_t len; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->attributes[ctag->n_attributes - 1].name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + /** \todo permitted slash */ + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->attributes[ctag->n_attributes - 1].name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_after_attribute_name( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '=') { + 
tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + /** \todo permitted slash */ + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_before_attribute_value( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '"') { + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ; + } else if (c == '\'') { + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + 
hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->attributes[ctag->n_attributes - 1].value.data_off = pos; + ctag->attributes[ctag->n_attributes - 1].value.len = len; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '"') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->context.prev_state = tokeniser->state; + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE; + /* Don't eat the '&'; entity consumption handles this */ + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\'') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->context.prev_state = tokeniser->state; + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE; + /* Don't eat the '&'; entity consumption handles this */ + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + tokeniser->state = + 
HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->context.prev_state = tokeniser->state; + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE; + /* Don't eat the '&'; entity consumption handles this */ + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_entity_in_attribute_value( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t pos; + size_t len; + + if (tokeniser->context.match_entity.complete == false) { + return hubbub_tokeniser_consume_entity(tokeniser); + } else { + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD || + c == HUBBUB_INPUTSTREAM_EOF) { + /* Should never happen */ + abort(); + } + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + /* Reset for next time */ + tokeniser->context.match_entity.complete = false; + + /* And back to the previous state */ + tokeniser->state = tokeniser->context.prev_state; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser) +{ + hubbub_token token; + uint32_t c; + + while ((c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + uint32_t pos; + size_t len; + + if (c == '>') { + hubbub_inputstream_advance(tokeniser->input); + break; + } + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + return true; +} + +bool hubbub_tokeniser_handle_markup_declaration_open( + hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '-') { + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_START; + hubbub_inputstream_advance(tokeniser->input); + 
} else if ((c & ~0x20) == 'D') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count = 1; + tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE; + hubbub_inputstream_advance(tokeniser->input); + } else { + tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.len = 0; + + tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + } + + return true; +} + +bool hubbub_tokeniser_handle_comment_start(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.len = 0; + + + if (c == '-') { + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT; + hubbub_inputstream_advance(tokeniser->input); + } else { + hubbub_inputstream_push_back(tokeniser->input, '-'); + tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + } + + return true; +} + +bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '-') { + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_DASH; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_comment_dash(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '-') { + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) { + tokeniser->context.current_comment.data_off = pos; + } else { + /* Need to do this to get length of '-' */ + len += pos - + tokeniser->context.current_comment.data_off; + } + + tokeniser->context.current_comment.len = len; + + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '>') { + hubbub_token token; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == 
+bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '>') {
+		hubbub_token token;
+
+		/* Emit comment */
+		token.type = HUBBUB_TOKEN_COMMENT;
+		token.data.comment = tokeniser->context.current_comment;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '-') {
+		uint32_t pos;
+		size_t len;
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		if (tokeniser->context.current_comment.len == 0) {
+			tokeniser->context.current_comment.data_off = pos;
+			tokeniser->context.current_comment.len = len;
+		} else {
+			/* Need to do this to get length of '-' */
+			len = pos -
+				tokeniser->context.current_comment.data_off;
+		}
+
+		tokeniser->context.current_comment.len = len;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit comment */
+		token.type = HUBBUB_TOKEN_COMMENT;
+		token.data.comment = tokeniser->context.current_comment;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else {
+		uint32_t pos;
+		size_t len;
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		if (tokeniser->context.current_comment.len == 0) {
+			tokeniser->context.current_comment.data_off = pos;
+		} else {
+			/* Need to do this to get length of '--' */
+			len += pos -
+				tokeniser->context.current_comment.data_off;
+		}
+
+		tokeniser->context.current_comment.len = len;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (tokeniser->context.match_doctype.count == 1 &&
+			(c & ~0x20) == 'O') {
+		hubbub_inputstream_uppercase(tokeniser->input);
+		tokeniser->context.match_doctype.count++;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (tokeniser->context.match_doctype.count == 2 &&
+			(c & ~0x20) == 'C') {
+		hubbub_inputstream_uppercase(tokeniser->input);
+		tokeniser->context.match_doctype.count++;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (tokeniser->context.match_doctype.count == 3 &&
+			(c & ~0x20) == 'T') {
+		hubbub_inputstream_uppercase(tokeniser->input);
+		tokeniser->context.match_doctype.count++;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (tokeniser->context.match_doctype.count == 4 &&
+			(c & ~0x20) == 'Y') {
+		hubbub_inputstream_uppercase(tokeniser->input);
+		tokeniser->context.match_doctype.count++;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (tokeniser->context.match_doctype.count == 5 &&
+			(c & ~0x20) == 'P') {
+		hubbub_inputstream_uppercase(tokeniser->input);
+		tokeniser->context.match_doctype.count++;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (tokeniser->context.match_doctype.count == 6 &&
+			(c & ~0x20) == 'E') {
+		hubbub_inputstream_uppercase(tokeniser->input);
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else {
+		switch (tokeniser->context.match_doctype.count) {
+		case 6: hubbub_inputstream_push_back(tokeniser->input, 'P');
+		case 5: hubbub_inputstream_push_back(tokeniser->input, 'Y');
+		case 4: hubbub_inputstream_push_back(tokeniser->input, 'T');
+		case 3: hubbub_inputstream_push_back(tokeniser->input, 'C');
+		case 2: hubbub_inputstream_push_back(tokeniser->input, 'O');
+		case 1: hubbub_inputstream_push_back(tokeniser->input, 'D');
+		}
+
+		tokeniser->context.current_comment.data_off = 0;
+		tokeniser->context.current_comment.len = 0;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+	}
+
+	return true;
+}
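Two details above are easy to misread: (c & ~0x20) clears ASCII bit 5, which folds 'a'..'z' onto 'A'..'Z' so each DOCTYPE letter matches case-insensitively with a single comparison, and the switch in the failure path deliberately falls through case after case so the already-matched letters are pushed back in reverse order. A small standalone demo of the case-folding trick (illustrative only, not hubbub code):

#include <stdio.h>

int main(void)
{
	unsigned int c;

	/* Clearing bit 5 maps lowercase ASCII letters to uppercase */
	for (c = 'a'; c <= 'f'; c++)
		printf("%c & ~0x20 -> %c\n", c, c & ~0x20);

	/* So one test matches both cases; safe because we then compare
	 * against a specific letter, as the handlers do */
	printf("matches D: %d %d\n",
			('d' & ~0x20) == 'D', ('D' & ~0x20) == 'D');

	return 0;
}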
+bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME;
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_before_doctype_name(
+		hubbub_tokeniser *tokeniser)
+{
+	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if ('a' <= c && c <= 'z') {
+		uint32_t pos;
+		size_t len;
+
+		hubbub_inputstream_uppercase(tokeniser->input);
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		cdoc->name.data_off = pos;
+		cdoc->name.len = len;
+		cdoc->correct = false;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_NAME;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>') {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else {
+		uint32_t pos;
+		size_t len;
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		cdoc->name.data_off = pos;
+		cdoc->name.len = len;
+		cdoc->correct = false;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_NAME;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
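The two DOCTYPE-name handlers that follow decide token.data.doctype.correct by case-insensitively comparing the accumulated name against "HTML" via hubbub_inputstream_compare_range_ascii. A standalone approximation of that comparison over plain C strings is sketched below; ascii_casecmp_n is invented for illustration, whereas hubbub performs the same check over stream offsets.

#include <ctype.h>
#include <stddef.h>
#include <stdio.h>

static int ascii_casecmp_n(const char *a, const char *b, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++) {
		int ca = toupper((unsigned char) a[i]);
		int cb = toupper((unsigned char) b[i]);

		if (ca != cb)
			return ca - cb;
	}

	return 0;
}

int main(void)
{
	/* The doctype is flagged "correct" when its name is "HTML",
	 * length included (hubbub also checks name.len) */
	printf("%d\n", ascii_casecmp_n("html", "HTML", 4) == 0);	/* 1 */
	printf("%d\n", ascii_casecmp_n("xml", "HTML", 3) == 0);		/* 0 */
	return 0;
}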
+bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>') {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+		token.data.doctype.correct =
+			(hubbub_inputstream_compare_range_ascii(
+				tokeniser->input,
+				token.data.doctype.name.data_off,
+				token.data.doctype.name.len,
+				"HTML", SLEN("HTML")) == 0);
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if ('a' <= c && c <= 'z') {
+		uint32_t pos;
+		size_t len;
+
+		hubbub_inputstream_uppercase(tokeniser->input);
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		cdoc->name.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else {
+		uint32_t pos;
+		size_t len;
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		cdoc->name.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_after_doctype_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>') {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+		token.data.doctype.correct =
+			(hubbub_inputstream_compare_range_ascii(
+				tokeniser->input,
+				token.data.doctype.name.data_off,
+				token.data.doctype.name.len,
+				"HTML", SLEN("HTML")) == 0);
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else {
+		cdoc->correct = false;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '>') {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else {
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
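The entity machinery below is re-entrant: consume_entity records the state it was entered from, a done_setup flag keeps the one-time initialisation from re-running when the handler is resumed after running out of data, and complete tells the calling state that a result is ready. A minimal sketch of that pattern follows, with invented names (matcher, matcher_begin, matcher_finish are illustrative, not hubbub API).

#include <stdbool.h>

enum state { STATE_DATA, STATE_ENTITY /* ... */ };

struct matcher {
	bool done_setup;	 /* has setup already run? */
	bool complete;		 /* is a result ready? */
	enum state return_state; /* where to go when finished */
	unsigned long codepoint; /* result accumulator */
};

static void matcher_begin(struct matcher *m, enum state current)
{
	if (m->done_setup)
		return;		/* resuming: keep existing progress */

	m->codepoint = 0;
	m->complete = false;
	m->return_state = current;
	m->done_setup = true;
}

static enum state matcher_finish(struct matcher *m)
{
	m->done_setup = false;	/* reset for next time */
	m->complete = true;
	return m->return_state;	/* caller restores this state */
}

int main(void)
{
	struct matcher m = { 0 };

	matcher_begin(&m, STATE_DATA);
	/* ... entity characters would be consumed here, possibly
	 * across several resumptions ... */
	return (int) matcher_finish(&m);	/* back to STATE_DATA */
}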
+bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c;
+	uint32_t pos;
+	size_t len;
+
+	if (tokeniser->context.match_entity.done_setup == false) {
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		tokeniser->context.match_entity.str.data_off = pos;
+		tokeniser->context.match_entity.str.len = len;
+		tokeniser->context.match_entity.base = 0;
+		tokeniser->context.match_entity.codepoint = 0;
+		tokeniser->context.match_entity.had_data = false;
+		tokeniser->context.match_entity.return_state =
+				tokeniser->state;
+		tokeniser->context.match_entity.complete = false;
+		tokeniser->context.match_entity.done_setup = true;
+		tokeniser->context.match_entity.context = NULL;
+		tokeniser->context.match_entity.prev_len = len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '#') {
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		tokeniser->context.match_entity.str.len += len;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else {
+		tokeniser->state = HUBBUB_TOKENISER_STATE_NAMED_ENTITY;
+	}
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	uint32_t pos;
+	size_t len;
+	hubbub_error error;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (ctx->match_entity.base == 0) {
+		if ((c & ~0x20) == 'X') {
+			ctx->match_entity.base = 16;
+
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->match_entity.str.len += len;
+
+			hubbub_inputstream_advance(tokeniser->input);
+		} else {
+			ctx->match_entity.base = 10;
+		}
+	}
+
+	while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
+			HUBBUB_INPUTSTREAM_EOF &&
+			c != HUBBUB_INPUTSTREAM_OOD) {
+		if (ctx->match_entity.base == 10 &&
+				('0' <= c && c <= '9')) {
+			ctx->match_entity.had_data = true;
+
+			ctx->match_entity.codepoint =
+				ctx->match_entity.codepoint * 10 + (c - '0');
+
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->match_entity.str.len += len;
+		} else if (ctx->match_entity.base == 16 &&
+				(('0' <= c && c <= '9') ||
+				('A' <= (c & ~0x20) &&
+						(c & ~0x20) <= 'F'))) {
+			ctx->match_entity.had_data = true;
+
+			ctx->match_entity.codepoint *= 16;
+
+			if ('0' <= c && c <= '9') {
+				ctx->match_entity.codepoint += (c - '0');
+			} else {
+				ctx->match_entity.codepoint +=
+						((c & ~0x20) - 'A' + 10);
+			}
+
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->match_entity.str.len += len;
+		} else {
+			break;
+		}
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Eat trailing semicolon, if any */
+	if (c == ';') {
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+		ctx->match_entity.str.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	/* Rewind the inputstream to start of matched sequence */
+	hubbub_inputstream_rewind(tokeniser->input,
+			ctx->match_entity.str.len);
+
+	if (ctx->match_entity.had_data) {
+		/* Had data, so calculate final codepoint */
+		if (0x80 <= ctx->match_entity.codepoint &&
+				ctx->match_entity.codepoint <= 0x9F) {
+			ctx->match_entity.codepoint =
+				cp1252Table[ctx->match_entity.codepoint -
+						0x80];
+		} else if (ctx->match_entity.codepoint == 0 ||
+				ctx->match_entity.codepoint > 0x10FFFF) {
+			ctx->match_entity.codepoint = 0xFFFD;
+		}
+
+		/* And replace the matched range with it */
+		error = hubbub_inputstream_replace_range(tokeniser->input,
+				ctx->match_entity.str.data_off,
+				ctx->match_entity.str.len,
+				ctx->match_entity.codepoint);
+		if (error != HUBBUB_OK) {
+			/** \todo handle memory exhaustion */
+		}
+	}
+
+	/* Reset for next time */
+	ctx->match_entity.done_setup = false;
+
+	/* Flag completion */
+	ctx->match_entity.complete = true;
+
+	/* And back to the state we were entered in */
+	tokeniser->state = ctx->match_entity.return_state;
+
+	return true;
+}
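The numeric path above folds digits into the codepoint one at a time, then post-processes the result: values in 0x80..0x9F are remapped through a Windows-1252 table (cp1252Table, defined elsewhere in the tree), and zero or anything above 0x10FFFF becomes U+FFFD. A standalone sketch of the hex accumulation and range check (fold_hex_digit is invented for illustration; the CP1252 remap itself is omitted):

#include <stdint.h>
#include <stdio.h>

static uint32_t fold_hex_digit(uint32_t cp, uint32_t c)
{
	cp *= 16;

	if ('0' <= c && c <= '9')
		cp += c - '0';
	else
		cp += (c & ~0x20) - 'A' + 10;	/* 'a'/'A' -> 10, etc. */

	return cp;
}

int main(void)
{
	/* &#x20AC; -> U+20AC (EURO SIGN) */
	const char *digits = "20AC";
	uint32_t cp = 0;

	for (; *digits != '\0'; digits++)
		cp = fold_hex_digit(cp, (uint32_t) *digits);

	/* Out-of-range or zero codepoints become the replacement char */
	if (cp == 0 || cp > 0x10FFFF)
		cp = 0xFFFD;

	printf("U+%04X\n", cp);	/* U+20AC */

	return 0;
}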
+bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	uint32_t c;
+	uint32_t pos;
+	size_t len;
+	hubbub_error error;
+
+	while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
+			HUBBUB_INPUTSTREAM_EOF &&
+			c != HUBBUB_INPUTSTREAM_OOD) {
+		uint32_t cp;
+
+		if (c > 0x7F) {
+			/* Entity names are ASCII only */
+			break;
+		}
+
+		error = hubbub_entities_search_step((uint8_t) c,
+				&cp,
+				&ctx->match_entity.context);
+		if (error == HUBBUB_OK) {
+			/* Had a match - store it for later */
+			ctx->match_entity.codepoint = cp;
+
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->match_entity.str.len += len;
+
+			/* And cache length, for replacement */
+			ctx->match_entity.prev_len =
+					ctx->match_entity.str.len;
+		} else if (error == HUBBUB_INVALID) {
+			/* No further matches - use last found */
+			break;
+		} else {
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->match_entity.str.len += len;
+		}
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Eat trailing semicolon, if any */
+	if (ctx->match_entity.codepoint != 0 && c == ';' &&
+			ctx->match_entity.prev_len ==
+					ctx->match_entity.str.len) {
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+		ctx->match_entity.prev_len += len;
+	}
+
+	/* Rewind the inputstream to start of processed sequence */
+	hubbub_inputstream_rewind(tokeniser->input,
+			ctx->match_entity.str.len);
+
+	/* Now, replace range, if we found a named entity */
+	if (ctx->match_entity.codepoint != 0) {
+		error = hubbub_inputstream_replace_range(tokeniser->input,
+				ctx->match_entity.str.data_off,
+				ctx->match_entity.prev_len,
+				ctx->match_entity.codepoint);
+		if (error != HUBBUB_OK) {
+			/** \todo handle memory exhaustion */
+		}
+	}
+
+	/* Reset for next time */
+	ctx->match_entity.done_setup = false;
+
+	/* Flag completion */
+	ctx->match_entity.complete = true;
+
+	/* And back to the state from whence we came */
+	tokeniser->state = ctx->match_entity.return_state;
+
+	return true;
+}
+
+/**
+ * Handle input stream buffer moving
+ *
+ * \param buffer  Pointer to buffer
+ * \param len     Length of data in buffer (bytes)
+ * \param pw      Pointer to our context
+ */
+void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer,
+		size_t len, void *pw)
+{
+	hubbub_tokeniser *tok = (hubbub_tokeniser *) pw;
+
+	tok->input_buffer = buffer;
+	tok->input_buffer_len = len;
+
+	if (tok->buffer_handler != NULL)
+		tok->buffer_handler(buffer, len, tok->buffer_pw);
+}
+
+/**
+ * Emit a token, performing sanity checks if necessary
+ *
+ * \param tokeniser  Tokeniser instance
+ * \param token      Token to emit
+ */
+void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
+		hubbub_token *token)
+{
+	if (tokeniser == NULL || token == NULL)
+		return;
+
+	/* Nothing to do if there's no registered handler */
+	if (tokeniser->token_handler == NULL)
+		return;
+
+	if (token->type == HUBBUB_TOKEN_START_TAG ||
+			token->type == HUBBUB_TOKEN_END_TAG) {
+		uint32_t i, j;
+		uint32_t n_attributes = token->data.tag.n_attributes;
+		hubbub_attribute *attrs =
+				token->data.tag.attributes;
+
+		/* Discard duplicate attributes */
+		for (i = 0; i < n_attributes; i++) {
+			for (j = 0; j < n_attributes; j++) {
+				uint32_t move;
+
+				if (j == i ||
+					attrs[i].name.len !=
+							attrs[j].name.len ||
+					hubbub_inputstream_compare_range_cs(
+						tokeniser->input,
+						attrs[i].name.data_off,
+						attrs[j].name.data_off,
+						attrs[i].name.len) != 0) {
+					/* Attributes don't match */
+					continue;
+				}
+
+				/* Calculate amount to move */
+				move = (n_attributes - 1 -
+					((i < j) ? j : i)) *
+					sizeof(hubbub_attribute);
+
+				if (move > 0) {
+					memmove((i < j) ? &attrs[j]
+							: &attrs[i],
+						(i < j) ? &attrs[j+1]
+							: &attrs[i+1],
+						move);
+				}
+
+				/* And reduce the number of attributes */
+				n_attributes--;
+			}
+		}
+
+		token->data.tag.n_attributes = n_attributes;
+	}
+
+	/* Finally, emit token */
+	tokeniser->token_handler(token, tokeniser->token_pw);
+}
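hubbub_tokeniser_emit_token above compacts duplicate attributes in place: whenever two names match, the later entries are shifted down one slot with memmove and the attribute count drops. A standalone sketch of the same compaction over plain C strings follows, simplified to keep the first occurrence of each name; dedupe and struct attr are invented for illustration, standing in for hubbub's stream-offset comparisons.

#include <stdio.h>
#include <string.h>

struct attr { const char *name; };

static size_t dedupe(struct attr *attrs, size_t n)
{
	size_t i, j;

	for (i = 0; i < n; i++) {
		for (j = i + 1; j < n; j++) {
			if (strcmp(attrs[i].name, attrs[j].name) != 0)
				continue;

			/* Shift everything after j down one slot */
			memmove(&attrs[j], &attrs[j + 1],
					(n - 1 - j) * sizeof(*attrs));
			n--;
			j--;	/* re-examine the slot we just filled */
		}
	}

	return n;
}

int main(void)
{
	struct attr a[] = { { "id" }, { "class" }, { "id" } };
	size_t n = dedupe(a, sizeof(a) / sizeof(a[0]));
	size_t i;

	for (i = 0; i < n; i++)
		printf("%s\n", a[i].name);	/* id, class */

	return 0;
}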
diff --git a/src/tokeniser/tokeniser.h b/src/tokeniser/tokeniser.h
new file mode 100644
index 0000000..20bbe20
--- /dev/null
+++ b/src/tokeniser/tokeniser.h
@@ -0,0 +1,71 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#ifndef hubbub_tokeniser_tokeniser_h_
+#define hubbub_tokeniser_tokeniser_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+#include "input/inputstream.h"
+
+typedef struct hubbub_tokeniser hubbub_tokeniser;
+
+/**
+ * Hubbub tokeniser option types
+ */
+typedef enum hubbub_tokeniser_opttype {
+	HUBBUB_TOKENISER_TOKEN_HANDLER,
+	HUBBUB_TOKENISER_BUFFER_HANDLER,
+	HUBBUB_TOKENISER_ERROR_HANDLER,
+	HUBBUB_TOKENISER_CONTENT_MODEL,
+} hubbub_tokeniser_opttype;
+
+/**
+ * Hubbub tokeniser option parameters
+ */
+typedef union hubbub_tokeniser_optparams {
+	struct {
+		hubbub_token_handler handler;
+		void *pw;
+	} token_handler;
+
+	struct {
+		hubbub_buffer_handler handler;
+		void *pw;
+	} buffer_handler;
+
+	struct {
+		hubbub_error_handler handler;
+		void *pw;
+	} error_handler;
+
+	struct {
+		hubbub_content_model model;
+	} content_model;
+} hubbub_tokeniser_optparams;
+
+/* Create a hubbub tokeniser */
+hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
+		hubbub_alloc alloc, void *pw);
+/* Destroy a hubbub tokeniser */
+void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser);
+
+/* Configure a hubbub tokeniser */
+hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
+		hubbub_tokeniser_opttype type,
+		hubbub_tokeniser_optparams *params);
+
+/* Process remaining data in the input stream */
+hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser);
+
+#endif
--
cgit v1.2.3