From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sat, 23 Jun 2007 22:40:25 +0000 Subject: Import hubbub -- an HTML parsing library. Plenty of work still to do (like tree generation ;) svn path=/trunk/hubbub/; revision=3359 --- src/tokeniser/Makefile | 53 ++ src/tokeniser/entities.c | 363 +++++++ src/tokeniser/entities.h | 25 + src/tokeniser/tokeniser.c | 2282 +++++++++++++++++++++++++++++++++++++++++++++ src/tokeniser/tokeniser.h | 71 ++ 5 files changed, 2794 insertions(+) create mode 100644 src/tokeniser/Makefile create mode 100644 src/tokeniser/entities.c create mode 100644 src/tokeniser/entities.h create mode 100644 src/tokeniser/tokeniser.c create mode 100644 src/tokeniser/tokeniser.h (limited to 'src/tokeniser') diff --git a/src/tokeniser/Makefile b/src/tokeniser/Makefile new file mode 100644 index 0000000..539625f --- /dev/null +++ b/src/tokeniser/Makefile @@ -0,0 +1,53 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Objects +OBJS = entities tokeniser + +.PHONY: clean debug distclean export release setup test + +# Targets +release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS))) + +debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS))) + +clean: + -@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS})) + +distclean: + +setup: + +export: + +test: + +# Pattern rules +../Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +../Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/tokeniser/entities.c b/src/tokeniser/entities.c new file mode 100644 index 0000000..8a9acf5 --- /dev/null +++ b/src/tokeniser/entities.c @@ -0,0 +1,363 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include "utils/dict.h" +#include "utils/utils.h" +#include "tokeniser/entities.h" + +typedef struct hubbub_entity hubbub_entity; + +static const struct hubbub_entity { + const char *name; + uint32_t ucs4; +} entities[] = { + { "AElig", 0x00C6 }, + { "Aacute", 0x00C1 }, + { "Acirc", 0x00C2 }, + { "Agrave", 0x00C0 }, + { "Alpha", 0x0391 }, + { "Aring", 0x00C5 }, + { "Atilde", 0x00C3 }, + { "Auml", 0x00C4 }, + { "Beta", 0x0392 }, + { "Ccedil", 0x00C7 }, + { "Chi", 0x03A7 }, + { "Dagger", 0x2021 }, + { "Delta", 0x0394 }, + { "ETH", 0x00D0 }, + { "Eacute", 0x00C9 }, + { "Ecirc", 0x00CA }, + { "Egrave", 0x00C8 }, + { "Epsilon", 0x0395 }, + { "Eta", 0x0397 }, + { "Euml", 0x00CB }, + { "Gamma", 0x0393 }, + { "Iacute", 0x00CD }, + { "Icirc", 0x00CE }, + { "Igrave", 0x00CC }, + { "Iota", 0x0399 }, + { "Iuml", 0x00CF }, + { "Kappa", 0x039A }, + { "Lambda", 0x039B }, + { "Mu", 0x039C }, + { "Ntilde", 0x00D1 }, + { "Nu", 0x039D }, + { "OElig", 0x0152 }, + { "Oacute", 0x00D3 }, + { "Ocirc", 0x00D4 }, + { "Ograve", 0x00D2 }, + { "Omega", 0x03A9 }, + { "Omicron", 0x039F }, + { "Oslash", 0x00D8 }, + { "Otilde", 0x00D5 }, + { "Ouml", 0x00D6 }, + { "Phi", 0x03A6 }, + { "Pi", 0x03A0 }, + { "Prime", 0x2033 }, + { "Psi", 0x03A8 }, + { "Rho", 0x03A1 }, + { "Scaron", 0x0160 }, + { "Sigma", 0x03A3 }, + { "THORN", 0x00DE }, + { "Tau", 0x03A4 }, + { "Theta", 0x0398 }, + { "Uacute", 0x00DA }, + { "Ucirc", 0x00DB }, + { "Ugrave", 0x00D9 }, + { "Upsilon", 0x03A5 }, + { "Uuml", 0x00DC }, + { "Xi", 0x039E }, + { "Yacute", 0x00DD }, + { "Yuml", 0x0178 }, + { "Zeta", 0x0396 }, + { "aacute", 0x00E1 }, + { "acirc", 0x00E2 }, + { "acute", 0x00B4 }, + { "aelig", 0x00E6 }, + { "agrave", 0x00E0 }, + { "alefsym", 0x2135 }, + { "alpha", 0x03B1 }, + { "amp", 0x0026 }, + { "AMP", 0x0026 }, + { "and", 0x2227 }, + { "ang", 0x2220 }, + { "apos", 0x0027 }, + { "aring", 0x00E5 }, + { "asymp", 0x2248 }, + { "atilde", 0x00E3 }, + { "auml", 0x00E4 }, + { "bdquo", 0x201E }, + { "beta", 0x03B2 }, + { "brvbar", 0x00A6 }, + { "bull", 0x2022 }, + { "cap", 0x2229 }, + { "ccedil", 0x00E7 }, + { "cedil", 0x00B8 }, + { "cent", 0x00A2 }, + { "chi", 0x03C7 }, + { "circ", 0x02C6 }, + { "clubs", 0x2663 }, + { "cong", 0x2245 }, + { "copy", 0x00A9 }, + { "COPY", 0x00A9 }, + { "crarr", 0x21B5 }, + { "cup", 0x222A }, + { "curren", 0x00A4 }, + { "dArr", 0x21D3 }, + { "dagger", 0x2020 }, + { "darr", 0x2193 }, + { "deg", 0x00B0 }, + { "delta", 0x03B4 }, + { "diams", 0x2666 }, + { "divide", 0x00F7 }, + { "eacute", 0x00E9 }, + { "ecirc", 0x00EA }, + { "egrave", 0x00E8 }, + { "empty", 0x2205 }, + { "emsp", 0x2003 }, + { "ensp", 0x2002 }, + { "epsilon", 0x03B5 }, + { "equiv", 0x2261 }, + { "eta", 0x03B7 }, + { "eth", 0x00F0 }, + { "euml", 0x00EB }, + { "euro", 0x20AC }, + { "exist", 0x2203 }, + { "fnof", 0x0192 }, + { "forall", 0x2200 }, + { "frac12", 0x00BD }, + { "frac14", 0x00BC }, + { "frac34", 0x00BE }, + { "frasl", 0x2044 }, + { "gamma", 0x03B3 }, + { "ge", 0x2265 }, + { "gt", 0x003E }, + { "GT", 0x003E }, + { "hArr", 0x21D4 }, + { "harr", 0x2194 }, + { "hearts", 0x2665 }, + { "hellip", 0x2026 }, + { "iacute", 0x00ED }, + { "icirc", 0x00EE }, + { "iexcl", 0x00A1 }, + { "igrave", 0x00EC }, + { "image", 0x2111 }, + { "infin", 0x221E }, + { "int", 0x222B }, + { "iota", 0x03B9 }, + { "iquest", 0x00BF }, + { "isin", 0x2208 }, + { "iuml", 0x00EF }, + { "kappa", 0x03BA }, + { "lArr", 0x21D0 }, + { "lambda", 0x03BB }, + { "lang", 
0x2329 }, + { "laquo", 0x00AB }, + { "larr", 0x2190 }, + { "lceil", 0x2308 }, + { "ldquo", 0x201C }, + { "le", 0x2264 }, + { "lfloor", 0x230A }, + { "lowast", 0x2217 }, + { "loz", 0x25CA }, + { "lrm", 0x200E }, + { "lsaquo", 0x2039 }, + { "lsquo", 0x2018 }, + { "lt", 0x003C }, + { "LT", 0x003C }, + { "macr", 0x00AF }, + { "mdash", 0x2014 }, + { "micro", 0x00B5 }, + { "middot", 0x00B7 }, + { "minus", 0x2212 }, + { "mu", 0x03BC }, + { "nabla", 0x2207 }, + { "nbsp", 0x00A0 }, + { "ndash", 0x2013 }, + { "ne", 0x2260 }, + { "ni", 0x220B }, + { "not", 0x00AC }, + { "notin", 0x2209 }, + { "nsub", 0x2284 }, + { "ntilde", 0x00F1 }, + { "nu", 0x03BD }, + { "oacute", 0x00F3 }, + { "ocirc", 0x00F4 }, + { "oelig", 0x0153 }, + { "ograve", 0x00F2 }, + { "oline", 0x203E }, + { "omega", 0x03C9 }, + { "omicron", 0x03BF }, + { "oplus", 0x2295 }, + { "or", 0x2228 }, + { "ordf", 0x00AA }, + { "ordm", 0x00BA }, + { "oslash", 0x00F8 }, + { "otilde", 0x00F5 }, + { "otimes", 0x2297 }, + { "ouml", 0x00F6 }, + { "para", 0x00B6 }, + { "part", 0x2202 }, + { "permil", 0x2030 }, + { "perp", 0x22A5 }, + { "phi", 0x03C6 }, + { "pi", 0x03C0 }, + { "piv", 0x03D6 }, + { "plusmn", 0x00B1 }, + { "pound", 0x00A3 }, + { "prime", 0x2032 }, + { "prod", 0x220F }, + { "prop", 0x221D }, + { "psi", 0x03C8 }, + { "quot", 0x0022 }, + { "QUOT", 0x0022 }, + { "rArr", 0x21D2 }, + { "radic", 0x221A }, + { "rang", 0x232A }, + { "raquo", 0x00BB }, + { "rarr", 0x2192 }, + { "rceil", 0x2309 }, + { "rdquo", 0x201D }, + { "real", 0x211C }, + { "reg", 0x00AE }, + { "REG", 0x00AE }, + { "rfloor", 0x230B }, + { "rho", 0x03C1 }, + { "rlm", 0x200F }, + { "rsaquo", 0x203A }, + { "rsquo", 0x2019 }, + { "sbquo", 0x201A }, + { "scaron", 0x0161 }, + { "sdot", 0x22C5 }, + { "sect", 0x00A7 }, + { "shy", 0x00AD }, + { "sigma", 0x03C3 }, + { "sigmaf", 0x03C2 }, + { "sim", 0x223C }, + { "spades", 0x2660 }, + { "sub", 0x2282 }, + { "sube", 0x2286 }, + { "sum", 0x2211 }, + { "sup", 0x2283 }, + { "sup1", 0x00B9 }, + { "sup2", 0x00B2 }, + { "sup3", 0x00B3 }, + { "supe", 0x2287 }, + { "szlig", 0x00DF }, + { "tau", 0x03C4 }, + { "there4", 0x2234 }, + { "theta", 0x03B8 }, + { "thetasym", 0x03D1 }, + { "thinsp", 0x2009 }, + { "thorn", 0x00FE }, + { "tilde", 0x02DC }, + { "times", 0x00D7 }, + { "trade", 0x2122 }, + { "uArr", 0x21D1 }, + { "uacute", 0x00FA }, + { "uarr", 0x2191 }, + { "ucirc", 0x00FB }, + { "ugrave", 0x00F9 }, + { "uml", 0x00A8 }, + { "upsih", 0x03D2 }, + { "upsilon", 0x03C5 }, + { "uuml", 0x00FC }, + { "weierp", 0x2118 }, + { "xi", 0x03BE }, + { "yacute", 0x00FD }, + { "yen", 0x00A5 }, + { "yuml", 0x00FF }, + { "zeta", 0x03B6 }, + { "zwj", 0x200D }, + { "zwnj", 0x200C }, +}; + +static hubbub_dict *dict; + +/** + * Create the entities dictionary + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_entities_create(hubbub_alloc alloc, void *pw) +{ + hubbub_error error; + size_t i; + + if (alloc == NULL) + return HUBBUB_BADPARM; + + dict = hubbub_dict_create(alloc, pw); + if (dict == NULL) + return HUBBUB_NOMEM; + + for (i = 0; i < sizeof(entities) / sizeof(entities[0]); i++) { + error = hubbub_dict_insert(dict, entities[i].name, + &entities[i]); + if (error != HUBBUB_OK) { + hubbub_dict_destroy(dict); + return error; + } + } + + return HUBBUB_OK; +} + +/** + * Destroy the entities dictionary + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be 
NULL)
+ */
+void hubbub_entities_destroy(hubbub_alloc alloc, void *pw)
+{
+	UNUSED(alloc);
+	UNUSED(pw);
+
+	hubbub_dict_destroy(dict);
+}
+
+/**
+ * Step-wise search for an entity in the dictionary
+ *
+ * \param c        Character to look for
+ * \param result   Pointer to location for result
+ * \param context  Pointer to location for search context
+ * \return HUBBUB_OK if key found,
+ *         HUBBUB_NEEDDATA if more steps are required,
+ *         HUBBUB_INVALID if nothing matches
+ *
+ * The value pointed to by ::context should be NULL for the first call.
+ * Thereafter, pass in the same value as returned by the previous call.
+ * The context is opaque to the caller and should not be inspected.
+ *
+ * The location pointed to by ::result will be set to U+FFFD unless a match
+ * is found.
+ */
+hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result,
+		void **context)
+{
+	const hubbub_entity *e;
+	hubbub_error error;
+
+	if (result == NULL || context == NULL)
+		return HUBBUB_BADPARM;
+
+	error = hubbub_dict_search_step(dict, c,
+			(const void **) (const void *) &e,
+			context);
+	if (error != HUBBUB_OK) {
+		*result = 0xFFFD;
+		return error;
+	}
+
+	*result = e->ucs4;
+
+	return HUBBUB_OK;
+}
diff --git a/src/tokeniser/entities.h b/src/tokeniser/entities.h
new file mode 100644
index 0000000..efd1987
--- /dev/null
+++ b/src/tokeniser/entities.h
@@ -0,0 +1,25 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#ifndef hubbub_tokeniser_entities_h_
+#define hubbub_tokeniser_entities_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+
+/* Create the entities dictionary */
+hubbub_error hubbub_entities_create(hubbub_alloc alloc, void *pw);
+/* Destroy the entities dictionary */
+void hubbub_entities_destroy(hubbub_alloc alloc, void *pw);
+
+/* Step-wise search for an entity in the dictionary */
+hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result,
+		void **context);
+
+#endif
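For context, the step-wise interface above is driven one byte at a time, threading the opaque context pointer between calls. Below is a minimal sketch of a caller. It is illustrative only and not part of this commit: my_alloc and the driver are hypothetical, the realloc-style hubbub_alloc contract is the one hubbub_tokeniser_create later in this import relies on, and it assumes the dictionary reports a completed key with HUBBUB_OK exactly as the doc-comment describes.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "tokeniser/entities.h"

/* realloc-style allocator, matching how this import calls hubbub_alloc */
static void *my_alloc(void *ptr, size_t size, void *pw)
{
	(void) pw;

	if (size == 0) {
		free(ptr);
		return NULL;
	}

	return realloc(ptr, size);
}

int main(void)
{
	const char name[] = "eacute";
	uint32_t ucs4 = 0xFFFD;
	void *ctx = NULL;		/* NULL for the first step */
	hubbub_error err = HUBBUB_NEEDDATA;
	const char *p;

	if (hubbub_entities_create(my_alloc, NULL) != HUBBUB_OK)
		return 1;

	/* Feed the entity name one byte at a time, reusing ctx between steps */
	for (p = name; *p != '\0' && err != HUBBUB_INVALID; p++)
		err = hubbub_entities_search_step((uint8_t) *p, &ucs4, &ctx);

	if (err == HUBBUB_OK)
		printf("&%s; => U+%04X\n", name, (unsigned) ucs4);

	hubbub_entities_destroy(my_alloc, NULL);

	return 0;
}

Feeding bytes incrementally is what lets the tokeniser below match entities directly against the input stream without first buffering a complete name.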
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
new file mode 100644
index 0000000..f8b6bb3
--- /dev/null
+++ b/src/tokeniser/tokeniser.c
@@ -0,0 +1,2282 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "utils/utils.h"
+
+#include "tokeniser/entities.h"
+#include "tokeniser/tokeniser.h"
+
+/**
+ * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
+ */
+static const uint32_t cp1252Table[32] = {
+	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
+	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
+};
+
+/**
+ * Tokeniser states
+ */
+typedef enum hubbub_tokeniser_state {
+	HUBBUB_TOKENISER_STATE_DATA,
+	HUBBUB_TOKENISER_STATE_ENTITY_DATA,
+	HUBBUB_TOKENISER_STATE_TAG_OPEN,
+	HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN,
+	HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH,
+	HUBBUB_TOKENISER_STATE_TAG_NAME,
+	HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME,
+	HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME,
+	HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME,
+	HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE,
+	HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ,
+	HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ,
+	HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ,
+	HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE,
+	HUBBUB_TOKENISER_STATE_BOGUS_COMMENT,
+	HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN,
+	HUBBUB_TOKENISER_STATE_COMMENT_START,
+	HUBBUB_TOKENISER_STATE_COMMENT,
+	HUBBUB_TOKENISER_STATE_COMMENT_DASH,
+	HUBBUB_TOKENISER_STATE_COMMENT_END,
+	HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE,
+	HUBBUB_TOKENISER_STATE_DOCTYPE,
+	HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME,
+	HUBBUB_TOKENISER_STATE_DOCTYPE_NAME,
+	HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME,
+	HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE,
+	HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY,
+	HUBBUB_TOKENISER_STATE_NAMED_ENTITY
+} hubbub_tokeniser_state;
+
+/**
+ * Context for tokeniser
+ */
+typedef struct hubbub_tokeniser_context {
+	hubbub_token_type current_tag_type;	/**< Type of current_tag */
+	hubbub_tag current_tag;			/**< Current tag */
+
+	hubbub_string current_comment;		/**< Current comment */
+
+	hubbub_doctype current_doctype;		/**< Current doctype */
+
+	hubbub_string current_chars;		/**< Pending characters */
+
+	hubbub_tokeniser_state prev_state;	/**< Previous state */
+
+	struct {
+		hubbub_string tag;		/**< Pending close tag */
+	} close_tag_match;
+
+	struct {
+		uint32_t count;			/**< Index into "DOCTYPE" */
+	} match_doctype;
+
+	struct {
+		hubbub_string str;		/**< Pending string */
+		uint8_t base;			/**< Base for numeric
+						 * entities */
+		uint32_t codepoint;		/**< UCS4 codepoint */
+		bool had_data;			/**< Whether we read
+						 * anything after &#(x)?
*/ + hubbub_tokeniser_state return_state; /**< State we were + * called from */ + bool complete; /**< Flag that entity + * matching completed */ + bool done_setup; /**< Flag that match setup + * has completed */ + void *context; /**< Context for named + * entity search */ + size_t prev_len; /**< Previous byte length + * of str */ + } match_entity; + + struct { + uint32_t line; /**< Current line of input */ + uint32_t col; /**< Current character in + * line */ + } position; +} hubbub_tokeniser_context; + +/** + * Tokeniser data structure + */ +struct hubbub_tokeniser { + hubbub_tokeniser_state state; /**< Current tokeniser state */ + hubbub_content_model content_model; /**< Current content + * model flag */ + + hubbub_inputstream *input; /**< Input stream */ + + const uint8_t *input_buffer; /**< Start of input stream's buffer */ + size_t input_buffer_len; /**< Length of input buffer */ + + hubbub_tokeniser_context context; /**< Tokeniser context */ + + hubbub_token_handler token_handler; + void *token_pw; + + hubbub_buffer_handler buffer_handler; + void *buffer_pw; + + hubbub_error_handler error_handler; + void *error_pw; + + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *alloc_pw; /**< Client private data */ +}; + +static bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_close_tag_open( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_close_tag_match( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_before_attribute_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_after_attribute_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_before_attribute_value( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_value_dq( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_value_sq( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_value_uq( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_entity_in_attribute_value( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_bogus_comment( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_markup_declaration_open( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment_start( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment_dash( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_match_doctype( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_before_doctype_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_doctype_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_after_doctype_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_bogus_doctype( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser); +static bool 
hubbub_tokeniser_handle_numbered_entity( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_named_entity( + hubbub_tokeniser *tokeniser); +static void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer, + size_t len, void *pw); +static void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, + hubbub_token *token); + +/** + * Create a hubbub tokeniser + * + * \param input Input stream instance + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to tokeniser instance, or NULL on failure + */ +hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input, + hubbub_alloc alloc, void *pw) +{ + hubbub_tokeniser *tok; + + if (input == NULL || alloc == NULL) + return NULL; + + tok = alloc(NULL, sizeof(hubbub_tokeniser), pw); + if (tok == NULL) + return NULL; + + tok->state = HUBBUB_TOKENISER_STATE_DATA; + tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA; + + tok->input = input; + tok->input_buffer = NULL; + tok->input_buffer_len = 0; + + tok->token_handler = NULL; + tok->token_pw = NULL; + + tok->buffer_handler = NULL; + tok->buffer_pw = NULL; + + tok->error_handler = NULL; + tok->error_pw = NULL; + + tok->alloc = alloc; + tok->alloc_pw = pw; + + if (hubbub_inputstream_register_movehandler(input, + hubbub_tokeniser_buffer_moved_handler, tok) != + HUBBUB_OK) { + alloc(tok, 0, pw); + return NULL; + } + + memset(&tok->context, 0, sizeof(hubbub_tokeniser_context)); + + return tok; +} + +/** + * Destroy a hubbub tokeniser + * + * \param tokeniser The tokeniser instance to destroy + */ +void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser) +{ + if (tokeniser == NULL) + return; + + hubbub_inputstream_deregister_movehandler(tokeniser->input, + hubbub_tokeniser_buffer_moved_handler, tokeniser); + + if (tokeniser->context.current_tag.attributes != NULL) { + tokeniser->alloc(tokeniser->context.current_tag.attributes, + 0, tokeniser->alloc_pw); + } + + tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw); +} + +/** + * Configure a hubbub tokeniser + * + * \param tokeniser The tokeniser instance to configure + * \param type The option type to set + * \param params Option-specific parameters + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser, + hubbub_tokeniser_opttype type, + hubbub_tokeniser_optparams *params) +{ + if (tokeniser == NULL || params == NULL) + return HUBBUB_BADPARM; + + switch (type) { + case HUBBUB_TOKENISER_TOKEN_HANDLER: + tokeniser->token_handler = params->token_handler.handler; + tokeniser->token_pw = params->token_handler.pw; + break; + case HUBBUB_TOKENISER_BUFFER_HANDLER: + tokeniser->buffer_handler = params->buffer_handler.handler; + tokeniser->buffer_pw = params->buffer_handler.pw; + tokeniser->buffer_handler(tokeniser->input_buffer, + tokeniser->input_buffer_len, + tokeniser->buffer_pw); + break; + case HUBBUB_TOKENISER_ERROR_HANDLER: + tokeniser->error_handler = params->error_handler.handler; + tokeniser->error_pw = params->error_handler.pw; + break; + case HUBBUB_TOKENISER_CONTENT_MODEL: + tokeniser->content_model = params->content_model.model; + break; + } + + return HUBBUB_OK; +} + +/** + * Process remaining data in the input stream + * + * \param tokeniser The tokeniser instance to invoke + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser) +{ + bool cont = true; + + if (tokeniser == NULL) + return 
HUBBUB_BADPARM; + + while (cont) { + switch (tokeniser->state) { + case HUBBUB_TOKENISER_STATE_DATA: + cont = hubbub_tokeniser_handle_data(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ENTITY_DATA: + cont = hubbub_tokeniser_handle_entity_data( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_TAG_OPEN: + cont = hubbub_tokeniser_handle_tag_open(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN: + cont = hubbub_tokeniser_handle_close_tag_open( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH: + cont = hubbub_tokeniser_handle_close_tag_match( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_TAG_NAME: + cont = hubbub_tokeniser_handle_tag_name(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME: + cont = hubbub_tokeniser_handle_before_attribute_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME: + cont = hubbub_tokeniser_handle_attribute_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME: + cont = hubbub_tokeniser_handle_after_attribute_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE: + cont = hubbub_tokeniser_handle_before_attribute_value( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ: + cont = hubbub_tokeniser_handle_attribute_value_dq( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ: + cont = hubbub_tokeniser_handle_attribute_value_sq( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ: + cont = hubbub_tokeniser_handle_attribute_value_uq( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE: + cont = hubbub_tokeniser_handle_entity_in_attribute_value( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BOGUS_COMMENT: + cont = hubbub_tokeniser_handle_bogus_comment( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN: + cont = hubbub_tokeniser_handle_markup_declaration_open( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT_START: + cont = hubbub_tokeniser_handle_comment_start( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT: + cont = hubbub_tokeniser_handle_comment(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT_DASH: + cont = hubbub_tokeniser_handle_comment_dash( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT_END: + cont = hubbub_tokeniser_handle_comment_end( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE: + cont = hubbub_tokeniser_handle_match_doctype( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_DOCTYPE: + cont = hubbub_tokeniser_handle_doctype(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME: + cont = hubbub_tokeniser_handle_before_doctype_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_DOCTYPE_NAME: + cont = hubbub_tokeniser_handle_doctype_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME: + cont = hubbub_tokeniser_handle_after_doctype_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE: + cont = hubbub_tokeniser_handle_bogus_doctype( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY: + cont = hubbub_tokeniser_handle_numbered_entity( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_NAMED_ENTITY: + cont = hubbub_tokeniser_handle_named_entity( + tokeniser); + break; + } + } + + return HUBBUB_OK; +} + +bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) +{ + hubbub_token token; + uint32_t c; + + /* Clear current characters */ + 
tokeniser->context.current_chars.data_off = 0; + tokeniser->context.current_chars.len = 0; + + while ((c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + if (c == '&' && (tokeniser->content_model == + HUBBUB_CONTENT_MODEL_PCDATA || + tokeniser->content_model == + HUBBUB_CONTENT_MODEL_RCDATA)) { + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_DATA; + /* Don't eat the '&'; it'll be handled by + * entity consumption */ + break; + } else if (c == '<' && tokeniser->content_model != + HUBBUB_CONTENT_MODEL_PLAINTEXT) { + if (tokeniser->context.current_chars.len > 0) { + /* Emit any pending characters */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, + &token); + } + + /* Buffer '<' */ + tokeniser->context.current_chars.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &tokeniser->context.current_chars.len); + + tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_OPEN; + hubbub_inputstream_advance(tokeniser->input); + break; + } else { + uint32_t pos; + size_t len; + + /* Accumulate characters into buffer */ + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + if (tokeniser->context.current_chars.len == 0) { + tokeniser->context.current_chars.data_off = + pos; + } + tokeniser->context.current_chars.len++; + + hubbub_inputstream_advance(tokeniser->input); + } + } + + if (tokeniser->state != HUBBUB_TOKENISER_STATE_TAG_OPEN && + tokeniser->context.current_chars.len > 0) { + /* Emit any pending characters */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->context.current_chars.data_off = 0; + tokeniser->context.current_chars.len = 0; + } + + if (c == HUBBUB_INPUTSTREAM_EOF) { + token.type = HUBBUB_TOKEN_EOF; + + hubbub_tokeniser_emit_token(tokeniser, &token); + } + + return (c != HUBBUB_INPUTSTREAM_EOF && c != HUBBUB_INPUTSTREAM_OOD); +} + +bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser) +{ + if (tokeniser->context.match_entity.complete == false) { + return hubbub_tokeniser_consume_entity(tokeniser); + } else { + hubbub_token token; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD || + c == HUBBUB_INPUTSTREAM_EOF) { + /* Should never happen */ + abort(); + } + + /* Emit character */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &token.data.character.len); + + hubbub_tokeniser_emit_token(tokeniser, &token); + + /* Reset for next time */ + tokeniser->context.match_entity.complete = false; + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t pos; + size_t len; + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || + tokeniser->content_model == + HUBBUB_CONTENT_MODEL_CDATA) { + if (c == '/') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + tokeniser->context.current_chars.len += len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN; + + hubbub_inputstream_advance(tokeniser->input); + } else { + hubbub_token token; + 
+ /* Emit '<' */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = + HUBBUB_TOKENISER_STATE_DATA; + } + } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) { + if (c == '!') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_chars.len += len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_chars.len += len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + hubbub_inputstream_lowercase(tokeniser->input); + + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_START_TAG; + + ctag->name.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &ctag->name.len); + ctag->n_attributes = 0; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if ('a' <= c && c <= 'z') { + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_START_TAG; + + ctag->name.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &ctag->name.len); + ctag->n_attributes = 0; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + tokeniser->context.current_chars.len += len; + + /* Emit "<>" */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = + HUBBUB_TOKENISER_STATE_DATA; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '?') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + tokeniser->context.current_chars.len += len; + + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len = len; + tokeniser->state = + HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + hubbub_inputstream_advance(tokeniser->input); + } else { + hubbub_token token; + + /* Emit '<' */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = + HUBBUB_TOKENISER_STATE_DATA; + } + } + + return true; +} + +bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser) +{ + if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || + tokeniser->content_model == + HUBBUB_CONTENT_MODEL_CDATA) { + tokeniser->context.close_tag_match.tag.len = 0; + tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH; + } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) { + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + uint32_t pos; + size_t len; + + if ('A' <= c && c <= 'Z') { + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_END_TAG; + ctag->name.data_off = pos; + ctag->name.len = len; + ctag->n_attributes = 0; + + tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if ('a' <= c && c <= 'z') { + pos = 
hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+
+			tokeniser->context.current_tag_type =
+					HUBBUB_TOKEN_END_TAG;
+			ctag->name.data_off = pos;
+			ctag->name.len = len;
+			ctag->n_attributes = 0;
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (c == '>') {
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+			hubbub_token token;
+
+			/* Emit "</" */
+			token.type = HUBBUB_TOKEN_CHARACTER;
+			token.data.character =
+					tokeniser->context.current_chars;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		} else if (c != HUBBUB_INPUTSTREAM_OOD) {
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+
+			tokeniser->context.current_comment.data_off = pos;
+			tokeniser->context.current_comment.len = len;
+
+			tokeniser->state =
+					HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else {
+			/* Out of data */
+			return false;
+		}
+	}
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = 0;
+
+	while (ctx->close_tag_match.tag.len < ctag->name.len &&
+			(c = hubbub_inputstream_peek(tokeniser->input)) !=
+			HUBBUB_INPUTSTREAM_EOF &&
+			c != HUBBUB_INPUTSTREAM_OOD) {
+		/* Match last open tag */
+		uint32_t off;
+		size_t len;
+
+		off = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		if (ctx->close_tag_match.tag.len == 0) {
+			ctx->close_tag_match.tag.data_off = off;
+			ctx->close_tag_match.tag.len = len;
+		} else {
+			ctx->close_tag_match.tag.len += len;
+		}
+
+		hubbub_inputstream_advance(tokeniser->input);
+
+		if (ctx->close_tag_match.tag.len > ctag->name.len ||
+				(ctx->close_tag_match.tag.len == ctag->name.len &&
+				hubbub_inputstream_compare_range_ci(
+						tokeniser->input,
+						ctag->name.data_off,
+						ctx->close_tag_match.tag.data_off,
+						ctag->name.len) != 0)) {
+			hubbub_token token;
+
+			/* Rewind input stream to start of tag name */
+			if (hubbub_inputstream_rewind(tokeniser->input,
+					ctx->close_tag_match.tag.len) !=
+					HUBBUB_OK)
+				abort();
+
+			/* Emit "</" */
+			token.type = HUBBUB_TOKEN_CHARACTER;
+			token.data.character =
+					tokeniser->context.current_chars;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+			return true;
+		} else if (ctx->close_tag_match.tag.len == ctag->name.len &&
+				hubbub_inputstream_compare_range_ci(
+						tokeniser->input,
+						ctag->name.data_off,
+						ctx->close_tag_match.tag.data_off,
+						ctag->name.len) == 0) {
+			/* Matched => stop searching */
+			break;
+		}
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_OOD) {
+		/* Need more data */
+		return false;
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_EOF) {
+		/* Ran out of data - parse error */
+		hubbub_token token;
+
+		/* Rewind input stream to start of tag name */
+		if (hubbub_inputstream_rewind(tokeniser->input,
+				ctx->close_tag_match.tag.len) != HUBBUB_OK)
+			abort();
+
+		/* Emit "</" */
+		token.type = HUBBUB_TOKEN_CHARACTER;
+		token.data.character = tokeniser->context.current_chars;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		return true;
+	}
+
+	/* Match following char */
+	c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD) {
+		/* Need more data */
+		return false;
+	}
+
+	/* Rewind input stream to start of tag name */
+	if (hubbub_inputstream_rewind(tokeniser->input,
+			ctx->close_tag_match.tag.len) != HUBBUB_OK)
+		abort();
+
+	/* Check that following char was valid */
+	if (c != '\t' && c != '\n' && c != '\v' && c != '\f' &&
+			c != ' ' && c != '>' && c != '/' && c != '<' &&
+			c != HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit "</" */
+		token.type = HUBBUB_TOKEN_CHARACTER;
+		token.data.character = tokeniser->context.current_chars;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		return true;
+	}
+
+	/* Switch the content model back to PCDATA */
+	tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
+
+	/* Finally, transition back to close tag open state */
+	tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN;
+
+	return true;
+}
+
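The restartable bookkeeping above can obscure the rule being implemented: in CDATA/RCDATA, "</" only closes the element when it is followed by the current tag's name, compared case-insensitively, and then a delimiter; otherwise the buffered "</" is emitted as character data. A self-contained sketch of just that rule (illustrative only, not hubbub code; closes_current_tag is a hypothetical helper, with '\0' standing in for EOF):

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Does the text at 'input' close an open element named 'tag_name'? */
static bool closes_current_tag(const char *input, const char *tag_name)
{
	size_t i, n = strlen(tag_name);
	char c;

	if (strncmp(input, "</", 2) != 0)
		return false;

	/* Candidate name must match the open tag, case-insensitively */
	for (i = 0; i < n; i++) {
		if (tolower((unsigned char) input[2 + i]) !=
				tolower((unsigned char) tag_name[i]))
			return false;
	}

	/* ...and be followed by a delimiter (or end of input) */
	c = input[2 + n];
	return c == '\t' || c == '\n' || c == '\v' || c == '\f' ||
			c == ' ' || c == '>' || c == '/' || c == '<' ||
			c == '\0';
}

int main(void)
{
	printf("%d\n", closes_current_tag("</TITLE>", "title"));  /* 1 */
	printf("%d\n", closes_current_tag("</titles>", "title")); /* 0 */
	printf("%d\n", closes_current_tag("</em foo>", "em"));    /* 1 */
	return 0;
}

The streaming version above must additionally cope with running out of data mid-name, hence the close_tag_match context and the rewinds on mismatch.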
bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>') {
+		hubbub_token token;
+
+		/* Emit current tag */
+		token.type = tokeniser->context.current_tag_type;
+		token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if ('A' <= c && c <= 'Z') {
+		uint32_t pos;
+		size_t len;
+
+		hubbub_inputstream_lowercase(tokeniser->input);
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		ctag->name.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit current tag */
+		token.type = tokeniser->context.current_tag_type;
+		token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else if (c == '/') {
+		/** \todo permitted slash */
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else {
+		uint32_t pos;
+		size_t len;
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		ctag->name.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_before_attribute_name(
+		hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>') {
+		hubbub_token token;
+
+		/* Emit current tag */
+		token.type = tokeniser->context.current_tag_type;
+		token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if ('A' <= c && c <= 'Z') {
+		uint32_t pos;
+		size_t len;
+		hubbub_attribute *attr;
+
+		hubbub_inputstream_lowercase(tokeniser->input);
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		attr = tokeniser->alloc(ctag->attributes,
+				(ctag->n_attributes + 1) *
+					sizeof(hubbub_attribute),
+				tokeniser->alloc_pw);
+		if (attr == NULL) {
+			/** \todo handle memory exhaustion */
+		}
+
+		ctag->attributes = attr;
+
+		attr[ctag->n_attributes].name.data_off = pos;
+		attr[ctag->n_attributes].name.len = len;
+		attr[ctag->n_attributes].value.data_off = 0;
+		attr[ctag->n_attributes].value.len = 0;
+
+		ctag->n_attributes++;
+
tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + /** \todo permitted slash */ + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '=') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + uint32_t pos; + size_t len; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->attributes[ctag->n_attributes - 1].name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + /** \todo permitted slash */ + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->attributes[ctag->n_attributes - 1].name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_after_attribute_name( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '=') { + 
tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + /** \todo permitted slash */ + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_before_attribute_value( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '"') { + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ; + } else if (c == '\'') { + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + 
hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->attributes[ctag->n_attributes - 1].value.data_off = pos; + ctag->attributes[ctag->n_attributes - 1].value.len = len; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '"') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->context.prev_state = tokeniser->state; + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE; + /* Don't eat the '&'; entity consumption handles this */ + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\'') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->context.prev_state = tokeniser->state; + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE; + /* Don't eat the '&'; entity consumption handles this */ + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + tokeniser->state = + 
HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->context.prev_state = tokeniser->state; + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE; + /* Don't eat the '&'; entity consumption handles this */ + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_entity_in_attribute_value( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t pos; + size_t len; + + if (tokeniser->context.match_entity.complete == false) { + return hubbub_tokeniser_consume_entity(tokeniser); + } else { + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD || + c == HUBBUB_INPUTSTREAM_EOF) { + /* Should never happen */ + abort(); + } + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + /* Reset for next time */ + tokeniser->context.match_entity.complete = false; + + /* And back to the previous state */ + tokeniser->state = tokeniser->context.prev_state; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser) +{ + hubbub_token token; + uint32_t c; + + while ((c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + uint32_t pos; + size_t len; + + if (c == '>') { + hubbub_inputstream_advance(tokeniser->input); + break; + } + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + return true; +} + +bool hubbub_tokeniser_handle_markup_declaration_open( + hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '-') { + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_START; + hubbub_inputstream_advance(tokeniser->input); + 
} else if ((c & ~0x20) == 'D') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count = 1; + tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE; + hubbub_inputstream_advance(tokeniser->input); + } else { + tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.len = 0; + + tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + } + + return true; +} + +bool hubbub_tokeniser_handle_comment_start(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.len = 0; + + + if (c == '-') { + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT; + hubbub_inputstream_advance(tokeniser->input); + } else { + hubbub_inputstream_push_back(tokeniser->input, '-'); + tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + } + + return true; +} + +bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '-') { + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_DASH; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_comment_dash(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '-') { + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) { + tokeniser->context.current_comment.data_off = pos; + } else { + /* Need to do this to get length of '-' */ + len += pos - + tokeniser->context.current_comment.data_off; + } + + tokeniser->context.current_comment.len = len; + + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '>') { + hubbub_token token; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == 
+bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '>') {
+		hubbub_token token;
+
+		/* Emit comment */
+		token.type = HUBBUB_TOKEN_COMMENT;
+		token.data.comment = tokeniser->context.current_comment;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '-') {
+		uint32_t pos;
+		size_t len;
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		if (tokeniser->context.current_comment.len == 0) {
+			tokeniser->context.current_comment.data_off = pos;
+			tokeniser->context.current_comment.len = len;
+		} else {
+			/* Need to do this to get length of '-' */
+			len = pos -
+				tokeniser->context.current_comment.data_off;
+		}
+
+		tokeniser->context.current_comment.len = len;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit comment */
+		token.type = HUBBUB_TOKEN_COMMENT;
+		token.data.comment = tokeniser->context.current_comment;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else {
+		uint32_t pos;
+		size_t len;
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		if (tokeniser->context.current_comment.len == 0) {
+			tokeniser->context.current_comment.data_off = pos;
+		} else {
+			/* Need to do this to get length of '--' */
+			len += pos -
+				tokeniser->context.current_comment.data_off;
+		}
+
+		tokeniser->context.current_comment.len = len;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (tokeniser->context.match_doctype.count == 1 &&
+			(c & ~0x20) == 'O') {
+		hubbub_inputstream_uppercase(tokeniser->input);
+		tokeniser->context.match_doctype.count++;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (tokeniser->context.match_doctype.count == 2 &&
+			(c & ~0x20) == 'C') {
+		hubbub_inputstream_uppercase(tokeniser->input);
+		tokeniser->context.match_doctype.count++;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (tokeniser->context.match_doctype.count == 3 &&
+			(c & ~0x20) == 'T') {
+		hubbub_inputstream_uppercase(tokeniser->input);
+		tokeniser->context.match_doctype.count++;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (tokeniser->context.match_doctype.count == 4 &&
+			(c & ~0x20) == 'Y') {
+		hubbub_inputstream_uppercase(tokeniser->input);
+		tokeniser->context.match_doctype.count++;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (tokeniser->context.match_doctype.count == 5 &&
+			(c & ~0x20) == 'P') {
+		hubbub_inputstream_uppercase(tokeniser->input);
+		tokeniser->context.match_doctype.count++;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (tokeniser->context.match_doctype.count == 6 &&
+			(c & ~0x20) == 'E') {
+		hubbub_inputstream_uppercase(tokeniser->input);
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else {
+		switch (tokeniser->context.match_doctype.count) {
+		case 6: hubbub_inputstream_push_back(tokeniser->input, 'P');
+		case 5: hubbub_inputstream_push_back(tokeniser->input, 'Y');
+		case 4: hubbub_inputstream_push_back(tokeniser->input, 'T');
+		case 3: hubbub_inputstream_push_back(tokeniser->input, 'C');
+		case 2: hubbub_inputstream_push_back(tokeniser->input, 'O');
+		case 1: hubbub_inputstream_push_back(tokeniser->input, 'D');
+		}
+
+		tokeniser->context.current_comment.data_off = 0;
+		tokeniser->context.current_comment.len = 0;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+	}
+
+	return true;
+}
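Two details above are easy to misread: (c & ~0x20) clears ASCII bit 5, which folds 'a'..'z' onto 'A'..'Z' so each DOCTYPE letter matches case-insensitively with a single comparison, and the switch in the failure path deliberately falls through case after case so the already-matched letters are pushed back in reverse order. A small standalone demo of the case-folding trick (illustrative only, not hubbub code):

#include <stdio.h>

int main(void)
{
	unsigned int c;

	/* Clearing bit 5 maps lowercase ASCII letters to uppercase */
	for (c = 'a'; c <= 'f'; c++)
		printf("%c & ~0x20 -> %c\n", c, c & ~0x20);

	/* So one test matches both cases; safe because we then compare
	 * against a specific letter, as the handlers do */
	printf("matches D: %d %d\n",
			('d' & ~0x20) == 'D', ('D' & ~0x20) == 'D');

	return 0;
}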
+bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME;
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_before_doctype_name(
+		hubbub_tokeniser *tokeniser)
+{
+	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if ('a' <= c && c <= 'z') {
+		uint32_t pos;
+		size_t len;
+
+		hubbub_inputstream_uppercase(tokeniser->input);
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		cdoc->name.data_off = pos;
+		cdoc->name.len = len;
+		cdoc->correct = false;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_NAME;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>') {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else {
+		uint32_t pos;
+		size_t len;
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		cdoc->name.data_off = pos;
+		cdoc->name.len = len;
+		cdoc->correct = false;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_NAME;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
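The two DOCTYPE-name handlers that follow decide token.data.doctype.correct by case-insensitively comparing the accumulated name against "HTML" via hubbub_inputstream_compare_range_ascii. A standalone approximation of that comparison over plain C strings is sketched below; ascii_casecmp_n is invented for illustration, whereas hubbub performs the same check over stream offsets.

#include <ctype.h>
#include <stddef.h>
#include <stdio.h>

static int ascii_casecmp_n(const char *a, const char *b, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++) {
		int ca = toupper((unsigned char) a[i]);
		int cb = toupper((unsigned char) b[i]);

		if (ca != cb)
			return ca - cb;
	}

	return 0;
}

int main(void)
{
	/* The doctype is flagged "correct" when its name is "HTML",
	 * length included (hubbub also checks name.len) */
	printf("%d\n", ascii_casecmp_n("html", "HTML", 4) == 0);	/* 1 */
	printf("%d\n", ascii_casecmp_n("xml", "HTML", 3) == 0);		/* 0 */
	return 0;
}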
+bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>') {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+		token.data.doctype.correct =
+			(hubbub_inputstream_compare_range_ascii(
+				tokeniser->input,
+				token.data.doctype.name.data_off,
+				token.data.doctype.name.len,
+				"HTML", SLEN("HTML")) == 0);
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if ('a' <= c && c <= 'z') {
+		uint32_t pos;
+		size_t len;
+
+		hubbub_inputstream_uppercase(tokeniser->input);
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		cdoc->name.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else {
+		uint32_t pos;
+		size_t len;
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		cdoc->name.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_after_doctype_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>') {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+		token.data.doctype.correct =
+			(hubbub_inputstream_compare_range_ascii(
+				tokeniser->input,
+				token.data.doctype.name.data_off,
+				token.data.doctype.name.len,
+				"HTML", SLEN("HTML")) == 0);
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else {
+		cdoc->correct = false;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '>') {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else {
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
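The entity machinery below is re-entrant: consume_entity records the state it was entered from, a done_setup flag keeps the one-time initialisation from re-running when the handler is resumed after running out of data, and complete tells the calling state that a result is ready. A minimal sketch of that pattern follows, with invented names (matcher, matcher_begin, matcher_finish are illustrative, not hubbub API).

#include <stdbool.h>

enum state { STATE_DATA, STATE_ENTITY /* ... */ };

struct matcher {
	bool done_setup;	 /* has setup already run? */
	bool complete;		 /* is a result ready? */
	enum state return_state; /* where to go when finished */
	unsigned long codepoint; /* result accumulator */
};

static void matcher_begin(struct matcher *m, enum state current)
{
	if (m->done_setup)
		return;		/* resuming: keep existing progress */

	m->codepoint = 0;
	m->complete = false;
	m->return_state = current;
	m->done_setup = true;
}

static enum state matcher_finish(struct matcher *m)
{
	m->done_setup = false;	/* reset for next time */
	m->complete = true;
	return m->return_state;	/* caller restores this state */
}

int main(void)
{
	struct matcher m = { 0 };

	matcher_begin(&m, STATE_DATA);
	/* ... entity characters would be consumed here, possibly
	 * across several resumptions ... */
	return (int) matcher_finish(&m);	/* back to STATE_DATA */
}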
+bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c;
+	uint32_t pos;
+	size_t len;
+
+	if (tokeniser->context.match_entity.done_setup == false) {
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		tokeniser->context.match_entity.str.data_off = pos;
+		tokeniser->context.match_entity.str.len = len;
+		tokeniser->context.match_entity.base = 0;
+		tokeniser->context.match_entity.codepoint = 0;
+		tokeniser->context.match_entity.had_data = false;
+		tokeniser->context.match_entity.return_state =
+				tokeniser->state;
+		tokeniser->context.match_entity.complete = false;
+		tokeniser->context.match_entity.done_setup = true;
+		tokeniser->context.match_entity.context = NULL;
+		tokeniser->context.match_entity.prev_len = len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '#') {
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		tokeniser->context.match_entity.str.len += len;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else {
+		tokeniser->state = HUBBUB_TOKENISER_STATE_NAMED_ENTITY;
+	}
+
+	return true;
+}
+
+bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	uint32_t pos;
+	size_t len;
+	hubbub_error error;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (ctx->match_entity.base == 0) {
+		if ((c & ~0x20) == 'X') {
+			ctx->match_entity.base = 16;
+
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->match_entity.str.len += len;
+
+			hubbub_inputstream_advance(tokeniser->input);
+		} else {
+			ctx->match_entity.base = 10;
+		}
+	}
+
+	while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
+			HUBBUB_INPUTSTREAM_EOF &&
+			c != HUBBUB_INPUTSTREAM_OOD) {
+		if (ctx->match_entity.base == 10 &&
+				('0' <= c && c <= '9')) {
+			ctx->match_entity.had_data = true;
+
+			ctx->match_entity.codepoint =
+				ctx->match_entity.codepoint * 10 + (c - '0');
+
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->match_entity.str.len += len;
+		} else if (ctx->match_entity.base == 16 &&
+				(('0' <= c && c <= '9') ||
+				('A' <= (c & ~0x20) &&
+						(c & ~0x20) <= 'F'))) {
+			ctx->match_entity.had_data = true;
+
+			ctx->match_entity.codepoint *= 16;
+
+			if ('0' <= c && c <= '9') {
+				ctx->match_entity.codepoint += (c - '0');
+			} else {
+				ctx->match_entity.codepoint +=
+						((c & ~0x20) - 'A' + 10);
+			}
+
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->match_entity.str.len += len;
+		} else {
+			break;
+		}
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Eat trailing semicolon, if any */
+	if (c == ';') {
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+		ctx->match_entity.str.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	/* Rewind the inputstream to start of matched sequence */
+	hubbub_inputstream_rewind(tokeniser->input,
+			ctx->match_entity.str.len);
+
+	if (ctx->match_entity.had_data) {
+		/* Had data, so calculate final codepoint */
+		if (0x80 <= ctx->match_entity.codepoint &&
+				ctx->match_entity.codepoint <= 0x9F) {
+			ctx->match_entity.codepoint =
+				cp1252Table[ctx->match_entity.codepoint -
+						0x80];
+		} else if (ctx->match_entity.codepoint == 0 ||
+				ctx->match_entity.codepoint > 0x10FFFF) {
+			ctx->match_entity.codepoint = 0xFFFD;
+		}
+
+		/* And replace the matched range with it */
+		error = hubbub_inputstream_replace_range(tokeniser->input,
+				ctx->match_entity.str.data_off,
+				ctx->match_entity.str.len,
+				ctx->match_entity.codepoint);
+		if (error != HUBBUB_OK) {
+			/** \todo handle memory exhaustion */
+		}
+	}
+
+	/* Reset for next time */
+	ctx->match_entity.done_setup = false;
+
+	/* Flag completion */
+	ctx->match_entity.complete = true;
+
+	/* And back to the state we were entered in */
+	tokeniser->state = ctx->match_entity.return_state;
+
+	return true;
+}
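The numeric path above folds digits into the codepoint one at a time, then post-processes the result: values in 0x80..0x9F are remapped through a Windows-1252 table (cp1252Table, defined elsewhere in the tree), and zero or anything above 0x10FFFF becomes U+FFFD. A standalone sketch of the hex accumulation and range check (fold_hex_digit is invented for illustration; the CP1252 remap itself is omitted):

#include <stdint.h>
#include <stdio.h>

static uint32_t fold_hex_digit(uint32_t cp, uint32_t c)
{
	cp *= 16;

	if ('0' <= c && c <= '9')
		cp += c - '0';
	else
		cp += (c & ~0x20) - 'A' + 10;	/* 'a'/'A' -> 10, etc. */

	return cp;
}

int main(void)
{
	/* &#x20AC; -> U+20AC (EURO SIGN) */
	const char *digits = "20AC";
	uint32_t cp = 0;

	for (; *digits != '\0'; digits++)
		cp = fold_hex_digit(cp, (uint32_t) *digits);

	/* Out-of-range or zero codepoints become the replacement char */
	if (cp == 0 || cp > 0x10FFFF)
		cp = 0xFFFD;

	printf("U+%04X\n", cp);	/* U+20AC */

	return 0;
}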
+bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	uint32_t c;
+	uint32_t pos;
+	size_t len;
+	hubbub_error error;
+
+	while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
+			HUBBUB_INPUTSTREAM_EOF &&
+			c != HUBBUB_INPUTSTREAM_OOD) {
+		uint32_t cp;
+
+		if (c > 0x7F) {
+			/* Entity names are ASCII only */
+			break;
+		}
+
+		error = hubbub_entities_search_step((uint8_t) c,
+				&cp,
+				&ctx->match_entity.context);
+		if (error == HUBBUB_OK) {
+			/* Had a match - store it for later */
+			ctx->match_entity.codepoint = cp;
+
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->match_entity.str.len += len;
+
+			/* And cache length, for replacement */
+			ctx->match_entity.prev_len =
+					ctx->match_entity.str.len;
+		} else if (error == HUBBUB_INVALID) {
+			/* No further matches - use last found */
+			break;
+		} else {
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->match_entity.str.len += len;
+		}
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Eat trailing semicolon, if any */
+	if (ctx->match_entity.codepoint != 0 && c == ';' &&
+			ctx->match_entity.prev_len ==
+					ctx->match_entity.str.len) {
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+		ctx->match_entity.prev_len += len;
+	}
+
+	/* Rewind the inputstream to start of processed sequence */
+	hubbub_inputstream_rewind(tokeniser->input,
+			ctx->match_entity.str.len);
+
+	/* Now, replace range, if we found a named entity */
+	if (ctx->match_entity.codepoint != 0) {
+		error = hubbub_inputstream_replace_range(tokeniser->input,
+				ctx->match_entity.str.data_off,
+				ctx->match_entity.prev_len,
+				ctx->match_entity.codepoint);
+		if (error != HUBBUB_OK) {
+			/** \todo handle memory exhaustion */
+		}
+	}
+
+	/* Reset for next time */
+	ctx->match_entity.done_setup = false;
+
+	/* Flag completion */
+	ctx->match_entity.complete = true;
+
+	/* And back to the state from whence we came */
+	tokeniser->state = ctx->match_entity.return_state;
+
+	return true;
+}
+
+/**
+ * Handle input stream buffer moving
+ *
+ * \param buffer  Pointer to buffer
+ * \param len     Length of data in buffer (bytes)
+ * \param pw      Pointer to our context
+ */
+void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer,
+		size_t len, void *pw)
+{
+	hubbub_tokeniser *tok = (hubbub_tokeniser *) pw;
+
+	tok->input_buffer = buffer;
+	tok->input_buffer_len = len;
+
+	if (tok->buffer_handler != NULL)
+		tok->buffer_handler(buffer, len, tok->buffer_pw);
+}
+
+/**
+ * Emit a token, performing sanity checks if necessary
+ *
+ * \param tokeniser  Tokeniser instance
+ * \param token      Token to emit
+ */
+void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
+		hubbub_token *token)
+{
+	if (tokeniser == NULL || token == NULL)
+		return;
+
+	/* Nothing to do if there's no registered handler */
+	if (tokeniser->token_handler == NULL)
+		return;
+
+	if (token->type == HUBBUB_TOKEN_START_TAG ||
+			token->type == HUBBUB_TOKEN_END_TAG) {
+		uint32_t i, j;
+		uint32_t n_attributes = token->data.tag.n_attributes;
+		hubbub_attribute *attrs =
+				token->data.tag.attributes;
+
+		/* Discard duplicate attributes */
+		for (i = 0; i < n_attributes; i++) {
+			for (j = 0; j < n_attributes; j++) {
+				uint32_t move;
+
+				if (j == i ||
+					attrs[i].name.len !=
+							attrs[j].name.len ||
+					hubbub_inputstream_compare_range_cs(
+						tokeniser->input,
+						attrs[i].name.data_off,
+						attrs[j].name.data_off,
+						attrs[i].name.len) != 0) {
+					/* Attributes don't match */
+					continue;
+				}
+
+				/* Calculate amount to move */
+				move = (n_attributes - 1 -
+					((i < j) ? j : i)) *
+					sizeof(hubbub_attribute);
+
+				if (move > 0) {
+					memmove((i < j) ? &attrs[j]
+							: &attrs[i],
+						(i < j) ? &attrs[j+1]
+							: &attrs[i+1],
+						move);
+				}
+
+				/* And reduce the number of attributes */
+				n_attributes--;
+			}
+		}
+
+		token->data.tag.n_attributes = n_attributes;
+	}
+
+	/* Finally, emit token */
+	tokeniser->token_handler(token, tokeniser->token_pw);
+}
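hubbub_tokeniser_emit_token above compacts duplicate attributes in place: whenever two names match, the later entries are shifted down one slot with memmove and the attribute count drops. A standalone sketch of the same compaction over plain C strings follows, simplified to keep the first occurrence of each name; dedupe and struct attr are invented for illustration, standing in for hubbub's stream-offset comparisons.

#include <stdio.h>
#include <string.h>

struct attr { const char *name; };

static size_t dedupe(struct attr *attrs, size_t n)
{
	size_t i, j;

	for (i = 0; i < n; i++) {
		for (j = i + 1; j < n; j++) {
			if (strcmp(attrs[i].name, attrs[j].name) != 0)
				continue;

			/* Shift everything after j down one slot */
			memmove(&attrs[j], &attrs[j + 1],
					(n - 1 - j) * sizeof(*attrs));
			n--;
			j--;	/* re-examine the slot we just filled */
		}
	}

	return n;
}

int main(void)
{
	struct attr a[] = { { "id" }, { "class" }, { "id" } };
	size_t n = dedupe(a, sizeof(a) / sizeof(a[0]));
	size_t i;

	for (i = 0; i < n; i++)
		printf("%s\n", a[i].name);	/* id, class */

	return 0;
}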
diff --git a/src/tokeniser/tokeniser.h b/src/tokeniser/tokeniser.h
new file mode 100644
index 0000000..20bbe20
--- /dev/null
+++ b/src/tokeniser/tokeniser.h
@@ -0,0 +1,71 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#ifndef hubbub_tokeniser_tokeniser_h_
+#define hubbub_tokeniser_tokeniser_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+#include "input/inputstream.h"
+
+typedef struct hubbub_tokeniser hubbub_tokeniser;
+
+/**
+ * Hubbub tokeniser option types
+ */
+typedef enum hubbub_tokeniser_opttype {
+	HUBBUB_TOKENISER_TOKEN_HANDLER,
+	HUBBUB_TOKENISER_BUFFER_HANDLER,
+	HUBBUB_TOKENISER_ERROR_HANDLER,
+	HUBBUB_TOKENISER_CONTENT_MODEL,
+} hubbub_tokeniser_opttype;
+
+/**
+ * Hubbub tokeniser option parameters
+ */
+typedef union hubbub_tokeniser_optparams {
+	struct {
+		hubbub_token_handler handler;
+		void *pw;
+	} token_handler;
+
+	struct {
+		hubbub_buffer_handler handler;
+		void *pw;
+	} buffer_handler;
+
+	struct {
+		hubbub_error_handler handler;
+		void *pw;
+	} error_handler;
+
+	struct {
+		hubbub_content_model model;
+	} content_model;
+} hubbub_tokeniser_optparams;
+
+/* Create a hubbub tokeniser */
+hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
+		hubbub_alloc alloc, void *pw);
+/* Destroy a hubbub tokeniser */
+void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser);
+
+/* Configure a hubbub tokeniser */
+hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
+		hubbub_tokeniser_opttype type,
+		hubbub_tokeniser_optparams *params);
+
+/* Process remaining data in the input stream */
+hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser);
+
+#endif
--
cgit v1.2.3