From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001
From: John Mark Bell <jmb@netsurf-browser.org>
Date: Sat, 23 Jun 2007 22:40:25 +0000
Subject: Import hubbub -- an HTML parsing library. Plenty of work still to do
 (like tree generation ;)

svn path=/trunk/hubbub/; revision=3359
---
 src/Makefile              |   79 ++
 src/charset/Makefile      |   53 ++
 src/charset/aliases.c     |  361 +++++++
 src/charset/aliases.h     |   42 +
 src/charset/codec.c       |  186 ++++
 src/charset/codec.h       |  153 +++
 src/charset/codec_iconv.c |  837 +++++++++++++++++
 src/charset/codec_impl.h  |   51 +
 src/charset/codec_utf8.c  |  620 ++++++++++++
 src/charset/detect.c      |  673 +++++++++++++
 src/charset/detect.h      |   22 +
 src/hubbub.c              |   63 ++
 src/input/Makefile        |   53 ++
 src/input/filter.c        |  380 ++++++++
 src/input/filter.h        |   57 ++
 src/input/inputstream.c   |  479 ++++++++++
 src/input/inputstream.h   |   98 ++
 src/input/streamimpl.h    |   77 ++
 src/input/utf8_stream.c   |  567 +++++++++++
 src/parser.c              |  237 +++++
 src/tokeniser/Makefile    |   53 ++
 src/tokeniser/entities.c  |  363 +++++++
 src/tokeniser/entities.h  |   25 +
 src/tokeniser/tokeniser.c | 2282 +++++++++++++++++++++++++++++++++++++++++++++
 src/tokeniser/tokeniser.h |   71 ++
 src/utils/Makefile        |   53 ++
 src/utils/dict.c          |  219 +++++
 src/utils/dict.h          |   31 +
 src/utils/errors.c        |   70 ++
 src/utils/utf8.c          |  368 ++++++++
 src/utils/utf8.h          |   38 +
 src/utils/utils.h         |   28 +
 32 files changed, 8689 insertions(+)
 create mode 100644 src/Makefile
 create mode 100644 src/charset/Makefile
 create mode 100644 src/charset/aliases.c
 create mode 100644 src/charset/aliases.h
 create mode 100644 src/charset/codec.c
 create mode 100644 src/charset/codec.h
 create mode 100644 src/charset/codec_iconv.c
 create mode 100644 src/charset/codec_impl.h
 create mode 100644 src/charset/codec_utf8.c
 create mode 100644 src/charset/detect.c
 create mode 100644 src/charset/detect.h
 create mode 100644 src/hubbub.c
 create mode 100644 src/input/Makefile
 create mode 100644 src/input/filter.c
 create mode 100644 src/input/filter.h
 create mode 100644 src/input/inputstream.c
 create mode 100644 src/input/inputstream.h
 create mode 100644 src/input/streamimpl.h
 create mode 100644 src/input/utf8_stream.c
 create mode 100644 src/parser.c
 create mode 100644 src/tokeniser/Makefile
 create mode 100644 src/tokeniser/entities.c
 create mode 100644 src/tokeniser/entities.h
 create mode 100644 src/tokeniser/tokeniser.c
 create mode 100644 src/tokeniser/tokeniser.h
 create mode 100644 src/utils/Makefile
 create mode 100644 src/utils/dict.c
 create mode 100644 src/utils/dict.h
 create mode 100644 src/utils/errors.c
 create mode 100644 src/utils/utf8.c
 create mode 100644 src/utils/utf8.h
 create mode 100644 src/utils/utils.h

(limited to 'src')

diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..b72a9e0
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,79 @@
+# Makefile for libhubbub
+#
+# Toolchain is exported by top-level makefile
+#
+# Top-level makefile also exports the following variables:
+#
+# COMPONENT  Name of component
+# EXPORT     Absolute path of export directory
+# TOP        Absolute path of source tree root
+#
+# The top-level makefile requires the following targets to exist:
+#
+# clean      Clean source tree
+# debug      Create a debug binary
+# distclean  Fully clean source tree, back to pristine condition
+# export     Export distributable components to ${EXPORT}
+# release    Create a release binary
+# setup      Perform any setup required prior to compilation
+# test       Execute any test cases
+
+# Manipulate include paths
+CFLAGS += -I$(CURDIR)
+
+# Release output
+RELEASE = ${TOP}/${COMPONENT}.a
+
+# Debug output
+DEBUG = ${TOP}/${COMPONENT}-debug.a
+
+# Objects
+OBJS = hubbub parser
+
+.PHONY: clean debug distclean export release setup test
+
+# Targets (local objects first, then each subdirectory, then archive the .o files)
+release: $(addprefix Release/, $(addsuffix .o, $(OBJS)))
+	@${MAKE} -C charset release
+	@${MAKE} -C input release
+	@${MAKE} -C tokeniser release
+	@${MAKE} -C utils release
+	@${AR} ${ARFLAGS} $(RELEASE) Release/*.o
+
+debug: $(addprefix Debug/, $(addsuffix .o, $(OBJS)))
+	@${MAKE} -C charset debug
+	@${MAKE} -C input debug
+	@${MAKE} -C tokeniser debug
+	@${MAKE} -C utils debug
+	@${AR} ${ARFLAGS} $(DEBUG) Debug/*.o
+
+clean:
+	@${MAKE} -C charset clean
+	@${MAKE} -C input clean
+	@${MAKE} -C tokeniser clean
+	@${MAKE} -C utils clean
+	-@${RM} ${RMFLAGS} $(addprefix Release/, $(addsuffix .o, ${OBJS}))
+	-@${RM} ${RMFLAGS} $(addprefix Debug/, $(addsuffix .o, ${OBJS}))
+	-@${RM} ${RMFLAGS} $(RELEASE) $(DEBUG)
+
+distclean:
+	-@${RM} ${RMFLAGS} -r Release
+	-@${RM} ${RMFLAGS} -r Debug
+
+setup:
+	@${MKDIR} ${MKDIRFLAGS} Release
+	@${MKDIR} ${MKDIRFLAGS} Debug
+
+export:
+	@${CP} ${CPFLAGS} $(RELEASE) ${EXPORT}/lib/
+
+test:
+
+# Pattern rules
+Release/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c ${CFLAGS} -DNDEBUG -o $@ $<
+
+Debug/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c -g ${CFLAGS} -o $@ $<
diff --git a/src/charset/Makefile b/src/charset/Makefile
new file mode 100644
index 0000000..62817b3
--- /dev/null
+++ b/src/charset/Makefile
@@ -0,0 +1,53 @@
+# Makefile for libhubbub
+#
+# Toolchain is exported by top-level makefile
+#
+# Top-level makefile also exports the following variables:
+#
+# COMPONENT  Name of component
+# EXPORT     Absolute path of export directory
+# TOP        Absolute path of source tree root
+#
+# The top-level makefile requires the following targets to exist:
+#
+# clean      Clean source tree
+# debug      Create a debug binary
+# distclean  Fully clean source tree, back to pristine condition
+# export     Export distributable components to ${EXPORT}
+# release    Create a release binary
+# setup      Perform any setup required prior to compilation
+# test       Execute any test cases
+
+# Manipulate include paths
+CFLAGS += -I$(CURDIR)
+
+# Objects
+OBJS = aliases codec codec_iconv codec_utf8 detect
+
+.PHONY: clean debug distclean export release setup test
+
+# Targets
+release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS)))
+
+debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS)))
+
+clean:
+	-@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS}))
+	-@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS}))
+
+distclean:
+
+setup:
+
+export:
+
+test:
+
+# Pattern rules
+../Release/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c ${CFLAGS} -DNDEBUG -o $@ $<
+
+../Debug/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c -g ${CFLAGS} -o $@ $<
diff --git a/src/charset/aliases.c b/src/charset/aliases.c
new file mode 100644
index 0000000..dcf6de2
--- /dev/null
+++ b/src/charset/aliases.c
@@ -0,0 +1,361 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+
+struct alias {
+	struct alias *next;
+	hubbub_aliases_canon *canon;
+	uint16_t name_len;
+	char name[1];
+};
+
+#define HASH_SIZE (43)
+static hubbub_aliases_canon *canon_tab[HASH_SIZE];
+static struct alias *alias_tab[HASH_SIZE];
+
+static hubbub_error hubbub_create_alias(const char *alias,
+		hubbub_aliases_canon *c, hubbub_alloc alloc, void *pw);
+static hubbub_aliases_canon *hubbub_create_canon(const char *canon,
+		uint16_t mibenum, hubbub_alloc alloc, void *pw);
+static uint32_t hubbub_hash_val(const char *alias, size_t len);
+
+/**
+ * Create alias data from Aliases file ("canonical mibenum [alias ...]" lines)
+ *
+ * \param filename  The path to the Aliases file
+ * \param alloc     Memory (de)allocation function
+ * \param pw        Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK, or HUBBUB_BADPARM / HUBBUB_FILENOTFOUND on failure.
+ */
+hubbub_error hubbub_aliases_create(const char *filename,
+		hubbub_alloc alloc, void *pw)
+{
+	char buf[300];
+	FILE *fp;
+
+	if (filename == NULL || alloc == NULL)
+		return HUBBUB_BADPARM;
+
+	fp = fopen(filename, "r");
+	if (fp == NULL)
+		return HUBBUB_FILENOTFOUND;
+
+	while (fgets(buf, sizeof buf, fp)) {
+		char *p, *aliases = 0, *mib, *end;
+		hubbub_aliases_canon *cf;
+
+		if (buf[0] == 0 || buf[0] == '#')
+			/* skip blank lines or comments */
+			continue;
+
+		buf[strcspn(buf, "\n")] = 0; /* lose newline, if present */
+		end = buf + strlen(buf);
+
+		/* find end of canonical form (ctype needs unsigned char) */
+		for (p = buf; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		*p++ = '\0'; /* terminate canonical form */
+
+		/* skip whitespace */
+		for (; *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		mib = p;
+
+		/* find end of mibenum */
+		for (; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p < end)
+			*p++ = '\0'; /* terminate mibenum */
+
+		cf = hubbub_create_canon(buf, atoi(mib), alloc, pw);
+		if (cf == NULL)
+			continue;
+
+		/* skip whitespace */
+		for (; p < end && *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		aliases = p;
+
+		while (p < end) {
+			/* find end of alias */
+			for (; *p && !isspace((unsigned char) *p) &&
+					!iscntrl((unsigned char) *p); p++)
+				; /* do nothing */
+			if (p > end)
+				break; /* stop if we've gone past the end */
+			/* terminate current alias */
+			*p++ = '\0';
+
+			if (hubbub_create_alias(aliases, cf,
+					alloc, pw) != HUBBUB_OK)
+				break;
+
+			/* terminating may have advanced us past the end */
+			if (p >= end)
+				break;
+
+			/* skip whitespace */
+			for (; *p && isspace((unsigned char) *p); p++)
+				; /* do nothing */
+
+			if (p >= end)
+				break; /* gone past end => stop */
+
+			/* update pointer to current alias */
+			aliases = p;
+		}
+	}
+
+	fclose(fp);
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Release every canonical form and alias held in the hash tables
+ *
+ * \param alloc  Memory (de)allocation function (called with size 0 per entry)
+ * \param pw     Pointer to client-specific private data
+ */
+void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw)
+{
+	hubbub_aliases_canon *canon;
+	struct alias *entry;
+	int slot;
+
+	for (slot = 0; slot < HASH_SIZE; slot++) {
+		/* pop canonical forms off the chain until it is empty */
+		while ((canon = canon_tab[slot]) != NULL) {
+			canon_tab[slot] = canon->next;
+			alloc(canon, 0, pw);
+		}
+
+		/* likewise for the aliases hashed to this slot */
+		while ((entry = alias_tab[slot]) != NULL) {
+			alias_tab[slot] = entry->next;
+			alloc(entry, 0, pw);
+		}
+	}
+}
+
+/**
+ * Look up the MIB enum value registered for an encoding name
+ *
+ * \param alias  Canonical name or alias to look up (NULL yields 0)
+ * \param len    Length of \a alias, in bytes
+ * \return The matching MIB enum value, or 0 if the name is unknown
+ */
+uint16_t hubbub_mibenum_from_name(const char *alias, size_t len)
+{
+	const hubbub_aliases_canon *canon = NULL;
+
+	/* Only consult the tables when there is a name to look up */
+	if (alias != NULL)
+		canon = hubbub_alias_canonicalise(alias, len);
+
+	/* Missing input and unknown names both map to 0 */
+	if (canon == NULL)
+		return 0;
+
+	return canon->mib_enum;
+}
+
+/**
+ * Find the canonical encoding name for a MIB enum (scans every hash slot)
+ *
+ * \param mibenum  MIB enum value to search for
+ * \return Canonical name of the first matching entry, or NULL if none matches
+ */
+const char *hubbub_mibenum_to_name(uint16_t mibenum)
+{
+	const hubbub_aliases_canon *canon;
+	int slot;
+
+	for (slot = 0; slot < HASH_SIZE; slot++)
+		for (canon = canon_tab[slot]; canon; canon = canon->next)
+			if (canon->mib_enum == mibenum)
+				return canon->name;
+
+	return NULL;
+}
+
+
+/**
+ * Resolve a charset name (canonical or alias) to its canonical entry
+ *
+ * \param alias  Name to resolve; compared without regard to case
+ * \param len    Length of \a alias, in bytes
+ * \return Pointer to canonical form, or NULL if \a alias is NULL or unknown
+ */
+hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias,
+		size_t len)
+{
+	hubbub_aliases_canon *canon;
+	struct alias *entry;
+	uint32_t slot;
+
+	if (alias == NULL)
+		return NULL;
+
+	slot = hubbub_hash_val(alias, len);
+
+	/* A canonical name takes precedence over an alias of the same text */
+	for (canon = canon_tab[slot]; canon != NULL; canon = canon->next) {
+		if (canon->name_len == len &&
+				strncasecmp(canon->name, alias, len) == 0)
+			return canon;
+	}
+
+	/* Otherwise resolve through the alias chain of the same slot */
+	for (entry = alias_tab[slot]; entry != NULL; entry = entry->next) {
+		if (entry->name_len == len &&
+				strncasecmp(entry->name, alias, len) == 0)
+			return entry->canon;
+	}
+
+	return NULL;
+}
+
+
+/**
+ * Record a new alias for a canonical form in the alias hash table
+ *
+ * \param alias  NUL-terminated alias name (copied into the new entry)
+ * \param c      Canonical form the alias refers to
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK, or HUBBUB_BADPARM / HUBBUB_NOMEM on failure
+ */
+hubbub_error hubbub_create_alias(const char *alias, hubbub_aliases_canon *c,
+		hubbub_alloc alloc, void *pw)
+{
+	struct alias *entry;
+	uint32_t slot;
+
+	if (alias == NULL || c == NULL || alloc == NULL)
+		return HUBBUB_BADPARM;
+
+	entry = alloc(NULL, sizeof(*entry) + strlen(alias) + 1, pw);
+	if (entry == NULL)
+		return HUBBUB_NOMEM;
+
+	strcpy(entry->name, alias);
+	entry->name_len = strlen(alias);
+	entry->name[entry->name_len] = '\0';
+	entry->canon = c;
+
+	/* push onto the front of the chain for the name's hash slot */
+	slot = hubbub_hash_val(alias, entry->name_len);
+	entry->next = alias_tab[slot];
+	alias_tab[slot] = entry;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Allocate a canonical form and add it to the canonical-name hash table
+ *
+ * \param canon    NUL-terminated canonical name (copied into the new entry)
+ * \param mibenum  The MIB enum value to associate with the name
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to the new canonical form, or NULL on bad input / no memory
+ */
+hubbub_aliases_canon *hubbub_create_canon(const char *canon,
+		uint16_t mibenum, hubbub_alloc alloc, void *pw)
+{
+	hubbub_aliases_canon *entry;
+	uint32_t slot, len;
+
+	if (canon == NULL || alloc == NULL)
+		return NULL;
+
+	len = strlen(canon);
+
+	entry = alloc(NULL, sizeof(*entry) + len + 1, pw);
+	if (entry == NULL)
+		return NULL;
+
+	strcpy(entry->name, canon);
+	entry->name[len] = '\0';
+	entry->name_len = len;
+	entry->mib_enum = mibenum;
+
+	/* link in at the head of the chain selected by the name's hash */
+	slot = hubbub_hash_val(canon, len);
+	entry->next = canon_tab[slot];
+	canon_tab[slot] = entry;
+
+	return entry;
+}
+
+/**
+ * Case-insensitive string hash (bit 0x20 is masked off every byte)
+ *
+ * \param alias  String to hash
+ * \param len    Number of bytes of \a alias to hash
+ * \return Hash reduced modulo HASH_SIZE, or 0 if \a alias is NULL
+ */
+uint32_t hubbub_hash_val(const char *alias, size_t len)
+{
+	uint32_t hash = 5381;
+
+	if (alias == NULL)
+		return 0;
+
+	for (; len > 0; len--)
+		hash = (hash * 33) ^ (*alias++ & ~0x20);
+
+	return hash % HASH_SIZE;
+}
+
+
+#ifndef NDEBUG
+/**
+ * Dump all alias data to stdout, then a rough size tally (tables per slot)
+ */
+void hubbub_aliases_dump(void)
+{
+	hubbub_aliases_canon *c;
+	struct alias *a;
+	int i;
+	size_t size = 0;
+
+	for (i = 0; i != HASH_SIZE; i++) {
+		for (c = canon_tab[i]; c; c = c->next) {
+			printf("%d %s\n", i, c->name);
+			size += offsetof(hubbub_aliases_canon, name) +
+					c->name_len;
+		}
+
+		for (a = alias_tab[i]; a; a = a->next) {
+			printf("%d %s\n", i, a->name);
+			size += offsetof(struct alias, name) + a->name_len;
+		}
+	}
+
+	size += (sizeof(canon_tab) / sizeof(canon_tab[0]));
+	size += (sizeof(alias_tab) / sizeof(alias_tab[0]));
+
+	printf("%u\n", (unsigned int) size);
+}
+#endif
diff --git a/src/charset/aliases.h b/src/charset/aliases.h
new file mode 100644
index 0000000..e0505d0
--- /dev/null
+++ b/src/charset/aliases.h
@@ -0,0 +1,42 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_charset_aliases_h_
+#define hubbub_charset_aliases_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+
+typedef struct hubbub_aliases_canon {
+	struct hubbub_aliases_canon *next;
+	uint16_t mib_enum;
+	uint16_t name_len;
+	char name[1];
+} hubbub_aliases_canon;
+
+/* Load encoding aliases from file */
+hubbub_error hubbub_aliases_create(const char *filename,
+		hubbub_alloc alloc, void *pw);
+/* Destroy encoding aliases */
+void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw);
+
+/* Convert an encoding alias to a MIB enum value */
+uint16_t hubbub_mibenum_from_name(const char *alias, size_t len);
+/* Convert a MIB enum value into an encoding alias */
+const char *hubbub_mibenum_to_name(uint16_t mibenum);
+
+/* Canonicalise an alias name */
+hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias,
+		size_t len);
+
+#ifndef NDEBUG
+void hubbub_aliases_dump(void);
+#endif
+
+#endif
diff --git a/src/charset/codec.c b/src/charset/codec.c
new file mode 100644
index 0000000..12a1bdc
--- /dev/null
+++ b/src/charset/codec.c
@@ -0,0 +1,186 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include "charset/aliases.h"
+
+#include "codec_impl.h"
+
+extern hubbub_charsethandler hubbub_iconv_codec_handler;
+extern hubbub_charsethandler hubbub_utf8_codec_handler;
+
+static hubbub_charsethandler *handler_table[] = {
+	&hubbub_utf8_codec_handler,
+	&hubbub_iconv_codec_handler,
+	NULL,
+};
+
+/**
+ * Create a codec for a charset, using the first handler that accepts it
+ *
+ * \param charset  Target charset (any known alias; NUL-terminated)
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec instance, or NULL on failure
+ */
+hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset,
+		hubbub_alloc alloc, void *pw)
+{
+	const hubbub_aliases_canon *canon;
+	hubbub_charsethandler **entry = handler_table;
+	hubbub_charsetcodec *codec;
+
+	if (charset == NULL || alloc == NULL)
+		return NULL;
+
+	/* Handlers are matched against the canonical form of the name */
+	canon = hubbub_alias_canonicalise(charset, strlen(charset));
+	if (canon == NULL)
+		return NULL;
+
+	/* Walk the NULL-terminated table and stop at the first handler
+	 * that claims this charset */
+	while (*entry != NULL && !(*entry)->handles_charset(canon->name))
+		entry++;
+
+	/* Ran off the end: nothing handles this charset */
+	if (*entry == NULL)
+		return NULL;
+
+	/* Have the chosen handler build the instance */
+	codec = (*entry)->create(canon->name, alloc, pw);
+	if (codec == NULL)
+		return NULL;
+
+	/* Fill in the handler-independent state: start with no filter
+	 * and the default ("loose") error mode */
+	codec->mibenum = canon->mib_enum;
+	codec->errormode = HUBBUB_CHARSETCODEC_ERROR_LOOSE;
+
+	codec->filter = NULL;
+	codec->filter_pw = NULL;
+
+	/* Remember the allocator; destroy releases the codec through it */
+	codec->alloc = alloc;
+	codec->alloc_pw = pw;
+
+	return codec;
+}
+
+/**
+ * Destroy a charset codec
+ *
+ * \param codec  The codec to destroy (NULL is accepted and ignored)
+ */
+void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec)
+{
+	if (codec != NULL) {
+		/* run the handler's destroy hook, then release the codec */
+		codec->handler.destroy(codec);
+
+		codec->alloc(codec, 0, codec->alloc_pw);
+	}
+}
+
+/**
+ * Configure a charset codec
+ *
+ * \param codec   The codec to configure
+ * \param type    The codec option type to configure
+ * \param params  Option-specific parameters
+ * \return HUBBUB_OK on success, HUBBUB_BADPARM if codec or params is NULL
+ */
+hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec,
+		hubbub_charsetcodec_opttype type,
+		hubbub_charsetcodec_optparams *params)
+{
+	if (codec == NULL || params == NULL)
+		return HUBBUB_BADPARM;
+
+	switch (type) {
+	case HUBBUB_CHARSETCODEC_FILTER_FUNC:
+		codec->filter = params->filter_func.filter;
+		codec->filter_pw = params->filter_func.pw;
+		break;
+
+	case HUBBUB_CHARSETCODEC_ERROR_MODE:
+		codec->errormode = params->error_mode.mode;
+		break;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Encode a chunk of UCS4 data into a codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_BADPARM on a NULL pointer, otherwise the handler's result.
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ */
+hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec == NULL || source == NULL || *source == NULL ||
+			sourcelen == NULL || dest == NULL || *dest == NULL ||
+			destlen == NULL)
+		return HUBBUB_BADPARM;
+
+	return codec->handler.encode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Decode a chunk of data in a codec's charset into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_BADPARM on a NULL pointer, otherwise the handler's result.
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ *
+ * Call this with a source length of 0 (and non-NULL *source) to flush buffers.
+ */
+hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec == NULL || source == NULL || *source == NULL ||
+			sourcelen == NULL || dest == NULL || *dest == NULL ||
+			destlen == NULL)
+		return HUBBUB_BADPARM;
+
+	return codec->handler.decode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Clear a charset codec's encoding state (and that of any installed filter)
+ *
+ * \param codec  The codec to reset
+ * \return HUBBUB_BADPARM if codec is NULL, otherwise the handler's result
+ */
+hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec)
+{
+	if (codec == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Reset filter, handing it the private data it was registered with */
+	if (codec->filter)
+		codec->filter(HUBBUB_CHARSETCODEC_NULL, NULL, NULL,
+				codec->filter_pw);
+
+	return codec->handler.reset(codec);
+}
diff --git a/src/charset/codec.h b/src/charset/codec.h
new file mode 100644
index 0000000..4cd94d8
--- /dev/null
+++ b/src/charset/codec.h
@@ -0,0 +1,153 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_charset_codec_h_
+#define hubbub_charset_codec_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+
+typedef struct hubbub_charsetcodec hubbub_charsetcodec;
+
+#define HUBBUB_CHARSETCODEC_NULL (0xffffffffU)
+
+/**
+ * Type of charset codec filter function
+ *
+ * \param c          UCS4 character (in host byte order) or
+ *                   HUBBUB_CHARSETCODEC_NULL to reset
+ * \param output     Pointer to location to store output buffer location
+ * \param outputlen  Pointer to location to store output buffer length
+ * \param pw         Pointer to client-specific private data
+ * \return HUBBUB_OK on success, or appropriate error otherwise.
+ *
+ * The output buffer is owned by the filter code and will not be freed by
+ * any charset codec. It should contain the replacement UCS4 character(s)
+ * for the input. The replacement characters should be in host byte order.
+ * The contents of *output and *outputlen on entry are ignored and these
+ * will be filled in by the filter code.
+ *
+ * Filters may elect to replace the input character with no output. In this
+ * case, *output should be set to NULL and *outputlen should be set to 0 and
+ * HUBBUB_OK should be returned.
+ *
+ * The output length is in terms of the number of UCS4 characters in the
+ * output buffer. i.e.:
+ *
+ * for (size_t i = 0; i < outputlen; i++) {
+ *   dest[curchar++] = output[i];
+ * }
+ *
+ * would copy the contents of the filter output buffer to the codec's output
+ * buffer.
+ */
+typedef hubbub_error (*hubbub_charsetcodec_filter)(uint32_t c,
+		uint32_t **output, size_t *outputlen, void *pw);
+
+/**
+ * Charset codec error mode
+ *
+ * A codec's error mode determines its behaviour in the face of:
+ *
+ * + characters which are unrepresentable in the destination charset (if
+ *   encoding data) or which cannot be converted to UCS4 (if decoding data).
+ * + invalid byte sequences (both encoding and decoding)
+ *
+ * The options provide a choice between the following approaches:
+ *
+ * + draconian, "stop processing" ("strict")
+ * + "replace the unrepresentable character with something else" ("loose")
+ * + "attempt to transliterate, or replace if unable" ("translit")
+ *
+ * The default error mode is "loose".
+ *
+ *
+ * In the "loose" case, the replacement character will depend upon:
+ *
+ * + Whether the operation was encoding or decoding
+ * + If encoding, what the destination charset is.
+ *
+ * If decoding, the replacement character will be:
+ *
+ *     U+FFFD (REPLACEMENT CHARACTER)
+ *
+ * If encoding, the replacement character will be:
+ *
+ *     U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32)
+ *     U+FFFD (REPLACEMENT CHARACTER) otherwise.
+ *
+ *
+ * In the "translit" case, the codec will attempt to transliterate into
+ * the destination charset, if encoding. If decoding, or if transliteration
+ * fails, this option is identical to "loose".
+ */
+typedef enum hubbub_charsetcodec_errormode {
+	/** Abort processing if unrepresentable character encountered */
+	HUBBUB_CHARSETCODEC_ERROR_STRICT   = 0,
+	/** Replace unrepresentable characters with single alternate */
+	HUBBUB_CHARSETCODEC_ERROR_LOOSE    = 1,
+	/** Transliterate unrepresentable characters, if possible */
+	HUBBUB_CHARSETCODEC_ERROR_TRANSLIT = 2,
+} hubbub_charsetcodec_errormode;
+
+/**
+ * Charset codec option types
+ */
+typedef enum hubbub_charsetcodec_opttype {
+	/** Register codec filter function */
+	HUBBUB_CHARSETCODEC_FILTER_FUNC = 0,
+	/** Set codec error mode */
+	HUBBUB_CHARSETCODEC_ERROR_MODE  = 1,
+} hubbub_charsetcodec_opttype;
+
+/**
+ * Charset codec option parameters
+ */
+typedef union hubbub_charsetcodec_optparams {
+	/** Parameters for filter function setting */
+	struct {
+		/** Filter function */
+		hubbub_charsetcodec_filter filter;
+		/** Client-specific private data */
+		void *pw;
+	} filter_func;
+
+	/** Parameters for error mode setting */
+	struct {
+		/** The desired error handling mode */
+		hubbub_charsetcodec_errormode mode;
+	} error_mode;
+} hubbub_charsetcodec_optparams;
+
+
+/* Create a charset codec */
+hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset,
+		hubbub_alloc alloc, void *pw);
+/* Destroy a charset codec */
+void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec);
+
+/* Configure a charset codec */
+hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec,
+		hubbub_charsetcodec_opttype type,
+		hubbub_charsetcodec_optparams *params);
+
+/* Encode a chunk of UCS4 data into a codec's charset */
+hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+
+/* Decode a chunk of data in a codec's charset into UCS4 */
+hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+
+/* Reset a charset codec */
+hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec);
+
+#endif
diff --git a/src/charset/codec_iconv.c b/src/charset/codec_iconv.c
new file mode 100644
index 0000000..097e82a
--- /dev/null
+++ b/src/charset/codec_iconv.c
@@ -0,0 +1,837 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/* This codec is hideously slow. Only use it as a last resort */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <iconv.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include "charset/aliases.h"
+#include "utils/utils.h"
+
+#include "codec_impl.h"
+
+/**
+ * A note on endianness:
+ *
+ * UCS4 is big-endian by default. Therefore, this codec reads and writes
+ * big-endian values. This is fine, and causes no problems. However, to
+ * make life easier for client-supplied filter code, character values passed
+ * to a filter and those read back from a filter are in host-endian.
+ * Therefore, we need to convert from big-endian to host-endian when passing
+ * characters to a filter and perform the reverse translation when reading
+ * characters back.
+ */
+
+/**
+ * Iconv-based charset codec
+ */
+typedef struct hubbub_iconv_codec {
+	hubbub_charsetcodec base;	/**< Base class (first member) */
+
+	iconv_t read_cd;		/**< Iconv handle: charset to UCS-4 */
+#define INVAL_BUFSIZE (32)
+	uint8_t inval_buf[INVAL_BUFSIZE];	/**< Buffer for fixing up
+						 * incomplete input
+						 * sequences */
+	size_t inval_len;		/**< Number of bytes in inval_buf */
+
+#define READ_BUFSIZE (8)
+	uint32_t read_buf[READ_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (decode)
+						 * (big-endian UCS4) */
+	size_t read_len;		/**< Number of characters in
+					 * read_buf */
+
+	iconv_t write_cd;		/**< Iconv handle: UCS-4 to charset */
+#define WRITE_BUFSIZE (8)
+	uint32_t write_buf[WRITE_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (encode)
+						 * (big-endian UCS4) */
+	size_t write_len;		/**< Number of characters in
+					 * write_buf */
+} hubbub_iconv_codec;
+
+
+static bool hubbub_iconv_codec_handles_charset(const char *charset);
+static hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset,
+		hubbub_alloc alloc, void *pw);
+static void hubbub_iconv_codec_destroy(hubbub_charsetcodec *codec);
+static hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec);
+static hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_iconv_codec_filter_decoded_char(
+		hubbub_iconv_codec *c, uint32_t ucs4, uint8_t **dest,
+		size_t *destlen);
+static bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c);
+
+/**
+ * Determine whether this codec handles a specific charset
+ *
+ * \param charset  Charset to test
+ * \return true if handleable, false otherwise
+ */
+bool hubbub_iconv_codec_handles_charset(const char *charset)
+{
+	/* Probe iconv: can it build a charset -> UCS-4 converter? */
+	iconv_t probe = iconv_open("UCS-4", charset);
+
+	if (probe == (iconv_t) -1) {
+		/* iconv rejected the conversion, so we can't handle it */
+		return false;
+	}
+
+	/* The descriptor was only a probe; release it straight away */
+	iconv_close(probe);
+	return true;
+}
+
+/**
+ * Create an iconv-based codec
+ *
+ * \param charset  The charset to read from / write to
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ */
+hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_iconv_codec *self = alloc(NULL, sizeof(*self), pw);
+
+	if (self == NULL)
+		return NULL;
+
+	/* Decoding direction: charset -> UCS-4 */
+	self->read_cd = iconv_open("UCS-4", charset);
+	if (self->read_cd == (iconv_t) -1)
+		goto free_codec;
+
+	/* Encoding direction: UCS-4 -> charset */
+	self->write_cd = iconv_open(charset, "UCS-4");
+	if (self->write_cd == (iconv_t) -1)
+		goto close_reader;
+
+	/* No incomplete input or undelivered output to begin with */
+	self->inval_len = self->read_len = self->write_len = 0;
+	self->inval_buf[0] = '\0';
+	self->read_buf[0] = 0;
+	self->write_buf[0] = 0;
+
+	/* Hook this implementation into the generic codec vtable */
+	self->base.handler.destroy = hubbub_iconv_codec_destroy;
+	self->base.handler.encode = hubbub_iconv_codec_encode;
+	self->base.handler.decode = hubbub_iconv_codec_decode;
+	self->base.handler.reset = hubbub_iconv_codec_reset;
+
+	return &self->base;
+
+close_reader:
+	iconv_close(self->read_cd);
+free_codec:
+	/* Size 0 presumably asks the allocator to release the block */
+	alloc(self, 0, pw);
+	return NULL;
+}
+
+/**
+ * Destroy an iconv-based codec
+ *
+ * \param codec  The codec to destroy
+ */
+void hubbub_iconv_codec_destroy(hubbub_charsetcodec *codec)
+{
+	hubbub_iconv_codec *self = (hubbub_iconv_codec *) codec;
+
+	/* Release both conversion descriptors. The codec structure
+	 * itself is not freed here. */
+	iconv_close(self->read_cd);
+	iconv_close(self->write_cd);
+}
+
+/**
+ * Encode a chunk of UCS4 data into an iconv-based codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK          on success,
+ *         HUBBUB_NOMEM       if output buffer is too small,
+ *         HUBBUB_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *         <any_other_error>  as a result of the failure of the
+ *                            client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call. This buffered data is post-filtering,
+ * so will not be refiltered on the next call.
+ *
+ * In the case of the filter function failing, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the encoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately before the location pointed to by
+ * ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec;
+	uint32_t ucs4;
+	const uint32_t *towrite;
+	size_t towritelen;
+	size_t i;
+	hubbub_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+
+		while (c->write_len > 0) {
+			error = hubbub_iconv_codec_write_char(c, pwrite[0],
+					dest, destlen);
+			if (error != HUBBUB_OK) {
+				/* Drop the invalid character, if present,
+				 * so as to avoid reprocessing it */
+				if (error == HUBBUB_INVALID) {
+					pwrite++;
+					c->write_len--;
+				}
+
+				/* Shuffle what is still pending down to
+				 * the start of the save area, which is
+				 * where the next call resumes from */
+				for (i = 0; i < c->write_len; i++)
+					c->write_buf[i] = pwrite[i];
+
+				return error;
+			}
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call */
+	while (*sourcelen > 0) {
+		towrite = (const uint32_t *) (const void *) *source;
+		towritelen = 1;
+		ucs4 = *towrite;
+
+		/* Run character we're about to output through the
+		 * registered filter, so it can replace it, if it sees
+		 * fit to do so */
+		if (c->base.filter != NULL) {
+			uint32_t *replacement;
+
+			error = c->base.filter(ntohl(ucs4),
+					&replacement, &towritelen,
+					c->base.filter_pw);
+			if (error != HUBBUB_OK) {
+				/* Don't eat character -- filter failed,
+				 * so nothing gets written or buffered.
+				 * It is up to the client to make the
+				 * filter cope with seeing it again. */
+				return error;
+			}
+
+			/* Convert filter output to big endian UCS4 */
+			for (i = 0; i < towritelen; i++)
+				replacement[i] = htonl(replacement[i]);
+
+			towrite = (const uint32_t *) replacement;
+		}
+
+		/* Output current character(s) */
+		while (towritelen > 0) {
+			error = hubbub_iconv_codec_write_char(c, towrite[0],
+					dest, destlen);
+
+			if (error != HUBBUB_OK) {
+				/* Skip an invalid character, so that it is
+				 * not reprocessed; keep one that didn't fit */
+				size_t skip = (error == HUBBUB_INVALID) ? 1 : 0;
+
+				if (towritelen - skip >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen - skip;
+
+				/* Copy pending chars to the start of the
+				 * save area, for processing next call:
+				 * the flush loop above reads them from
+				 * write_buf[0] onwards. */
+				for (i = 0; i < c->write_len; i++)
+					c->write_buf[i] = towrite[i + skip];
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return error;
+			}
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Decode a chunk of data in an iconv-based codec's charset into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK          on success,
+ *         HUBBUB_NOMEM       if output buffer is too small,
+ *         HUBBUB_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *         <any_other_error>  as a result of the failure of the
+ *                            client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately at or before the location pointed
+ * to by ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec;
+	hubbub_error error;
+
+	if (c->read_len > 0) {
+		/* Output left over from last decode. It is written only
+		 * if *dest has room for all of it (all or nothing). */
+		uint32_t *pread = c->read_buf;
+
+		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+			*((uint32_t *) (void *) *dest) = pread[0];
+
+			*dest += 4;
+			*destlen -= 4;
+
+			pread++;
+			c->read_len--;
+		}
+
+		if (*destlen < c->read_len * 4) {
+			/* Ran out of output buffer */
+			size_t i;
+
+			/* Shuffle remaining output down */
+			for (i = 0; i < c->read_len; i++) {
+				c->read_buf[i] = pread[i];
+			}
+
+			return HUBBUB_NOMEM;
+		}
+	}
+
+	if (c->inval_len > 0) {
+		/* The last decode ended in an incomplete sequence.
+		 * Top inval_buf up with data from the start of the new
+		 * chunk (keeping one byte spare) and process it. */
+		uint8_t *in = c->inval_buf;
+		size_t ol = c->inval_len;
+		size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+		size_t orig_l = l;
+
+		memcpy(c->inval_buf + ol, *source, l);
+
+		l += c->inval_len;
+
+		error = hubbub_iconv_codec_read_char(c,
+				(const uint8_t **) &in, &l, dest, destlen);
+		if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
+			return error;
+		}
+
+
+		/* l is now what was left unread in inval_buf; step the
+		 * caller's pointers past the new-chunk bytes consumed. */
+		*source += max((signed) (orig_l - l), 0);
+		*sourcelen -= max((signed) (orig_l - l), 0);
+
+		/* Nothing was consumed at all: failed to resolve an
+		 * incomplete character and ran out of buffer space. No
+		 * recovery strategy possible, so explode everywhere. */
+		if ((orig_l + ol) - l == 0)
+			abort();
+
+		/* Handle memory exhaustion case from above */
+		if (error != HUBBUB_OK)
+			return error;
+	}
+
+	while (*sourcelen > 0) {
+		error = hubbub_iconv_codec_read_char(c,
+				source, sourcelen, dest, destlen);
+		if (error != HUBBUB_OK) {
+			return error;
+		}
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Clear an iconv-based codec's encoding state
+ *
+ * \param codec  The codec to reset
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec)
+{
+	hubbub_iconv_codec *self = (hubbub_iconv_codec *) codec;
+
+	/* All-NULL arguments put a converter back in its initial state */
+	iconv(self->read_cd, NULL, NULL, NULL, NULL);
+	iconv(self->write_cd, NULL, NULL, NULL, NULL);
+
+	/* Forget any incomplete input and any undelivered output */
+	self->inval_len = 0;
+	self->read_len = 0;
+	self->write_len = 0;
+	self->inval_buf[0] = '\0';
+	self->read_buf[0] = 0;
+	self->write_buf[0] = 0;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Feed a UCS4 character through the registered filter and output the result
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (big endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return HUBBUB_OK          on success,
+ *         HUBBUB_NOMEM       if output buffer is too small,
+ *         <any_other_error>  as a result of the failure of the
+ *                            client-provided filter function.
+ */
+hubbub_error hubbub_iconv_codec_filter_decoded_char(hubbub_iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	if (c->base.filter != NULL) {
+		uint32_t *rep;
+		size_t replen;
+		hubbub_error error;
+
+		error = c->base.filter(ntohl(ucs4), &rep, &replen,
+				c->base.filter_pw);
+		if (error != HUBBUB_OK)
+			return error;
+
+		while (replen > 0 && *destlen >= replen * 4) {
+			*((uint32_t *) (void *) *dest) = htonl(*rep);
+			*dest += 4;
+			*destlen -= 4;
+
+			rep++;
+			replen--;
+		}
+
+		if (*destlen < replen * 4) {
+			/* Run out of output buffer */
+			size_t i;
+
+			/* A replacement longer than read_buf cannot be
+			 * saved; abort rather than overrun the buffer */
+			if (replen > READ_BUFSIZE)
+				abort();
+
+			/* Buffer remaining output */
+			c->read_len = replen;
+
+			for (i = 0; i < replen; i++)
+				c->read_buf[i] = htonl(rep[i]);
+
+			return HUBBUB_NOMEM;
+		}
+	} else {
+		if (*destlen < 4) {
+			/* Run out of output buffer */
+			c->read_len = 1;
+			c->read_buf[0] = ucs4;
+
+			return HUBBUB_NOMEM;
+		}
+
+		*((uint32_t *) (void *) *dest) = ucs4;
+		*dest += 4;
+		*destlen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Detect if a codec's charset is Unicode capable
+ *
+ * \param c  Codec to consider
+ * \return true if a Unicode variant, false otherwise
+ */
+bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c)
+{
+	/* MIB enums of the Unicode encodings, resolved on first use */
+	static uint16_t ucs4, ucs2, utf8;
+	static uint16_t utf16, utf16be, utf16le;
+	static uint16_t utf32, utf32be, utf32le;
+	const uint16_t mib = c->base.mibenum;
+
+	/* Look a charset name up, passing its length along */
+#define MIBENUM_OF(name) hubbub_mibenum_from_name(name, SLEN(name))
+	/* A zero ucs4 doubles as the "not yet looked up" marker */
+	if (ucs4 == 0) {
+		ucs4 = MIBENUM_OF("UCS-4");
+		ucs2 = MIBENUM_OF("UCS-2");
+		utf8 = MIBENUM_OF("UTF-8");
+		utf16 = MIBENUM_OF("UTF-16");
+		utf16be = MIBENUM_OF("UTF-16BE");
+		utf16le = MIBENUM_OF("UTF-16LE");
+		utf32 = MIBENUM_OF("UTF-32");
+		utf32be = MIBENUM_OF("UTF-32BE");
+		utf32le = MIBENUM_OF("UTF-32LE");
+	}
+#undef MIBENUM_OF
+
+	/* UCS variants and UTF-8 */
+	if (mib == ucs4 || mib == ucs2 || mib == utf8)
+		return true;
+
+	/* UTF-16, in any byte order */
+	if (mib == utf16 || mib == utf16be || mib == utf16le)
+		return true;
+
+	/* UTF-32, in any byte order */
+	if (mib == utf32 || mib == utf32be || mib == utf32le)
+		return true;
+
+	return false;
+}
+
+/**
+ * Read a character from the codec's native charset to UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_NOMEM       if output buffer is too small,
+ *         HUBBUB_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *         <any_other_error>  as a result of the failure of the
+ *                            client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	size_t iconv_ret;
+	const uint8_t *origsrc = *source;
+	size_t origsrclen = *sourcelen;
+	uint32_t ucs4;
+	uint8_t *pucs4 = (uint8_t *) &ucs4;
+	size_t sucs4 = 4;
+	hubbub_error error;
+
+	/* Use iconv to convert a single character (the output buffer
+	 * holds exactly one UCS4 value).
+	 * Side effect: Updates *source to point at next input
+	 * character and *sourcelen to reflect reduced input length */
+	iconv_ret = iconv(c->read_cd, (char **) source, sourcelen,
+			(char **) (void *) &pucs4, &sucs4);
+
+	if (iconv_ret != (size_t) -1 ||
+			(*source != origsrc && sucs4 == 0)) {
+		/* Read a character; hand it to the filter / output */
+		error = hubbub_iconv_codec_filter_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
+			/* filter function failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (errno == E2BIG) {
+		/* Should never happen */
+		abort();
+	} else if (errno == EINVAL) {
+		/* Incomplete input sequence; it and its terminator must fit */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		memmove(c->inval_buf, (const char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return HUBBUB_OK;
+	} else if (errno == EILSEQ) {
+		/* Illegal input sequence */
+		bool found = false;
+		const uint8_t *oldsrc;
+		size_t oldsrclen;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) {
+			/* restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+
+			return HUBBUB_INVALID;
+		}
+
+		/* Ok, this becomes problematic. The iconv API here
+		 * is particularly unhelpful; *source will point at
+		 * the _start_ of the illegal sequence. This means
+		 * that we must find the end of the sequence */
+
+		/* Search for the start of the next valid input
+		 * sequence (or the end of the input stream) */
+		while (*sourcelen > 1) {
+			pucs4 = (uint8_t *) &ucs4;
+			sucs4 = 4;
+
+			(*source)++;
+			(*sourcelen)--;
+
+			oldsrc = *source;
+			oldsrclen = *sourcelen;
+
+			iconv_ret = iconv(c->read_cd,
+					(char **) source, sourcelen,
+					(char **) (void *) &pucs4, &sucs4);
+			if (iconv_ret != (size_t) -1 || errno != EILSEQ) {
+				found = true;
+				break;
+			}
+		}
+
+		if (found) {
+			/* Found start of next valid sequence; rewind to it */
+			*source = oldsrc;
+			*sourcelen = oldsrclen;
+		} else {
+			/* Not found - skip last byte in buffer */
+			(*source)++;
+			(*sourcelen)--;
+
+			if (*sourcelen != 0)
+				abort();
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = hubbub_iconv_codec_filter_decoded_char(c,
+				htonl(0xFFFD), dest, destlen);
+		if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
+			/* filter function failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		return error;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Write a UCS4 character in a codec's native charset
+ *
+ * \param c        The codec
+ * \param ucs4     The UCS4 character to write (big endian)
+ * \param dest     Pointer to pointer to output buffer (updated on exit)
+ * \param destlen  Pointer to length of output buffer (updated on exit)
+ * \return HUBBUB_OK       on success,
+ *         HUBBUB_NOMEM    if output buffer is too small,
+ *         HUBBUB_INVALID  if character cannot be represented and the
+ *                         codec's error handling mode is set to STRICT.
+ */
+hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	uint8_t *const start = *dest;
+	uint8_t *in = (uint8_t *) &ucs4;
+	size_t inlen = 4;
+	size_t rc;
+
+	rc = iconv(c->write_cd, (char **) (void *) &in, &inlen,
+			(char **) dest, destlen);
+	if (rc == (size_t) -1) {
+		switch (errno) {
+		case E2BIG:
+			/* Output buffer is too small */
+			return HUBBUB_NOMEM;
+		case EILSEQ:	/* Illegal multibyte sequence */
+		case EINVAL:	/* Incomplete input character */
+			/* Neither of these should ever happen */
+			abort();
+		default:
+			break;
+		}
+	}
+
+	/* Something was output, so there is nothing to fix up */
+	if (*dest != start)
+		return HUBBUB_OK;
+
+	/* Nothing was output; act according to the error mode */
+	if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT)
+		return HUBBUB_INVALID;
+
+	/** \todo transliteration; for now TRANSLIT behaves as LOOSE */
+	if (c->base.errormode != HUBBUB_CHARSETCODEC_ERROR_TRANSLIT &&
+			c->base.errormode != HUBBUB_CHARSETCODEC_ERROR_LOOSE)
+		return HUBBUB_OK;
+
+	/* Substitute U+FFFD where the charset is Unicode, '?' otherwise */
+	ucs4 = hubbub_iconv_codec_is_unicode(c)
+			? htonl(0xFFFD) : htonl(0x3F);
+	in = (uint8_t *) &ucs4;
+	inlen = 4;
+
+	rc = iconv(c->write_cd, (char **) (void *) &in, &inlen,
+			(char **) dest, destlen);
+	if (rc != (size_t) -1)
+		return HUBBUB_OK;
+
+	switch (errno) {
+	case E2BIG:
+		/* Output buffer is too small */
+		return HUBBUB_NOMEM;
+	case EILSEQ:	/* Illegal multibyte sequence */
+	case EINVAL:	/* Incomplete input character */
+		/* Neither of these should ever happen */
+		abort();
+	default:
+		break;
+	}
+
+	return HUBBUB_OK;
+}
+
+const hubbub_charsethandler hubbub_iconv_codec_handler = {
+	.handles_charset = hubbub_iconv_codec_handles_charset,
+	.create = hubbub_iconv_codec_create
+};
diff --git a/src/charset/codec_impl.h b/src/charset/codec_impl.h
new file mode 100644
index 0000000..eb5116b
--- /dev/null
+++ b/src/charset/codec_impl.h
@@ -0,0 +1,51 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_charset_codecimpl_h_
+#define hubbub_charset_codecimpl_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include "codec.h"
+
+/**
+ * Core charset codec definition; implementations extend this
+ */
+struct hubbub_charsetcodec {
+	uint16_t mibenum;			/**< MIB enum for charset */
+
+	hubbub_charsetcodec_filter filter;	/**< filter, or NULL */
+	void *filter_pw;			/**< filter private word */
+
+	hubbub_charsetcodec_errormode errormode;	/**< error mode */
+
+	hubbub_alloc alloc;			/**< allocation function */
+	void *alloc_pw;				/**< alloc private word */
+
+	struct {
+		void (*destroy)(hubbub_charsetcodec *codec);
+		hubbub_error (*encode)(hubbub_charsetcodec *codec,
+				const uint8_t **source, size_t *sourcelen,
+				uint8_t **dest, size_t *destlen);
+		hubbub_error (*decode)(hubbub_charsetcodec *codec,
+				const uint8_t **source, size_t *sourcelen,
+				uint8_t **dest, size_t *destlen);
+		hubbub_error (*reset)(hubbub_charsetcodec *codec);
+	} handler; /**< Vtable; filled in by each codec implementation */
+};
+
+/**
+ * Codec factory: handles_charset() tests support, create() builds a codec
+ */
+typedef struct hubbub_charsethandler {
+	bool (*handles_charset)(const char *charset);
+	hubbub_charsetcodec *(*create)(const char *charset,
+			hubbub_alloc alloc, void *pw);
+} hubbub_charsethandler;
+
+#endif
diff --git a/src/charset/codec_utf8.c b/src/charset/codec_utf8.c
new file mode 100644
index 0000000..86d667f
--- /dev/null
+++ b/src/charset/codec_utf8.c
@@ -0,0 +1,620 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include "charset/aliases.h"
+#include "utils/utf8.h"
+#include "utils/utils.h"
+
+#include "codec_impl.h"
+
+/**
+ * UTF-8 charset codec
+ */
+typedef struct hubbub_utf8_codec {
+	hubbub_charsetcodec base;	/**< Base class (first: cast target) */
+
+#define INVAL_BUFSIZE (32)
+	uint8_t inval_buf[INVAL_BUFSIZE];	/**< Buffer for fixing up
+						 * incomplete input
+						 * sequences */
+	size_t inval_len;		/**< Byte length of inval_buf */
+
+#define READ_BUFSIZE (8)
+	uint32_t read_buf[READ_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (decode)
+						 * (host-endian) */
+	size_t read_len;		/**< Character length of read_buf */
+
+#define WRITE_BUFSIZE (8)
+	uint32_t write_buf[WRITE_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (encode)
+						 * (host-endian) */
+	size_t write_len;		/**< Character length of write_buf */
+
+} hubbub_utf8_codec;
+
+static bool hubbub_utf8_codec_handles_charset(const char *charset);
+static hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset,
+		hubbub_alloc alloc, void *pw);
+static void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec);
+static hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec);
+static hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_utf8_codec_filter_decoded_char(
+		hubbub_utf8_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Report whether a charset name resolves to UTF-8
+ *
+ * \param charset  Name of the charset being queried
+ * \return true if the name maps to the same MIB enum as "UTF-8"
+ */
+bool hubbub_utf8_codec_handles_charset(const char *charset)
+{
+	const uint16_t utf8 = hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"));
+	return hubbub_mibenum_from_name(charset, strlen(charset)) == utf8;
+}
+
+/**
+ * Allocate and set up a utf8 codec instance
+ *
+ * \param charset  Requested charset name (ignored by this codec)
+ * \param alloc    Function used to obtain the codec's memory
+ * \param pw       Client private data handed to ::alloc (may be NULL)
+ * \return The new codec, viewed through its base type, or NULL on failure
+ */
+hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_utf8_codec *self = alloc(NULL, sizeof(*self), pw);
+
+	UNUSED(charset);
+
+	if (self == NULL)
+		return NULL;
+
+	/* Hook up the UTF-8 implementations of the codec operations */
+	self->base.handler.reset = hubbub_utf8_codec_reset;
+	self->base.handler.decode = hubbub_utf8_codec_decode;
+	self->base.handler.encode = hubbub_utf8_codec_encode;
+	self->base.handler.destroy = hubbub_utf8_codec_destroy;
+
+	/* Start out with every staging buffer empty */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	return &self->base;
+}
+
+/**
+ * Destroy a utf8 codec (a no-op here: the body does not touch the codec)
+ *
+ * \param codec  The codec to destroy (unused by this implementation)
+ */
+void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec)
+{
+	UNUSED(codec);
+}
+
+/**
+ * Encode a chunk of UCS4 data into utf8
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK          on success,
+ *         HUBBUB_NOMEM       if output buffer is too small,
+ *         HUBBUB_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *         <any_other_error>  as a result of the failure of the
+ *                            client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call. This buffered data is post-filtering,
+ * so will not be refiltered on the next call.
+ *
+ * In the case of the filter function failing, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the encoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately before the location pointed to by
+ * ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec;
+	uint32_t ucs4;
+	uint32_t *towrite;
+	size_t towritelen;
+	hubbub_error error;
+
+	/* First drain characters that an earlier call had to hold back */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+		uint8_t buf[6];
+		size_t len;
+
+		while (c->write_len > 0) {
+			error = hubbub_utf8_from_ucs4(pwrite[0], buf, &len);
+			if (error != HUBBUB_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* No room: move unwritten chars to the front */
+				for (len = 0; len < c->write_len; len++)
+					c->write_buf[len] = pwrite[len];
+
+				return HUBBUB_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+	/* Now process this call's input, one 4-byte UCS4 unit at a time.
+	 * NOTE(review): presumes *sourcelen is a multiple of 4 -- confirm */
+	while (*sourcelen > 0) {
+		ucs4 = ntohl(*((uint32_t *) (void *) *source));
+		towrite = &ucs4;
+		towritelen = 1;
+
+		/* Run character we're about to output through the
+		 * registered filter, so it can replace it. */
+		if (c->base.filter != NULL) {
+			error = c->base.filter(ucs4,
+					&towrite, &towritelen,
+					c->base.filter_pw);
+			if (error != HUBBUB_OK)
+				return error;
+		}
+
+		/* Output current characters */
+		while (towritelen > 0) {
+			uint8_t buf[6];
+			size_t len;
+
+			error = hubbub_utf8_from_ucs4(towrite[0], buf, &len);
+			if (error != HUBBUB_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* No room; the rest must fit the save area */
+				if (towritelen >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen;
+
+				/* Copy pending chars to save area, for
+				 * processing next call. */
+				for (len = 0; len < towritelen; len++)
+					c->write_buf[len] = towrite[len];
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return HUBBUB_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Decode a chunk of utf8 data into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK          on success,
+ *         HUBBUB_NOMEM       if output buffer is too small,
+ *         HUBBUB_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *         <any_other_error>  as a result of the failure of the
+ *                            client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately at or before the location pointed
+ * to by ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec;
+	hubbub_error error;
+
+	if (c->read_len > 0) {
+		/* Output left over from last decode (held host-endian) */
+		uint32_t *pread = c->read_buf;
+		/* Only writes while the whole remainder still fits */
+		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+			*((uint32_t *) (void *) *dest) = htonl(pread[0]);
+
+			*dest += 4;
+			*destlen -= 4;
+
+			pread++;
+			c->read_len--;
+		}
+
+		if (*destlen < c->read_len * 4) {
+			/* Ran out of output buffer */
+			size_t i;
+
+			/* Shuffle remaining output down */
+			for (i = 0; i < c->read_len; i++)
+				c->read_buf[i] = pread[i];
+
+			return HUBBUB_NOMEM;
+		}
+	}
+
+	if (c->inval_len > 0) {
+		/* The last decode ended in an incomplete sequence.
+		 * Fill up inval_buf with data from the start of the
+		 * new chunk and process it. */
+		uint8_t *in = c->inval_buf;
+		size_t ol = c->inval_len;
+		size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+		size_t orig_l = l;
+		/* Append the new input after the bytes already held */
+		memcpy(c->inval_buf + ol, *source, l);
+
+		l += c->inval_len;
+
+		error = hubbub_utf8_codec_read_char(c,
+				(const uint8_t **) &in, &l, dest, destlen);
+		if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
+			return error;
+		}
+
+		/* Fix up source: l is now the unconsumed byte count, so
+		 * orig_l - l is what was consumed beyond the held bytes */
+		*source += max((signed) (orig_l - l), 0);
+		*sourcelen -= max((signed) (orig_l - l), 0);
+		/* Failed to resolve an incomplete character and
+		 * ran out of buffer space. No recovery strategy
+		 * possible, so explode everywhere. */
+		if ((orig_l + ol) - l == 0)
+			abort();
+
+		/* Report memory exhaustion case from above */
+		if (error != HUBBUB_OK)
+			return error;
+	}
+
+	/* Finally, the "normal" case; process all outstanding characters */
+	while (*sourcelen > 0) {
+		error = hubbub_utf8_codec_read_char(c,
+				source, sourcelen, dest, destlen);
+		if (error != HUBBUB_OK) {
+			return error;
+		}
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Return a utf8 codec to its freshly-created buffering state
+ *
+ * \param codec  The codec whose pending input and output are discarded
+ * \return HUBBUB_OK (this implementation has no failure path)
+ */
+hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec)
+{
+	hubbub_utf8_codec *utf8 = (hubbub_utf8_codec *) codec;
+
+	/* Forget any partially-read input sequence */
+	utf8->inval_len = 0;
+	utf8->inval_buf[0] = '\0';
+
+	/* Drop output still waiting to be written, in both directions */
+	utf8->read_len = utf8->write_len = 0;
+	utf8->read_buf[0] = 0;
+	utf8->write_buf[0] = 0;
+
+	return HUBBUB_OK;
+}
+
+
+/**
+ * Read a single UTF-8 character and convert it to UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_NOMEM       if output buffer is too small,
+ *         HUBBUB_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *         <any_other_error>  as a result of the failure of the
+ *                            client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	uint32_t ucs4;
+	size_t sucs4;
+	hubbub_error error;
+
+	/* Convert a single character; sucs4 receives its byte length */
+	error = hubbub_utf8_to_ucs4(*source, *sourcelen, &ucs4, &sucs4);
+	if (error == HUBBUB_OK) {
+		/* Read a character; filter it and write (or buffer) it */
+		error = hubbub_utf8_codec_filter_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error == HUBBUB_OK || error == HUBBUB_NOMEM) {
+			/* filter function succeeded; update source pointers */
+			*source += sucs4;
+			*sourcelen -= sucs4;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (error == HUBBUB_NEEDDATA) {
+		/* Incomplete input: stash it plus a NUL (hence >=) */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+		/* memmove, as source may already be inval_buf itself */
+		memmove(c->inval_buf, (char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return HUBBUB_OK;
+	} else if (error == HUBBUB_INVALID) {
+		/* Illegal input sequence */
+		uint32_t nextchar;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) {
+			return HUBBUB_INVALID;
+		}
+
+		/* Find next valid UTF-8 sequence.
+		 * We're processing client-provided data, so let's
+		 * be paranoid about its validity. */
+		error = hubbub_utf8_next_paranoid(*source, *sourcelen,
+				0, &nextchar);
+		if (error != HUBBUB_OK) {
+			if (error == HUBBUB_NEEDDATA) {
+				/* Need more data to be sure; stash it
+				 * with a NUL, so it must be < BUFSIZE */
+				if (*sourcelen >= INVAL_BUFSIZE)
+					abort();
+				memmove(c->inval_buf, (char *) *source,
+						*sourcelen);
+				c->inval_buf[*sourcelen] = '\0';
+				c->inval_len = *sourcelen;
+
+				*source += *sourcelen;
+				*sourcelen = 0;
+
+				nextchar = 0;
+			} else {
+				return error;
+			}
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = hubbub_utf8_codec_filter_decoded_char(c,
+				0xFFFD, dest, destlen);
+		if (error == HUBBUB_OK || error == HUBBUB_NOMEM) {
+			/* filter function succeeded; update source pointers */
+			*source += nextchar;
+			*sourcelen -= nextchar;
+		}
+
+		return error;
+	}
+	/* Any other conversion result is ignored and reported as success */
+	return HUBBUB_OK;
+}
+
+/**
+ * Feed a UCS4 character through the registered filter and output the result
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return HUBBUB_OK          on success,
+ *         HUBBUB_NOMEM       if output buffer is too small,
+ *         <any_other_error>  as a result of the failure of the
+ *                            client-provided filter function.
+ */
+hubbub_error hubbub_utf8_codec_filter_decoded_char(hubbub_utf8_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	if (c->base.filter != NULL) {
+		/* Default to passing ucs4 straight through, as encode does */
+		uint32_t *rep = &ucs4;
+		size_t replen = 1;
+		hubbub_error error;
+
+		error = c->base.filter(ucs4, &rep, &replen,
+				c->base.filter_pw);
+		if (error != HUBBUB_OK)
+			return error;
+
+		/* Write big endian, while all that remains still fits */
+		while (replen > 0 && *destlen >= replen * 4) {
+			*((uint32_t *) (void *) *dest) = htonl(*rep);
+			*dest += 4;
+			*destlen -= 4;
+			rep++;
+			replen--;
+		}
+
+		if (*destlen < replen * 4) {
+			/* Run out of output buffer */
+			size_t i;
+
+			/* The save area is fixed-size; never overrun it */
+			if (replen > READ_BUFSIZE)
+				abort();
+
+			/* Buffer remaining output for the next call */
+			c->read_len = replen;
+			for (i = 0; i < replen; i++)
+				c->read_buf[i] = rep[i];
+
+			return HUBBUB_NOMEM;
+		}
+	} else {
+		if (*destlen < 4) {
+			/* Run out of output buffer */
+			c->read_len = 1;
+			c->read_buf[0] = ucs4;
+
+			return HUBBUB_NOMEM;
+		}
+
+		*((uint32_t *) (void *) *dest) = htonl(ucs4);
+		*dest += 4;
+		*destlen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+
+const hubbub_charsethandler hubbub_utf8_codec_handler = {
+	.handles_charset = hubbub_utf8_codec_handles_charset,	/* probe */
+	.create = hubbub_utf8_codec_create			/* factory */
+};
diff --git a/src/charset/detect.c b/src/charset/detect.c
new file mode 100644
index 0000000..8ff3b87
--- /dev/null
+++ b/src/charset/detect.c
@@ -0,0 +1,673 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "utils/utils.h"
+
+#include "detect.h"
+
+static uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len);
+static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len);
+static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
+		const uint8_t *end);
+static uint16_t hubbub_charset_parse_content(const uint8_t *value,
+		uint32_t valuelen);
+static bool hubbub_charset_get_attribute(const uint8_t **data,
+		const uint8_t *end,
+		const uint8_t **name, uint32_t *namelen,
+		const uint8_t **value, uint32_t *valuelen);
+
+/**
+ * Extract a charset from a chunk of data
+ *
+ * \param data     Pointer to pointer to buffer containing data
+ * \param len      Pointer to buffer length
+ * \param mibenum  Pointer to location to store MIB enum representing charset
+ * \param source   Pointer to location to receive charset source
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * The data pointer and length will be modified by this function if
+ * a byte order mark is encountered at the start of the buffer. The updated
+ * data pointer will point to the first byte in the buffer after the BOM.
+ * The length will be modified appropriately.
+ *
+ * The larger a chunk of data fed to this routine, the better, as it allows
+ * charset autodetection access to a larger dataset for analysis.
+ */
+hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
+		uint16_t *mibenum, hubbub_charset_source *source)
+{
+	uint16_t charset = 0;
+
+	if (data == NULL || *data == NULL || len == NULL ||
+			mibenum == NULL || source == NULL)
+		return HUBBUB_BADPARM;
+
+	/* With fewer than 4 bytes, go straight to the default charset */
+	if (*len < 4)
+		goto default_encoding;
+
+	/* First, look for a BOM (this also steps data/len past it) */
+	charset = hubbub_charset_read_bom(data, len);
+	if (charset != 0) {
+		*mibenum = charset;
+		*source = HUBBUB_CHARSET_DOCUMENT;
+
+		return HUBBUB_OK;
+	}
+
+	/* No BOM was found, so we must look for a meta charset within
+	 * the document itself. */
+	charset = hubbub_charset_scan_meta(*data, *len);
+	if (charset != 0) {
+		/* ISO-8859-1 becomes Windows-1252 */
+		if (charset == hubbub_mibenum_from_name("ISO-8859-1",
+				SLEN("ISO-8859-1"))) {
+			charset = hubbub_mibenum_from_name("Windows-1252",
+					SLEN("Windows-1252"));
+			/* Fallback to 8859-1 if that failed */
+			if (charset == 0)
+				charset = hubbub_mibenum_from_name(
+					"ISO-8859-1", SLEN("ISO-8859-1"));
+		}
+
+		/* If we've encountered a meta charset for a non-ASCII-
+		 * compatible encoding, don't trust it.
+		 *
+		 * Firstly, it should have been sent with a BOM (and thus
+		 * detected above).
+		 *
+		 * Secondly, we've just used an ASCII-only parser to
+		 * extract the encoding from the document. Therefore,
+		 * the document plainly isn't what the meta charset
+		 * claims it is.
+		 *
+		 * What we do in this case is to ignore the meta charset's
+		 * claims and leave the charset determination to the
+		 * autodetection routines (or the fallback case if they
+		 * fail).
+		 */
+		if (charset != hubbub_mibenum_from_name("UTF-16",
+					SLEN("UTF-16")) &&
+			charset != hubbub_mibenum_from_name("UTF-16LE",
+					SLEN("UTF-16LE")) &&
+			charset != hubbub_mibenum_from_name("UTF-16BE",
+					SLEN("UTF-16BE")) &&
+			charset != hubbub_mibenum_from_name("UTF-32",
+					SLEN("UTF-32")) &&
+			charset != hubbub_mibenum_from_name("UTF-32LE",
+					SLEN("UTF-32LE")) &&
+			charset != hubbub_mibenum_from_name("UTF-32BE",
+					SLEN("UTF-32BE"))) {
+
+			*mibenum = charset;
+			*source = HUBBUB_CHARSET_DOCUMENT;
+
+			return HUBBUB_OK;
+		}
+	}
+
+	/* No charset was specified within the document, attempt to
+	 * autodetect the encoding from the data that we have available. */
+
+	/** \todo Charset autodetection */
+
+	/* We failed to autodetect a charset, so use the default fallback */
+default_encoding:
+	/* Prefer Windows-1252; use ISO-8859-1 if that name is unknown */
+	charset = hubbub_mibenum_from_name("Windows-1252",
+			SLEN("Windows-1252"));
+	if (charset == 0)
+		charset = hubbub_mibenum_from_name("ISO-8859-1",
+				SLEN("ISO-8859-1"));
+
+	*mibenum = charset;
+	*source = HUBBUB_CHARSET_DEFAULT;
+
+	return HUBBUB_OK;
+}
+
+
+/**
+ * Inspect the beginning of a buffer of data for the presence of a
+ * UTF Byte Order Mark.
+ *
+ * \param data  Pointer to pointer to buffer containing data
+ * \param len   Pointer to buffer length
+ * \return MIB enum representing encoding described by BOM, or 0 if not found
+ *
+ * If a BOM is found, the data pointer will be modified to point to the first
+ * byte in the buffer after the BOM. The length will also be modified
+ * appropriately.
+ */
+uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len)
+{
+	if (data == NULL || *data == NULL || len == NULL)
+		return 0;
+
+	/* We require at least 4 bytes of data (the longest BOM tested) */
+	if (*len < 4)
+		return 0;
+
+#define UTF32BOM_LEN (4)
+#define UTF16BOM_LEN (2)
+#define UTF8BOM_LEN  (3)
+	/* UTF-32 first: FF FE 00 00 begins with the UTF-16LE BOM (FF FE) */
+	if ((*data)[0] == 0x00 && (*data)[1] == 0x00 &&
+			(*data)[2] == 0xFE && (*data)[3] == 0xFF) {
+		*data += UTF32BOM_LEN;
+		*len  -= UTF32BOM_LEN;
+
+		return hubbub_mibenum_from_name("UTF-32BE",
+				SLEN("UTF-32BE"));
+	} else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE &&
+			(*data)[2] == 0x00 && (*data)[3] == 0x00) {
+		*data += UTF32BOM_LEN;
+		*len  -= UTF32BOM_LEN;
+
+		return hubbub_mibenum_from_name("UTF-32LE",
+				SLEN("UTF-32LE"));
+	} else if ((*data)[0] == 0xFE && (*data)[1] == 0xFF) {
+		*data += UTF16BOM_LEN;
+		*len  -= UTF16BOM_LEN;
+
+		return hubbub_mibenum_from_name("UTF-16BE",
+				SLEN("UTF-16BE"));
+	} else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE) {
+		*data += UTF16BOM_LEN;
+		*len  -= UTF16BOM_LEN;
+
+		return hubbub_mibenum_from_name("UTF-16LE",
+				SLEN("UTF-16LE"));
+	} else if ((*data)[0] == 0xEF && (*data)[1] == 0xBB &&
+			(*data)[2] == 0xBF) {
+		*data += UTF8BOM_LEN;
+		*len  -= UTF8BOM_LEN;
+
+		return hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"));
+	}
+
+#undef UTF32BOM_LEN
+#undef UTF16BOM_LEN
+#undef UTF8BOM_LEN
+	/* No recognised BOM: data and len are left untouched */
+	return 0;
+}
+
+/* True when token a (case-insensitive) starts at pos; uses pos and end */
+#define PEEK(a)								\
+	(pos < end - SLEN(a) && 					\
+		strncasecmp((const char *) pos, a, SLEN(a)) == 0)
+
+/* Seek pos to the next token a; makes the caller return 0 if not found */
+#define ADVANCE(a)							\
+	while (pos < end - SLEN(a) && !PEEK(a))				\
+		pos++;							\
+									\
+	if (pos >= end - SLEN(a))					\
+		return 0;
+
+/* True for the bytes TAB, LF, VT, FF, CR and space (a is evaluated 6x) */
+#define ISSPACE(a)							\
+	((a) == 0x09 || (a) == 0x0a || (a) == 0x0b || 			\
+		(a) == 0x0c || (a) == 0x0d || (a) == 0x20)
+
+/**
+ * Search for a meta charset within a buffer of data
+ *
+ * \param data  Pointer to buffer containing data
+ * \param len   Length of data buffer, in bytes
+ * \return MIB enum representing encoding, or 0 if none found
+ */
+uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len)
+{
+	const uint8_t *pos = data;
+	const uint8_t *end;
+	uint16_t mibenum;
+
+	if (data == NULL)
+		return 0;
+	/* Examine no more than the first 512 bytes of the buffer */
+	end = pos + min(512, len);
+
+	/* 1. Walk the buffer, dispatching on what starts at pos */
+	while (pos < end) {
+		/* a: comment -- seek to its "-->" terminator */
+		if (PEEK("<!--")) {
+			pos += SLEN("<!--");
+			ADVANCE("-->");
+		/* b: meta element -- look for a charset in its attributes */
+		} else if (PEEK("<meta")) {
+			if (pos + SLEN("<meta") >= end - 1)
+				return 0;
+
+			if (ISSPACE(*(pos + SLEN("<meta")))) {
+				/* 1 */
+				pos += SLEN("<meta");
+
+				mibenum = hubbub_charset_parse_attributes(
+						&pos, end);
+				if (mibenum != 0)
+					return mibenum;
+
+				if (pos >= end)
+					return 0;
+			}
+		/* c: "</" or "<" followed by an ASCII letter (any case) */
+		} else if ((PEEK("</") && (pos < end - 3 &&
+				(0x41 <= (*(pos + 2) & ~ 0x20) &&
+				(*(pos + 2) & ~ 0x20) <= 0x5A))) ||
+				(pos < end - 2 && *pos == '<' &&
+				(0x41 <= (*(pos + 1) & ~ 0x20) &&
+				(*(pos + 1) & ~ 0x20) <= 0x5A))) {
+
+			/* skip '<' */
+			pos++;
+
+			/* 1. Skip to the next whitespace, '>' or '<' */
+			while (pos < end) {
+				if (ISSPACE(*pos) ||
+						*pos == '>' || *pos == '<')
+					break;
+				pos++;
+			}
+
+			if (pos >= end)
+				return 0;
+
+			/* 3: consume (and discard) the tag's attributes */
+			if (*pos != '<') {
+				const uint8_t *n;
+				const uint8_t *v;
+				uint32_t nl, vl;
+
+				while (hubbub_charset_get_attribute(&pos, end,
+						&n, &nl, &v, &vl))
+					; /* do nothing */
+			/* 2: a new '<' -- reprocess it from the top */
+			} else
+				continue;
+		/* d: "<!", "</" or "<?" -- seek to the next '>' */
+		} else if (PEEK("<!") || PEEK("</") || PEEK("<?")) {
+			pos++;
+			ADVANCE(">");
+		}
+
+		/* e - do nothing */
+
+		/* 2: move on to the next byte */
+		pos++;
+	}
+
+	return 0;
+}
+
+/**
+ * Parse attributes on a meta tag
+ *
+ * \param pos  Pointer to pointer to current location (updated on exit)
+ * \param end  Pointer to end of data stream
+ * \return MIB enum of detected encoding, or 0 if none found
+ */
+uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
+		const uint8_t *end)
+{
+	const uint8_t *name;
+	const uint8_t *value;
+	uint32_t namelen, valuelen;
+	uint16_t mibenum;
+
+	if (pos == NULL || *pos == NULL || end == NULL)
+		return 0;
+
+	/* 2: take the attributes one at a time */
+	while (hubbub_charset_get_attribute(pos, end,
+			&name, &namelen, &value, &valuelen)) {
+		/* 3 */
+		/* a: charset="..." names the encoding directly */
+		if (namelen == SLEN("charset") && valuelen > 0 &&
+				strncasecmp((const char *) name, "charset",
+					SLEN("charset")) == 0) {
+			/* strip value; both ends are bounded by valuelen */
+			while (valuelen > 0 && ISSPACE(*value)) {
+				value++;
+				valuelen--;
+			}
+
+			while (valuelen > 0 && ISSPACE(value[valuelen - 1]))
+				valuelen--;
+
+			mibenum = hubbub_mibenum_from_name(
+					(const char *) value, valuelen);
+			if (mibenum != 0)
+				return mibenum;
+		/* b: content="..." may carry a charset parameter */
+		} else if (namelen == SLEN("content") && valuelen > 0 &&
+				strncasecmp((const char *) name, "content",
+					SLEN("content")) == 0) {
+			mibenum = hubbub_charset_parse_content(value,
+					valuelen);
+			if (mibenum != 0)
+				return mibenum;
+		}
+
+		/* c - do nothing */
+
+		/* 1: skip ahead to the next whitespace byte */
+		while (*pos < end) {
+			if (ISSPACE(**pos))
+				break;
+			(*pos)++;
+		}
+
+		if (*pos >= end) {
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Parse a content= attribute's value
+ *
+ * \param value     Attribute's value
+ * \param valuelen  Length of value
+ * \return MIB enum of detected encoding, or 0 if none found
+ */
+uint16_t hubbub_charset_parse_content(const uint8_t *value,
+		uint32_t valuelen)
+{
+	const uint8_t *end;
+	const uint8_t *tentative = NULL;
+	uint32_t tentative_len = 0;
+
+	if (value == NULL)
+		return 0;
+
+	end = value + valuelen;
+
+	/* 1: skip up to and including the first ';' */
+	while (value < end) {
+		if (*value == ';') {
+			value++;
+			break;
+		}
+
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 2: skip whitespace after the ';' */
+	while (value < end && ISSPACE(*value)) {
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 3: need "charset" (any case) with at least one byte after it */
+	if ((size_t) (end - value) <= SLEN("charset") ||
+			strncasecmp((const char *) value,
+					"charset", SLEN("charset")) != 0)
+		return 0;
+
+	value += SLEN("charset");
+
+	/* 4: skip whitespace before the '=' */
+	while (value < end && ISSPACE(*value)) {
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 5: an '=' must follow */
+	if (*value != '=')
+		return 0;
+	/* skip '=' */
+	value++;
+
+	/* 6: skip whitespace after the '=' */
+	while (value < end && ISSPACE(*value)) {
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 7: what follows is the candidate charset name */
+	tentative = value;
+
+	/* a: double-quoted; discarded if the closing quote is missing */
+	if (*value == '"') {
+		while (++value < end && *value != '"') {
+			tentative_len++;
+		}
+
+		if (value < end)
+			tentative++;
+		else
+			tentative = NULL;
+	/* b: single-quoted; handled the same way */
+	} else if (*value == '\'') {
+		while (++value < end && *value != '\'') {
+			tentative_len++;
+		}
+
+		if (value < end)
+			tentative++;
+		else
+			tentative = NULL;
+	/* c: unquoted; runs to the next whitespace or the end */
+	} else {
+		while (value < end && !ISSPACE(*value)) {
+			value++;
+			tentative_len++;
+		}
+	}
+
+	/* 8: look the candidate name up */
+	if (tentative != NULL) {
+		return hubbub_mibenum_from_name((const char *) tentative,
+				tentative_len);
+	}
+
+	/* 9: no usable name */
+	return 0;
+}
+
+/**
+ * Extract an attribute from the data stream
+ *
+ * \param data      Pointer to pointer to current location (updated on exit)
+ * \param end       Pointer to end of data stream
+ * \param name      Pointer to location to receive attribute name
+ * \param namelen   Pointer to location to receive attribute name length
+ * \param value     Pointer to location to receive attribute value
+ * \param valuelen  Pointer to location to receive attribute value length
+ * \return true if attribute extracted, false otherwise.
+ *
+ * Note: The caller should heed the returned lengths; these are the only
+ * indicator that useful content resides in name or value.
+ */
+bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end,
+		const uint8_t **name, uint32_t *namelen,
+		const uint8_t **value, uint32_t *valuelen)
+{
+	const uint8_t *pos;
+
+	if (data == NULL || *data == NULL || end == NULL || name == NULL ||
+			namelen == NULL || value == NULL || valuelen == NULL)
+		return false;
+
+	pos = *data;
+
+	/* 1. Skip leading spaces or '/' characters */
+	while (pos < end && (ISSPACE(*pos) || *pos == '/')) {
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	/* 2. Invalid element open character; step back one byte, so a
+	 * caller that then advances by one lands on the '<' again */
+	if (*pos == '<') {
+		pos--;
+		*data = pos;
+		return false;
+	}
+
+	/* 3. End of element */
+	if (*pos == '>') {
+		*data = pos;
+		return false;
+	}
+
+	/* 4. Initialise name & value to empty string */
+	*name = pos;
+	*namelen = 0;
+	*value = (const uint8_t *) "";
+	*valuelen = 0;
+
+	/* 5. Extract name */
+	while (pos < end) {
+		/* a */
+		if (*pos == '=') {
+			break;
+		}
+
+		/* b */
+		if (ISSPACE(*pos)) {
+			break;
+		}
+
+		/* c: name-only attribute. The position must be published
+		 * here too, or a caller looping on this function would
+		 * be handed the same attribute forever. */
+		if (*pos == '/' || *pos == '<' || *pos == '>') {
+			*data = pos;
+			return true;
+		}
+		/* d is handled by strncasecmp in _parse_attributes */
+		/* e */
+		(*namelen)++;
+
+		/* 6 */
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	if (ISSPACE(*pos)) {
+		/* 7. Skip trailing spaces */
+		while (pos < end && ISSPACE(*pos)) {
+			pos++;
+		}
+
+		if (pos >= end) {
+			*data = pos;
+			return false;
+		}
+
+		/* 8. Must be '='; otherwise the attribute has no value */
+		if (*pos != '=') {
+			pos--;
+			*data = pos;
+			return true;
+		}
+	}
+
+	/* 9. Skip '=' */
+	pos++;
+
+	/* 10. Skip any spaces after '=' */
+	while (pos < end && ISSPACE(*pos)) {
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	/* 11. Extract value, if quoted */
+	/* a */
+	if (*pos == '\'' || *pos == '"') {
+		/* 1 */
+		const uint8_t *quote = pos;
+
+		/* 2 */
+		while (++pos < end) {
+			/* 3: matching close quote; value lies between */
+			if (*pos == *quote) {
+				*value = (quote + 1);
+				*data = ++pos;
+				return true;
+			}
+
+			/* 4 is handled by strncasecmp */
+
+			/* 5 */
+			(*valuelen)++;
+
+			/* 6 */
+		}
+
+		if (pos >= end) {
+			*data = pos;
+			return false;
+		}
+	}
+
+	/* b: the value is empty */
+	if (*pos == '<' || *pos == '>') {
+		*data = pos;
+		return true;
+	}
+
+	/* c is handled by strncasecmp */
+
+	/* d */
+	*value = pos;
+
+	while (pos < end) {
+		/* 12. Extract unquoted value */
+		/* a */
+		if (ISSPACE(*pos) || *pos == '<' || *pos == '>') {
+			*data = pos;
+			return true;
+		}
+
+		/* b is handled by strncasecmp */
+
+		/* c */
+		(*valuelen)++;
+
+		/* 13. Advance */
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	/* should never be reached */
+	abort();
+
+	return false;
+}
diff --git a/src/charset/detect.h b/src/charset/detect.h
new file mode 100644
index 0000000..854a8d6
--- /dev/null
+++ b/src/charset/detect.h
@@ -0,0 +1,22 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_charset_detect_h_
+#define hubbub_charset_detect_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+/* Extract a charset from a chunk of data */
+hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
+		uint16_t *mibenum, hubbub_charset_source *source);
+
+#endif
+
diff --git a/src/hubbub.c b/src/hubbub.c
new file mode 100644
index 0000000..32e0a1f
--- /dev/null
+++ b/src/hubbub.c
@@ -0,0 +1,63 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <hubbub/hubbub.h>
+
+#include "charset/aliases.h"
+#include "tokeniser/entities.h"
+
+/**
+ * Initialise the Hubbub library for use.
+ *
+ * This _must_ be called before using any hubbub functions
+ *
+ * \param aliases_file  Pointer to name of file containing encoding alias data
+ * \param alloc         Pointer to (de)allocation function
+ * \param pw            Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success, applicable error otherwise.
+ */
+hubbub_error hubbub_initialise(const char *aliases_file,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_error status;
+
+	if (aliases_file == NULL || alloc == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Stage 1: encoding aliases, read from the named file */
+	status = hubbub_aliases_create(aliases_file, alloc, pw);
+	if (status != HUBBUB_OK)
+		return status;
+
+	/* Stage 2: entities; if this fails, stage 1 is rolled back */
+	status = hubbub_entities_create(alloc, pw);
+	if (status != HUBBUB_OK)
+		hubbub_aliases_destroy(alloc, pw);
+
+	return status;
+}
+
+/**
+ * Clean up after Hubbub
+ *
+ * \param alloc  Pointer to (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success, applicable error otherwise.
+ */
+hubbub_error hubbub_finalise(hubbub_alloc alloc, void *pw)
+{
+	if (alloc == NULL)
+		return HUBBUB_BADPARM;
+	/* Tear down in the reverse of the order hubbub_initialise() uses */
+	hubbub_entities_destroy(alloc, pw);
+
+	hubbub_aliases_destroy(alloc, pw);
+
+	return HUBBUB_OK;
+}
+
+
diff --git a/src/input/Makefile b/src/input/Makefile
new file mode 100644
index 0000000..8b06c63
--- /dev/null
+++ b/src/input/Makefile
@@ -0,0 +1,53 @@
+# Makefile for libhubbub
+#
+# Toolchain is exported by top-level makefile
+#
+# Top-level makefile also exports the following variables:
+#
+# COMPONENT  Name of component
+# EXPORT     Absolute path of export directory
+# TOP        Absolute path of source tree root
+#
+# The top-level makefile requires the following targets to exist:
+#
+# clean      Clean source tree
+# debug      Create a debug binary
+# distclean  Fully clean source tree, back to pristine condition
+# export     Export distributable components to ${EXPORT}
+# release    Create a release binary
+# setup      Perform any setup required prior to compilation
+# test       Execute any test cases
+
+# Manipulate include paths
+CFLAGS += -I$(CURDIR)
+
+# Objects
+OBJS = filter inputstream utf8_stream
+
+.PHONY: clean debug distclean export release setup test
+
+# Targets: objects go to ../Release/ or ../Debug/; other targets are no-ops
+release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS)))
+
+debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS)))
+
+clean:
+	-@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS}))
+	-@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS}))
+
+distclean:
+
+setup:
+
+export:
+
+test:
+
+# Pattern rules: release objects define NDEBUG, debug objects carry -g
+../Release/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c ${CFLAGS} -DNDEBUG -o $@ $<
+
+../Debug/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c -g ${CFLAGS} -o $@ $<
diff --git a/src/input/filter.c b/src/input/filter.c
new file mode 100644
index 0000000..5ac5391
--- /dev/null
+++ b/src/input/filter.c
@@ -0,0 +1,380 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "charset/codec.h"
+#include "utils/utils.h"
+
+#include "input/filter.h"
+
+
+/** Input filter: decodes input into a pivot buffer, then re-encodes it */
+struct hubbub_filter {
+	hubbub_charsetcodec *read_codec;	/**< Read (decoding) codec */
+	hubbub_charsetcodec *write_codec;	/**< Write (encoding) codec */
+
+	uint32_t filter_output[2];	/**< Character filter output */
+	uint32_t last_filter_char;	/**< Previous char given to filter */
+
+	uint32_t pivot_buf[64];		/**< Conversion pivot buffer */
+
+	bool leftover;			/**< Data remains from last call */
+	uint8_t *pivot_left;		/**< Remaining pivot to write */
+	size_t pivot_len;		/**< Length of pivot remaining */
+
+	struct {
+		uint16_t encoding;	/**< Input encoding (MIB enum) */
+	} settings;			/**< Filter settings */
+
+	hubbub_alloc alloc;		/**< Memory (de)allocation function */
+	void *pw;			/**< Client private data */
+};
+
+static hubbub_error hubbub_filter_set_defaults(hubbub_filter *input);
+static hubbub_error hubbub_filter_set_encoding(hubbub_filter *input,
+		const char *enc);
+static hubbub_error read_character_filter(uint32_t c,
+		uint32_t **output, size_t *outputlen, void *pw);
+
+/**
+ * Create an input filter
+ *
+ * \param int_enc  Desired encoding of document
+ * \param alloc    Function used to (de)allocate data
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to filter instance, or NULL on failure
+ */
+hubbub_filter *hubbub_filter_create(const char *int_enc,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_filter *filter;
+
+	if (alloc == NULL)
+		return NULL;
+
+	filter = alloc(NULL, sizeof(*filter), pw);
+	if (!filter)
+		return NULL;
+
+	filter->last_filter_char = 0;
+
+	filter->leftover = false;
+	filter->pivot_left = NULL;
+	filter->pivot_len = 0;
+
+	filter->alloc = alloc;
+	filter->pw = pw;
+
+	/* Installs the default read codec and leaves write_codec NULL */
+	if (hubbub_filter_set_defaults(filter) == HUBBUB_OK)
+		filter->write_codec = hubbub_charsetcodec_create(int_enc,
+				alloc, pw);
+
+	if (filter->write_codec == NULL) {
+		/* Also frees a read codec left by a failed set_defaults */
+		if (filter->read_codec != NULL)
+			hubbub_charsetcodec_destroy(filter->read_codec);
+		filter->alloc(filter, 0, pw);
+		return NULL;
+	}
+
+	return filter;
+}
+
+/**
+ * Destroy an input filter
+ *
+ * \param input  Pointer to filter instance
+ */
+void hubbub_filter_destroy(hubbub_filter *input)
+{
+	/* Destroying a NULL filter is a harmless no-op */
+	if (input == NULL)
+		return;
+
+	/* Release whichever codecs exist, read side first */
+	if (input->read_codec != NULL)
+		hubbub_charsetcodec_destroy(input->read_codec);
+	if (input->write_codec != NULL)
+		hubbub_charsetcodec_destroy(input->write_codec);
+
+	/* Hand the filter itself back to the client allocator (size 0) */
+	input->alloc(input, 0, input->pw);
+}
+
+/**
+ * Configure an input filter
+ *
+ * \param input   Pointer to filter instance
+ * \param type    Input option type to configure
+ * \param params  Option-specific parameters
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_filter_setopt(hubbub_filter *input,
+		hubbub_filter_opttype type,
+		hubbub_filter_optparams *params)
+{
+	/* Both the filter and the option parameters are mandatory */
+	if (input == NULL || params == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Dispatch on the option being configured */
+	if (type == HUBBUB_FILTER_SET_ENCODING) {
+		const char *name = params->encoding.name;
+
+		return hubbub_filter_set_encoding(input, name);
+	}
+
+	/* Any other option type is ignored and reported as success */
+	return HUBBUB_OK;
+}
+
+/**
+ * Process a chunk of data
+ *
+ * \param input   Pointer to filter instance
+ * \param data    Pointer to pointer to input buffer
+ * \param len     Pointer to length of input buffer
+ * \param output  Pointer to pointer to output buffer
+ * \param outlen  Pointer to length of output buffer
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * Call this with an input buffer length of 0 to flush any buffers.
+ */
+hubbub_error hubbub_filter_process_chunk(hubbub_filter *input,
+		const uint8_t **data, size_t *len,
+		uint8_t **output, size_t *outlen)
+{
+	hubbub_error read_error, write_error;
+
+	if (input == NULL || data == NULL || *data == NULL || len == NULL ||
+			output == NULL || *output == NULL || outlen == NULL)
+		return HUBBUB_BADPARM;
+
+	if (input->leftover) {
+		/* Some data left to be written from last call */
+
+		/* Attempt to flush the remaining data. */
+		write_error = hubbub_charsetcodec_encode(input->write_codec,
+				(const uint8_t **) &input->pivot_left,
+				&input->pivot_len,
+				output, outlen);
+
+		if (write_error != HUBBUB_OK) {
+			return write_error;
+		}
+
+		/* And clear leftover */
+		input->pivot_left = NULL;
+		input->pivot_len = 0;
+		input->leftover = false;
+	}
+
+	while (*len > 0) {
+		size_t pivot_len = sizeof(input->pivot_buf);
+		uint8_t *pivot = (uint8_t *) input->pivot_buf;
+
+		read_error = hubbub_charsetcodec_decode(input->read_codec,
+				data, len,
+				(uint8_t **) &pivot, &pivot_len);
+		/* Rewind; decode presumably left unused space in pivot_len */
+		pivot = (uint8_t *) input->pivot_buf;
+		pivot_len = sizeof(input->pivot_buf) - pivot_len;
+
+		if (pivot_len > 0) {
+			write_error = hubbub_charsetcodec_encode(
+					input->write_codec,
+					(const uint8_t **) &pivot,
+					&pivot_len,
+					output, outlen);
+			/* Keep what encode (presumably) left unconsumed */
+			if (write_error != HUBBUB_OK) {
+				input->leftover = true;
+				input->pivot_left = pivot;
+				input->pivot_len = pivot_len;
+
+				return write_error;
+			}
+		}
+		/* NOMEM from decode is not fatal here: keep looping */
+		if (read_error != HUBBUB_OK && read_error != HUBBUB_NOMEM)
+			return read_error;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Reset an input filter's state
+ *
+ * \param input  The input filter to reset
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_filter_reset(hubbub_filter *input)
+{
+	hubbub_error error;
+
+	if (input == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Clear pivot buffer leftovers */
+	input->pivot_left = NULL;
+	input->pivot_len = 0;
+	input->leftover = false;
+
+	/* Forget the last filtered character: a CR trapped before the
+	 * reset must not turn into a spurious LF in the new stream. */
+	input->last_filter_char = 0;
+
+	/* Reset read codec */
+	error = hubbub_charsetcodec_reset(input->read_codec);
+	if (error != HUBBUB_OK)
+		return error;
+
+	/* Reset write codec; its result is the result of the whole reset */
+	return hubbub_charsetcodec_reset(input->write_codec);
+}
+
+/**
+ * Set an input filter's default settings
+ *
+ * \param input  Input filter to configure
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_filter_set_defaults(hubbub_filter *input)
+{
+	if (input == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Start from a blank slate: no codecs, no encoding selected */
+	input->read_codec = NULL;
+	input->write_codec = NULL;
+	input->settings.encoding = 0;
+
+	/* Default input encoding; this creates the read codec.
+	 *
+	 * The outcome of selecting it is the outcome of the whole call,
+	 * so it is returned directly rather than via a local variable. */
+	return hubbub_filter_set_encoding(input, "ISO-8859-1");
+}
+
+/**
+ * Set an input filter's encoding
+ *
+ * \param input  Input filter to configure
+ * \param enc    Encoding name
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_filter_set_encoding(hubbub_filter *input,
+		const char *enc)
+{
+	hubbub_charsetcodec *codec;
+	uint16_t mibenum;
+	hubbub_error error;
+	hubbub_charsetcodec_optparams params;
+
+	if (input == NULL || enc == NULL)
+		return HUBBUB_BADPARM;
+
+	mibenum = hubbub_mibenum_from_name(enc, strlen(enc));
+	if (mibenum == 0)
+		return HUBBUB_INVALID;
+
+	/* Exit early if we're already using this encoding */
+	if (input->settings.encoding == mibenum)
+		return HUBBUB_OK;
+
+	/* Create and configure the new codec before destroying the old one,
+	 * so a failure leaves the filter's existing read codec (and the
+	 * matching settings.encoding) intact rather than NULL or stale. */
+	codec = hubbub_charsetcodec_create(enc, input->alloc, input->pw);
+	if (codec == NULL)
+		return HUBBUB_NOMEM;
+
+	/* Register filter function */
+	params.filter_func.filter = read_character_filter;
+	params.filter_func.pw = (void *) input;
+	error = hubbub_charsetcodec_setopt(codec,
+			HUBBUB_CHARSETCODEC_FILTER_FUNC, &params);
+	if (error != HUBBUB_OK) {
+		hubbub_charsetcodec_destroy(codec);
+		return error;
+	}
+
+	/* Commit: swap in the new codec and record its encoding */
+	if (input->read_codec != NULL)
+		hubbub_charsetcodec_destroy(input->read_codec);
+	input->read_codec = codec;
+	input->settings.encoding = mibenum;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Character filter function for read characters
+ *
+ * \param c          The read character (UCS4 - host byte order)
+ * \param output     Pointer to pointer to output buffer (filled on exit)
+ * \param outputlen  Pointer to output buffer length (filled on exit)
+ * \param pw         Pointer to client-specific private data.
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error read_character_filter(uint32_t c, uint32_t **output,
+		size_t *outputlen, void *pw)
+{
+	hubbub_filter *input = (hubbub_filter *) pw;
+	size_t len = 0;
+
+	if (output == NULL || outputlen == NULL || pw == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Line ending normalisation:
+	 *   CRLF -> LF  (trap CR and let LF through unmodified)
+	 *   CR   -> LF  (trap CR and convert to LF if not CRLF)
+	 *   LF   -> LF  (leave LF alone)
+	 *
+	 * A trapped CR is resolved when the next character arrives. That
+	 * must happen before the NUL/CR checks on the new character, or
+	 * the pending CR is silently lost in "CR CR" and "CR NUL" input.
+	 * At most two characters are emitted: filter_output's capacity.
+	 */
+
+#define NUL (0x00000000)
+#define CR  (0x0000000D)
+#define LF  (0x0000000A)
+#define REP (0x0000FFFD)
+
+	/* Last char was CR and this isn't LF => CR -> LF */
+	if (input->last_filter_char == CR && c != LF)
+		input->filter_output[len++] = LF;
+
+	if (c == NUL) {
+		/* Replace NUL (U+0000) characters in input with U+FFFD */
+		input->filter_output[len++] = REP;
+	} else if (c != CR) {
+		/* Let character through; a CR is trapped (nothing emitted) */
+		input->filter_output[len++] = c;
+	}
+
+#undef NUL
+#undef CR
+#undef LF
+#undef REP
+
+	/* Remember c so that a trapped CR is resolved on the next call */
+	input->last_filter_char = c;
+
+	*output = input->filter_output;
+	*outputlen = len;
+
+	return HUBBUB_OK;
+}
diff --git a/src/input/filter.h b/src/input/filter.h
new file mode 100644
index 0000000..6650e09
--- /dev/null
+++ b/src/input/filter.h
@@ -0,0 +1,57 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_input_filter_h_
+#define hubbub_input_filter_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+
+typedef struct hubbub_filter hubbub_filter;
+
+/**
+ * Input filter option types (the type argument of hubbub_filter_setopt)
+ */
+typedef enum hubbub_filter_opttype {
+	HUBBUB_FILTER_SET_ENCODING       = 0,
+} hubbub_filter_opttype;
+
+/**
+ * Input filter option parameters (one member per option type)
+ */
+typedef union hubbub_filter_optparams {
+	/** Parameters for HUBBUB_FILTER_SET_ENCODING */
+	struct {
+		/** Name of the encoding to select */
+		const char *name;
+	} encoding;
+} hubbub_filter_optparams;
+
+
+/* Create an input filter */
+hubbub_filter *hubbub_filter_create(const char *int_enc,
+		hubbub_alloc alloc, void *pw);
+/* Destroy an input filter */
+void hubbub_filter_destroy(hubbub_filter *input);
+
+/* Configure an input filter */
+hubbub_error hubbub_filter_setopt(hubbub_filter *input,
+		hubbub_filter_opttype type,
+		hubbub_filter_optparams *params);
+
+/* Process a chunk of data */
+hubbub_error hubbub_filter_process_chunk(hubbub_filter *input,
+		const uint8_t **data, size_t *len,
+		uint8_t **output, size_t *outlen);
+
+/* Reset an input filter's state */
+hubbub_error hubbub_filter_reset(hubbub_filter *input);
+
+#endif
+
diff --git a/src/input/inputstream.c b/src/input/inputstream.c
new file mode 100644
index 0000000..f82d279
--- /dev/null
+++ b/src/input/inputstream.c
@@ -0,0 +1,479 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+
+#include "charset/aliases.h"
+#include "input/streamimpl.h"
+
+/**
+ * Buffer moving claimant context (one node per registered handler)
+ */
+struct hubbub_inputstream_bm_handler {
+	hubbub_inputstream_buffermoved handler;	/**< Handler function */
+	void *pw;				/**< Client private data */
+	/* Doubly-linked list links; the list head is stream->handlers */
+	struct hubbub_inputstream_bm_handler *next;
+	struct hubbub_inputstream_bm_handler *prev;
+};
+
+extern hubbub_streamhandler utf8stream;
+/* Stream implementations, searched in order; NULL terminates the table */
+static hubbub_streamhandler *handler_table[] = {
+	&utf8stream,
+	NULL
+};
+
+/**
+ * Create an input stream
+ *
+ * \param enc      Document charset, or NULL to autodetect
+ * \param int_enc  Desired encoding of document
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to stream instance, or NULL on failure
+ */
+hubbub_inputstream *hubbub_inputstream_create(const char *enc,
+		const char *int_enc, hubbub_alloc alloc, void *pw)
+{
+	hubbub_inputstream *result;
+	hubbub_streamhandler **entry = handler_table;
+
+	if (int_enc == NULL || alloc == NULL)
+		return NULL;
+
+	/* Walk the NULL-terminated table until an implementation claims
+	 * the requested internal encoding */
+	while (*entry != NULL && !(*entry)->uses_encoding(int_enc))
+		entry++;
+
+	/* Ran off the end of the table: nothing handles this encoding */
+	if (*entry == NULL)
+		return NULL;
+
+	/* Let the implementation build the stream, then fill in the
+	 * fields which are common to every implementation */
+	result = (*entry)->create(enc, int_enc, alloc, pw);
+	if (result != NULL) {
+		result->handlers = NULL;
+
+		result->alloc = alloc;
+		result->pw = pw;
+	}
+
+	return result;
+}
+
+/**
+ * Destroy an input stream
+ *
+ * \param stream  Input stream to destroy
+ */
+void hubbub_inputstream_destroy(hubbub_inputstream *stream)
+{
+	hubbub_inputstream_bm_handler *cur, *following;
+
+	if (stream == NULL)
+		return;
+
+	cur = stream->handlers;
+	while (cur != NULL) {
+		following = cur->next;	/* read before cur is freed */
+		stream->alloc(cur, 0, stream->pw);
+		cur = following;
+	}
+
+	stream->destroy(stream);
+}
+
+/**
+ * Append data to an input stream
+ *
+ * \param stream  Input stream to append data to
+ * \param data    Data to append (in document charset), or NULL to flag EOF
+ * \param len     Length, in bytes, of data
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	/* data is deliberately not checked: NULL is how EOF is flagged */
+	if (stream == NULL)
+		return HUBBUB_BADPARM;
+
+	/* A claimed buffer cannot be appended to; otherwise defer to
+	 * the stream implementation */
+	return (stream->buffer == NULL) ? HUBBUB_INVALID
+			: stream->append(stream, data, len);
+}
+
+/**
+ * Insert data into stream at current location
+ *
+ * \param stream  Input stream to insert into
+ * \param data    Data to insert (UTF-8 encoded)
+ * \param len     Length, in bytes, of data
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	/* Both the stream and the data to insert are mandatory */
+	if (stream == NULL || data == NULL)
+		return HUBBUB_BADPARM;
+
+	/* A claimed buffer cannot be inserted into; otherwise defer to
+	 * the stream implementation */
+	return (stream->buffer == NULL) ? HUBBUB_INVALID
+			: stream->insert(stream, data, len);
+}
+
+/**
+ * Look at the next character in the stream
+ *
+ * \param stream  Stream to look in
+ * \return UCS4 (host-endian) character code, or EOF or OOD.
+ */
+uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream)
+{
+	/* It is illegal to call this after the buffer has been disowned */
+	if (stream == NULL || stream->buffer == NULL)
+		return HUBBUB_INPUTSTREAM_OOD;
+
+	return stream->peek(stream);
+}
+
+/**
+ * Retrieve the byte index and length of the current character in the stream
+ *
+ * \param stream  Stream to look in
+ * \param len     Pointer to location to receive byte length of character
+ * \return Byte index of current character from start of stream,
+ *         or (uint32_t) -1 on error
+ */
+uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream,
+		size_t *len)
+{
+	/* Needs an output location and a stream that still owns its buffer */
+	if (stream != NULL && len != NULL && stream->buffer != NULL)
+		return stream->cur_pos(stream, len);
+
+	return (uint32_t) -1;
+}
+
+/**
+ * Convert the current character to lower case
+ *
+ * \param stream  Stream to look in
+ */
+void hubbub_inputstream_lowercase(hubbub_inputstream *stream)
+{
+	/* Nothing to convert without a stream that still owns its buffer;
+	 * otherwise the work is delegated to the stream implementation */
+	if (stream != NULL && stream->buffer != NULL)
+		stream->lowercase(stream);
+}
+
+/**
+ * Convert the current character to upper case
+ *
+ * \param stream  Stream to look in
+ */
+void hubbub_inputstream_uppercase(hubbub_inputstream *stream)
+{
+	/* Nothing to convert without a stream that still owns its buffer;
+	 * otherwise the work is delegated to the stream implementation */
+	if (stream != NULL && stream->buffer != NULL)
+		stream->uppercase(stream);
+}
+
+/**
+ * Advance the stream's current position
+ *
+ * \param stream  The stream whose position to advance
+ */
+void hubbub_inputstream_advance(hubbub_inputstream *stream)
+{
+	/* It is illegal to call this after the buffer has been disowned */
+	if (stream == NULL || stream->buffer == NULL)
+		return;
+
+	/* With the cursor already at the end of the buffered data there
+	 * is nothing to step over, so the implementation is not called */
+	if (stream->cursor != stream->buffer_len)
+		stream->advance(stream);
+}
+
+/**
+ * Push a character back onto the stream
+ *
+ * \param stream     Stream to push back to
+ * \param character  UCS4 (host-endian) codepoint to push back
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * Note that this doesn't actually modify the data in the stream.
+ * It works by ensuring that the character located just before the
+ * current stream location is the same as ::character. If it is,
+ * then the stream pointer is moved back. If it is not, then an
+ * error is returned and the stream pointer remains unmodified.
+ */
+hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream,
+		uint32_t character)
+{
+	/* It is illegal to call this after the buffer has been disowned */
+	if (stream == NULL || stream->buffer == NULL)
+		return HUBBUB_BADPARM;
+
+	/* At offset zero there is no preceding character to move back
+	 * over; otherwise the decision is left to the implementation */
+	return (stream->cursor == 0) ? HUBBUB_INVALID
+			: stream->push_back(stream, character);
+}
+
+/**
+ * Rewind the input stream by a number of bytes
+ *
+ * \param stream  Stream to rewind
+ * \param n       Number of bytes to go back
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n)
+{
+	if (stream == NULL || stream->buffer == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Refuse to step back past the start of the buffer */
+	if (n > stream->cursor)
+		return HUBBUB_INVALID;
+
+	stream->cursor -= n;
+	return HUBBUB_OK;
+}
+
+/**
+ * Claim ownership of an input stream's buffer
+ *
+ * \param stream  Input stream whose buffer to claim
+ * \param buffer  Pointer to location to receive buffer pointer
+ * \param len     Pointer to location to receive byte length of buffer
+ * \return HUBBUB_OK on success, appropriate error otherwise.
+ *
+ * Once the buffer has been claimed by a client, the input stream disclaims
+ * all ownership rights (and invalidates any internal references it may have
+ * to the buffer). Therefore, the only input stream call which may be made
+ * after calling this function is to destroy the input stream. Therefore,
+ * unless the stream pointer is located at EOF, this call will return an
+ * error.
+ */
+hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream,
+		uint8_t **buffer, size_t *len)
+{
+	if (stream == NULL || buffer == NULL || len == NULL)
+		return HUBBUB_BADPARM;
+
+	/* EOF must have been flagged and the cursor must be at the end */
+	if (!stream->had_eof || stream->cursor != stream->buffer_len)
+		return HUBBUB_INVALID;
+
+	/* Hand the buffer over and drop the stream's reference to it */
+	*buffer = stream->buffer;
+	*len = stream->buffer_len;
+	stream->buffer = NULL;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Register interest in buffer moved events
+ *
+ * \param stream   Input stream to register interest with
+ * \param handler  Pointer to handler function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_inputstream_register_movehandler(
+		hubbub_inputstream *stream,
+		hubbub_inputstream_buffermoved handler, void *pw)
+{
+	hubbub_inputstream_bm_handler *entry, *old_head;
+
+	if (stream == NULL || handler == NULL)
+		return HUBBUB_BADPARM;
+
+	entry = stream->alloc(NULL, sizeof(*entry), stream->pw);
+	if (entry == NULL)
+		return HUBBUB_NOMEM;
+
+	entry->handler = handler;
+	entry->pw = pw;
+
+	/* Push the new registration onto the front of the list */
+	old_head = stream->handlers;
+	entry->prev = NULL;
+	entry->next = old_head;
+	if (old_head != NULL)
+		old_head->prev = entry;
+	stream->handlers = entry;
+
+	/* And notify claimant of current buffer location */
+	handler(stream->buffer, stream->buffer_len, pw);
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Deregister interest in buffer moved events
+ *
+ * \param stream   Input stream to deregister from
+ * \param handler  Pointer to handler function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_inputstream_deregister_movehandler(
+		hubbub_inputstream *stream,
+		hubbub_inputstream_buffermoved handler, void *pw)
+{
+	hubbub_inputstream_bm_handler *entry;
+
+	if (stream == NULL || handler == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Find the first registration matching both function and context */
+	entry = stream->handlers;
+	while (entry != NULL && (entry->handler != handler || entry->pw != pw))
+		entry = entry->next;
+	if (entry == NULL)
+		return HUBBUB_INVALID;
+
+	/* Unlink it; an entry with no predecessor is the list head */
+	if (entry->prev != NULL)
+		entry->prev->next = entry->next;
+	else
+		stream->handlers = entry->next;
+	if (entry->next != NULL)
+		entry->next->prev = entry->prev;
+
+	stream->alloc(entry, 0, stream->pw);
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Case insensitively compare a pair of ranges in the input stream
+ *
+ * \param stream  Input stream to look in
+ * \param r1      Offset of start of first range
+ * \param r2      Offset of start of second range
+ * \param len     Byte length of ranges
+ * \return 0 if ranges match, non-zero otherwise
+ */
+int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len)
+{
+	const int usable = (stream != NULL && stream->buffer != NULL);
+
+	/* 1 is an arbitrary non-zero ("no match") value */
+	return usable ? stream->cmp_range_ci(stream, r1, r2, len) : 1;
+}
+
+/**
+ * Case sensitively compare a pair of ranges in the input stream
+ *
+ * \param stream  Input stream to look in
+ * \param r1      Offset of start of first range
+ * \param r2      Offset of start of second range
+ * \param len     Byte length of ranges
+ * \return 0 if ranges match, non-zero otherwise
+ */
+int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len)
+{
+	const int usable = (stream != NULL && stream->buffer != NULL);
+
+	/* 1 is an arbitrary non-zero ("no match") value */
+	return usable ? stream->cmp_range_cs(stream, r1, r2, len) : 1;
+}
+
+/**
+ * Case sensitively compare a range of input stream against an ASCII string
+ *
+ * \param stream  Input stream to look in
+ * \param off     Offset of range start
+ * \param len     Byte length of range
+ * \param data    Comparison string
+ * \param dlen    Byte length of comparison string
+ * \return 0 if match, non-zero otherwise
+ */
+int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream,
+		uint32_t off, size_t len, const char *data, size_t dlen)
+{
+	const int usable = (stream != NULL && stream->buffer != NULL);
+	/* 1 is an arbitrary non-zero ("no match") value */
+	return !usable ? 1 :
+		stream->cmp_range_ascii(stream, off, len, data, dlen);
+}
+
+/**
+ * Replace a range of bytes in the input stream with a single character
+ *
+ * \param stream  Input stream containing data
+ * \param start   Offset of start of range to replace
+ * \param len     Length (in bytes) of range to replace
+ * \param ucs4    UCS4 (host endian) encoded replacement character
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream,
+		uint32_t start, size_t len, uint32_t ucs4)
+{
+	if (stream == NULL || stream->buffer == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Range must lie at or after the cursor and wholly within the
+	 * buffer (subtraction form avoids overflow of start + len) */
+	if (start < stream->cursor || start >= stream->buffer_len ||
+			len > stream->buffer_len - start)
+		return HUBBUB_INVALID;
+
+	return stream->replace_range(stream, start, len, ucs4);
+}
+
+/**
+ * Read the document charset
+ *
+ * \param stream  Input stream to query
+ * \param source  Pointer to location to receive charset source
+ * \return Pointer to charset name (constant; do not free), or NULL if unknown
+ */
+const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream,
+		hubbub_charset_source *source)
+{
+	if (stream == NULL || source == NULL)
+		return NULL;
+
+	/* The source is reported even when the charset itself is unknown */
+	*source = stream->encsrc;
+
+	/* Only translate the MIB enum to a name when a charset is known */
+	return (stream->encsrc == HUBBUB_CHARSET_UNKNOWN) ? NULL
+			: hubbub_mibenum_to_name(stream->mibenum);
+}
+
+/**
+ * Inform interested parties that the buffer has moved
+ *
+ * \param stream  Input stream
+ */
+void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream)
+{
+	hubbub_inputstream_bm_handler *entry;
+
+	/* A NULL stream has no handlers, so the loop below does nothing */
+	entry = (stream != NULL) ? stream->handlers : NULL;
+
+	for (; entry != NULL; entry = entry->next)
+		entry->handler(stream->buffer, stream->buffer_len, entry->pw);
+}
+
diff --git a/src/input/inputstream.h b/src/input/inputstream.h
new file mode 100644
index 0000000..5325d14
--- /dev/null
+++ b/src/input/inputstream.h
@@ -0,0 +1,98 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_input_inputstream_h_
+#define hubbub_input_inputstream_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+typedef struct hubbub_inputstream hubbub_inputstream;
+
+/* EOF pseudo-character */
+#define HUBBUB_INPUTSTREAM_EOF (0xFFFFFFFFU)
+/* Out-of-data indicator */
+#define HUBBUB_INPUTSTREAM_OOD (0xFFFFFFFEU)
+
+/* Type of input stream buffer moved handler function */
+typedef void (*hubbub_inputstream_buffermoved)(const uint8_t *buffer,
+		size_t len, void *pw);
+
+/* Create an input stream */
+hubbub_inputstream *hubbub_inputstream_create(const char *enc,
+		const char *int_enc, hubbub_alloc alloc, void *pw);
+/* Destroy an input stream */
+void hubbub_inputstream_destroy(hubbub_inputstream *stream);
+
+/* Append data to an input stream */
+hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream,
+		const uint8_t *data, size_t len);
+/* Insert data into stream at current location */
+hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream,
+		const uint8_t *data, size_t len);
+
+/* Look at the next character in the stream */
+uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream);
+
+/* Retrieve the byte index and length of the current character in the stream */
+uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream, size_t *len);
+
+/* Convert the current character to lowercase */
+void hubbub_inputstream_lowercase(hubbub_inputstream *stream);
+
+/* Convert the current character to uppercase */
+void hubbub_inputstream_uppercase(hubbub_inputstream *stream);
+
+/* Advance the stream's current position */
+void hubbub_inputstream_advance(hubbub_inputstream *stream);
+
+/* Push a character back onto the stream */
+hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream,
+		uint32_t character);
+
+/* Rewind the input stream by a number of bytes */
+hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n);
+
+/* Claim ownership of an input stream's buffer */
+hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream,
+		uint8_t **buffer, size_t *len);
+
+/* Register interest in buffer moved events */
+hubbub_error hubbub_inputstream_register_movehandler(
+		hubbub_inputstream *stream,
+		hubbub_inputstream_buffermoved handler, void *pw);
+
+/* Deregister interest in buffer moved events */
+hubbub_error hubbub_inputstream_deregister_movehandler(
+		hubbub_inputstream *stream,
+		hubbub_inputstream_buffermoved handler, void *pw);
+
+/* Case insensitively compare a pair of ranges in the input stream */
+int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len);
+
+/* Case sensitively compare a pair of ranges in the input stream */
+int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len);
+
+/* Case sensitively compare a range of input stream against an ASCII string */
+int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream,
+		uint32_t off, size_t len, const char *data, size_t dlen);
+
+/* Replace a range of bytes in the input stream with a single character */
+hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream,
+		uint32_t start, size_t len, uint32_t ucs4);
+
+/* Read the document charset */
+const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream,
+		hubbub_charset_source *source);
+
+#endif
+
diff --git a/src/input/streamimpl.h b/src/input/streamimpl.h
new file mode 100644
index 0000000..f44f6da
--- /dev/null
+++ b/src/input/streamimpl.h
@@ -0,0 +1,77 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_input_streamimpl_h_
+#define hubbub_input_streamimpl_h_
+
+#include <stdbool.h>
+
+#include <hubbub/types.h>
+
+#include "input/filter.h"
+#include "input/inputstream.h"
+
+typedef struct hubbub_inputstream_bm_handler hubbub_inputstream_bm_handler;
+
+/**
+ * Input stream definition: implementations extend this
+ */
+struct hubbub_inputstream {
+	uint8_t *buffer;		/**< Document buffer */
+	size_t buffer_len;		/**< Amount of data in buffer */
+	size_t buffer_alloc;		/**< Allocated size of buffer */
+
+	uint32_t cursor;		/**< Byte offset of current position */
+
+	bool had_eof;			/**< Whether EOF has been reached */
+
+	uint16_t mibenum;		/**< MIB enum for charset, or 0 */
+	hubbub_charset_source encsrc;	/**< Charset source */
+
+	hubbub_filter *input;		/**< Charset conversion filter */
+
+	hubbub_inputstream_bm_handler *handlers;	/**< List of buffer
+							 * moved handlers */
+	hubbub_alloc alloc;		/**< Memory (de)allocation function */
+	void *pw;			/**< Client private data */
+	/* Operations below are filled in by the stream implementation */
+	void (*destroy)(hubbub_inputstream *stream);
+	hubbub_error (*append)(hubbub_inputstream *stream,
+			const uint8_t *data, size_t len);
+	hubbub_error (*insert)(hubbub_inputstream *stream,
+			const uint8_t *data, size_t len);
+	uint32_t (*peek)(hubbub_inputstream *stream);
+	uint32_t (*cur_pos)(hubbub_inputstream *stream, size_t *len);
+	void (*lowercase)(hubbub_inputstream *stream);
+	void (*uppercase)(hubbub_inputstream *stream);
+	void (*advance)(hubbub_inputstream *stream);
+	hubbub_error (*push_back)(hubbub_inputstream *stream,
+			uint32_t character);
+	int (*cmp_range_ci)(hubbub_inputstream *stream, uint32_t r1,
+			uint32_t r2, size_t len);
+	int (*cmp_range_cs)(hubbub_inputstream *stream, uint32_t r1,
+			uint32_t r2, size_t len);
+	int (*cmp_range_ascii)(hubbub_inputstream *stream,
+			uint32_t off, size_t len,
+			const char *data, size_t dlen);
+	hubbub_error (*replace_range)(hubbub_inputstream *stream,
+			uint32_t start, size_t len, uint32_t ucs4);
+};
+
+/**
+ * Input stream factory component: one instance per stream implementation
+ */
+typedef struct hubbub_streamhandler {
+	bool (*uses_encoding)(const char *int_enc);	/**< Handled? */
+	hubbub_inputstream *(*create)(const char *enc, const char *int_enc,
+			hubbub_alloc alloc, void *pw);	/**< Create stream */
+} hubbub_streamhandler;
+
+/* Notification that a stream's buffer has moved (called by implementations) */
+void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream);
+
+#endif
diff --git a/src/input/utf8_stream.c b/src/input/utf8_stream.c
new file mode 100644
index 0000000..5d08993
--- /dev/null
+++ b/src/input/utf8_stream.c
@@ -0,0 +1,567 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "charset/detect.h"
+#include "input/streamimpl.h"
+#include "utils/utf8.h"
+#include "utils/utils.h"
+
+#define BUFFER_CHUNK (4096)
+
+static bool hubbub_utf8stream_uses_encoding(const char *int_enc);
+static hubbub_inputstream *hubbub_utf8stream_create(const char *enc,
+		const char *int_enc, hubbub_alloc alloc, void *pw);
+static void hubbub_utf8stream_destroy(hubbub_inputstream *stream);
+static hubbub_error hubbub_utf8stream_append(hubbub_inputstream *stream,
+		const uint8_t *data, size_t len);
+static hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream,
+		const uint8_t *data, size_t len);
+static uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream);
+static uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream,
+		size_t *len);
+static void hubbub_utf8stream_lowercase(hubbub_inputstream *stream);
+static void hubbub_utf8stream_uppercase(hubbub_inputstream *stream);
+static void hubbub_utf8stream_advance(hubbub_inputstream *stream);
+static hubbub_error hubbub_utf8stream_push_back(hubbub_inputstream *stream,
+		uint32_t character);
+static int hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len);
+static int hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len);
+static int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream,
+		uint32_t off, size_t len, const char *data, size_t dlen);
+static hubbub_error hubbub_utf8stream_replace_range(
+		hubbub_inputstream *stream,
+		uint32_t start, size_t len, uint32_t ucs4);
+
+/**
+ * Report whether this implementation handles a given internal encoding
+ * \param int_enc  Name of the desired internal encoding
+ * \return true if int_enc maps to the same MIB enum as "UTF-8", else false
+ */
+bool hubbub_utf8stream_uses_encoding(const char *int_enc)
+{
+	static const char utf8[] = "UTF-8";
+	return hubbub_mibenum_from_name(int_enc, strlen(int_enc)) ==
+			hubbub_mibenum_from_name(utf8, SLEN("UTF-8"));
+}
+
+/**
+ * Create a UTF-8 input stream
+ *
+ * \param enc      Document charset, or NULL if unknown
+ * \param int_enc  Desired encoding of document (must resolve to UTF-8)
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to stream instance, or NULL on failure
+ */
+hubbub_inputstream *hubbub_utf8stream_create(const char *enc,
+		const char *int_enc, hubbub_alloc alloc, void *pw)
+{
+	hubbub_inputstream *stream;
+
+	if (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) !=
+			hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")))
+		return NULL;
+
+	stream = alloc(NULL, sizeof(hubbub_inputstream), pw);
+	if (stream == NULL)
+		return NULL;
+
+	stream->buffer = alloc(NULL, BUFFER_CHUNK, pw);
+	if (stream->buffer == NULL) {
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	stream->buffer_len = 0;
+	stream->buffer_alloc = BUFFER_CHUNK;
+
+	stream->cursor = 0;
+
+	stream->had_eof = false;
+
+	stream->input = hubbub_filter_create(int_enc, alloc, pw);
+	if (stream->input == NULL) {
+		alloc(stream->buffer, 0, pw);
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	/* Charset is unknown until a recognised one is dictated below */
+	stream->mibenum = 0;
+	stream->encsrc = HUBBUB_CHARSET_UNKNOWN;
+	if (enc != NULL) {
+		hubbub_error error;
+		hubbub_filter_optparams params;
+
+		stream->mibenum = hubbub_mibenum_from_name(enc, strlen(enc));
+
+		if (stream->mibenum != 0) {
+			params.encoding.name = enc;
+
+			error = hubbub_filter_setopt(stream->input,
+					HUBBUB_FILTER_SET_ENCODING, &params);
+			if (error != HUBBUB_OK && error != HUBBUB_INVALID) {
+				hubbub_filter_destroy(stream->input);
+				alloc(stream->buffer, 0, pw);
+				alloc(stream, 0, pw);
+				return NULL;
+			}
+
+			stream->encsrc = HUBBUB_CHARSET_DICTATED;
+		}
+	}
+
+	stream->destroy = hubbub_utf8stream_destroy;
+	stream->append = hubbub_utf8stream_append;
+	stream->insert = hubbub_utf8stream_insert;
+	stream->peek = hubbub_utf8stream_peek;
+	stream->cur_pos = hubbub_utf8stream_cur_pos;
+	stream->lowercase = hubbub_utf8stream_lowercase;
+	stream->uppercase = hubbub_utf8stream_uppercase;
+	stream->advance = hubbub_utf8stream_advance;
+	stream->push_back = hubbub_utf8stream_push_back;
+	stream->cmp_range_ci = hubbub_utf8stream_compare_range_ci;
+	stream->cmp_range_cs = hubbub_utf8stream_compare_range_cs;
+	stream->cmp_range_ascii = hubbub_utf8stream_compare_range_ascii;
+	stream->replace_range = hubbub_utf8stream_replace_range;
+
+	return stream;
+}
+
+/**
+ * Release an input stream together with its filter and document buffer
+ *
+ * \param stream  Input stream to destroy
+ */
+void hubbub_utf8stream_destroy(hubbub_inputstream *stream)
+{
+	hubbub_alloc release = stream->alloc;
+	void *client = stream->pw;
+
+	if (stream->input != NULL)
+		hubbub_filter_destroy(stream->input);
+	if (stream->buffer != NULL)
+		release(stream->buffer, 0, client);
+
+	release(stream, 0, client);
+}
+
+/**
+ * Append data to an input stream, converting it to UTF-8 on the way in
+ *
+ * \param stream  Input stream to append data to
+ * \param data    Data to append (in document charset), or NULL to flag EOF
+ * \param len     Length, in bytes, of data
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_utf8stream_append(hubbub_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	hubbub_error error;
+	uint8_t *base;
+	size_t space;
+
+	if (data == NULL) {
+		/* EOF indicated */
+		size_t dummy_len = 0;
+		uint8_t *dummy_data = (uint8_t *) &dummy_len;
+
+		base = stream->buffer + stream->buffer_len;
+		space = stream->buffer_alloc - stream->buffer_len;
+
+		/* Forcibly flush through any remaining buffered data */
+		while ((error = hubbub_filter_process_chunk(stream->input,
+				(const uint8_t **) &dummy_data, &dummy_len,
+				&base, &space)) == HUBBUB_NOMEM) {
+			bool moved = false;
+			uint8_t *temp = stream->alloc(stream->buffer,
+					stream->buffer_alloc + BUFFER_CHUNK,
+					stream->pw);
+
+			if (temp == NULL) {
+				return HUBBUB_NOMEM;
+			}
+
+			moved = (temp != stream->buffer);
+
+			stream->buffer = temp;
+			stream->buffer_len += stream->buffer_alloc -
+					stream->buffer_len - space;
+			stream->buffer_alloc += BUFFER_CHUNK;
+
+			base = stream->buffer + stream->buffer_len;
+			space = stream->buffer_alloc - stream->buffer_len;
+
+			if (moved)
+				hubbub_inputstream_buffer_moved(stream);
+		}
+
+		/* And fix up buffer length */
+		stream->buffer_len += stream->buffer_alloc -
+				stream->buffer_len - space;
+
+		stream->had_eof = true;
+	} else {
+		/* Normal data chunk */
+
+		if (stream->mibenum == 0) {
+			/* Haven't found charset yet; detect it */
+			error = hubbub_charset_extract(&data, &len,
+					&stream->mibenum, &stream->encsrc);
+			if (error) {
+				return error;
+			}
+
+			/* We should always have a charset by now */
+			if (stream->mibenum == 0)
+				abort();
+		}
+
+		base = stream->buffer + stream->buffer_len;
+		space = stream->buffer_alloc - stream->buffer_len;
+
+		/* Convert chunk to UTF-8 */
+		while ((error = hubbub_filter_process_chunk(stream->input,
+				&data, &len,
+				&base, &space)) == HUBBUB_NOMEM) {
+			bool moved = false;
+			uint8_t *temp = stream->alloc(stream->buffer,
+					stream->buffer_alloc + BUFFER_CHUNK,
+					stream->pw);
+
+			if (temp == NULL) {
+				return HUBBUB_NOMEM;
+			}
+
+			moved = (temp != stream->buffer);
+
+			stream->buffer = temp;
+			stream->buffer_len += stream->buffer_alloc -
+					stream->buffer_len - space;
+			stream->buffer_alloc += BUFFER_CHUNK;
+
+			/* All space after the converted data is free again */
+			base = stream->buffer + stream->buffer_len;
+			space = stream->buffer_alloc - stream->buffer_len;
+
+			if (moved)
+				hubbub_inputstream_buffer_moved(stream);
+		}
+
+		/* And fix up buffer length */
+		stream->buffer_len += stream->buffer_alloc -
+				stream->buffer_len - space;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Insert data into stream at current location
+ *
+ * \param stream  Input stream to insert into
+ * \param data    Data to insert (UTF-8 encoded)
+ * \param len     Length, in bytes, of data
+ * \return HUBBUB_OK on success, HUBBUB_NOMEM if the buffer cannot be grown
+ */
+hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	size_t space;
+	uint8_t *curpos;
+
+	space = stream->buffer_alloc - stream->buffer_len;
+
+	/* Need to grow buffer, if there's insufficient space */
+	if (space <= len) {
+		/* Grow by len rounded up to whole chunks, plus a spare chunk */
+		size_t grow = ((len + BUFFER_CHUNK - 1) &
+				~((size_t) BUFFER_CHUNK - 1)) + BUFFER_CHUNK;
+		uint8_t *temp = stream->alloc(stream->buffer,
+				stream->buffer_alloc + grow, stream->pw);
+		bool moved;
+
+		if (temp == NULL)
+			return HUBBUB_NOMEM;
+
+		moved = (temp != stream->buffer);
+
+		stream->buffer = temp;
+		/* Record exactly the amount that was allocated */
+		stream->buffer_alloc += grow;
+
+		if (moved)
+			hubbub_inputstream_buffer_moved(stream);
+	}
+
+	/* Find the insertion point
+	 * (just before the next character to be read) */
+	curpos = stream->buffer + stream->cursor;
+
+	/* Move data above this point up */
+	memmove(curpos + len, curpos, stream->buffer_len - stream->cursor);
+
+	/* Copy new data into gap created by memmove */
+	memcpy(curpos, data, len);
+
+	/* Fix up buffer length */
+	stream->buffer_len += len;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Peek at the character under the cursor without moving the cursor
+ *
+ * \param stream  Stream to look in
+ * \return UCS4 (host-endian) character code, or
+ *         HUBBUB_INPUTSTREAM_EOF if input ran out and EOF has been flagged,
+ *         HUBBUB_INPUTSTREAM_OOD if more data is needed or decoding failed
+ */
+uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream)
+{
+	uint8_t *cur = stream->buffer + stream->cursor;
+	size_t avail = stream->buffer_len - stream->cursor;
+	size_t clen;
+	uint32_t ucs4;
+	hubbub_error error;
+
+	/* Nothing is buffered beyond the cursor */
+	if (avail == 0)
+		return stream->had_eof ? HUBBUB_INPUTSTREAM_EOF
+					: HUBBUB_INPUTSTREAM_OOD;
+
+	/* Decode the UTF-8 sequence starting at the cursor */
+	error = hubbub_utf8_to_ucs4(cur, avail, &ucs4, &clen);
+	if (error == HUBBUB_OK)
+		return ucs4;
+
+	/* A sequence cut short by the end of input counts as EOF */
+	if (error == HUBBUB_NEEDDATA && stream->had_eof)
+		return HUBBUB_INPUTSTREAM_EOF;
+
+	return HUBBUB_INPUTSTREAM_OOD;
+}
+
+/**
+ * Retrieve the byte index and length of the current character in the stream
+ *
+ * \param stream  Stream to look in
+ * \param len     Pointer to location to receive byte length of character
+ * \return Byte index of current character from start of stream
+ *         (the result of the length lookup itself is not checked)
+ */
+uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream, size_t *len)
+{
+	uint32_t pos = stream->cursor;
+
+	hubbub_utf8_char_byte_length(stream->buffer + pos, len);
+	return pos;
+}
+
+/**
+ * Fold the byte at the cursor to lower case, if it is an ASCII capital
+ *
+ * \param stream  Stream whose current character is modified in place
+ */
+void hubbub_utf8stream_lowercase(hubbub_inputstream *stream)
+{
+	uint8_t *cur = stream->buffer + stream->cursor;
+	if (*cur >= 'A' && *cur <= 'Z')
+		*cur += 0x0020;
+}
+
+/**
+ * Fold the byte at the cursor to upper case, if it is an ASCII small letter
+ *
+ * \param stream  Stream whose current character is modified in place
+ */
+void hubbub_utf8stream_uppercase(hubbub_inputstream *stream)
+{
+	uint8_t *cur = stream->buffer + stream->cursor;
+	if (*cur >= 'a' && *cur <= 'z')
+		*cur -= 0x0020;
+}
+
+/**
+ * Move the stream's cursor on to the start of the next character
+ *
+ * \param stream  The stream whose position to advance
+ */
+void hubbub_utf8stream_advance(hubbub_inputstream *stream)
+{
+	uint32_t next;
+
+	/* Leave the cursor alone if the next position cannot be found */
+	if (hubbub_utf8_next(stream->buffer, stream->buffer_len,
+			stream->cursor, &next) != HUBBUB_OK)
+		return;
+
+	stream->cursor = next;
+}
+
+/**
+ * Push a character back onto the stream
+ *
+ * \param stream     Stream to push back to
+ * \param character  UCS4 (host-endian) codepoint to push back
+ * \return HUBBUB_OK on success, HUBBUB_INVALID if ::character is not the
+ *         character before the cursor, appropriate error otherwise
+ *
+ * The data in the stream is never modified. Instead, the character
+ * located just before the current stream location is compared with the
+ * UTF-8 encoding of ::character. If they are the same, the stream
+ * pointer is moved back over it; if not, it is left where it was.
+ */
+hubbub_error hubbub_utf8stream_push_back(hubbub_inputstream *stream,
+		uint32_t character)
+{
+	uint8_t encoded[6];
+	size_t enclen;
+	uint32_t start;
+	hubbub_error error;
+
+	/* Locate the start of the character preceding the cursor */
+	error = hubbub_utf8_prev(stream->buffer, stream->cursor, &start);
+	if (error == HUBBUB_OK)
+		error = hubbub_utf8_from_ucs4(character, encoded, &enclen);
+	if (error != HUBBUB_OK)
+		return error;
+
+	/* Only step back if that character is exactly the one supplied */
+	if (stream->cursor - start == enclen &&
+			memcmp(stream->buffer + start, encoded, enclen) == 0) {
+		stream->cursor = start;
+		return HUBBUB_OK;
+	}
+
+	return HUBBUB_INVALID;
+}
+
+/**
+ * Compare two ranges of the input stream, ignoring case (strncasecmp)
+ *
+ * \param stream  Input stream to look in
+ * \param r1      Offset of start of first range
+ * \param r2      Offset of start of second range
+ * \param len     Byte length of ranges
+ * \return 0 if ranges match, non-zero otherwise
+ */
+int hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len)
+{
+	const char *base = (const char *) stream->buffer;
+	return strncasecmp(base + r1, base + r2, len);
+}
+
+/**
+ * Compare two ranges of the input stream byte for byte (case sensitive)
+ *
+ * \param stream  Input stream to look in
+ * \param r1      Offset of start of first range
+ * \param r2      Offset of start of second range
+ * \param len     Byte length of ranges (NUL bytes do not end the compare)
+ * \return 0 if ranges match, non-zero otherwise
+ */
+int hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len)
+{
+	const uint8_t *base = stream->buffer;
+	return memcmp(base + r1, base + r2, len);
+}
+
+/**
+ * Case sensitively compare a range of input stream against an ASCII string
+ *
+ * \param stream  Input stream to look in
+ * \param off     Offset of range start
+ * \param len     Byte length of range
+ * \param data    Comparison string
+ * \param dlen    Byte length of comparison string
+ * \return 0 if match, non-zero otherwise
+ */
+int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream,
+		uint32_t off, size_t len, const char *data, size_t dlen)
+{
+	const char *range = (const char *) stream->buffer + off;
+
+	/* Ranges of different length never match; the 1 is arbitrary */
+	if (len == dlen)
+		return strncmp(range, data, len);
+	return 1;
+}
+
+/**
+ * Replace a range of bytes in the input stream with a single character
+ *
+ * \param stream  Input stream containing data
+ * \param start   Offset of start of range to replace
+ * \param len     Length (in bytes) of range to replace
+ * \param ucs4    UCS4 (host endian) encoded replacement character
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_utf8stream_replace_range(hubbub_inputstream *stream,
+		uint32_t start, size_t len, uint32_t ucs4)
+{
+	uint8_t buf[6];
+	size_t replen;
+	int32_t diff;
+	hubbub_error error;
+
+	/* Get UTF8 version of replacement character */
+	error = hubbub_utf8_from_ucs4(ucs4, buf, &replen);
+	if (error)
+		return error;
+
+	diff = replen - len;
+
+	if (stream->buffer_len + diff >= stream->buffer_alloc) {
+		/* Need more buffer space: diff rounded up to whole chunks,
+		 * plus a spare chunk */
+		size_t grow = ((diff + BUFFER_CHUNK - 1) &
+				~((size_t) BUFFER_CHUNK - 1)) + BUFFER_CHUNK;
+		uint8_t *temp = stream->alloc(stream->buffer,
+				stream->buffer_alloc + grow, stream->pw);
+		bool moved;
+
+		if (temp == NULL)
+			return HUBBUB_NOMEM;
+
+		moved = (temp != stream->buffer);
+
+		stream->buffer = temp;
+		/* Record exactly the amount that was allocated */
+		stream->buffer_alloc += grow;
+
+		if (moved)
+			hubbub_inputstream_buffer_moved(stream);
+	}
+
+	/* Move subsequent input to correct location */
+	memmove(stream->buffer + start + len + diff,
+			stream->buffer + start + len,
+			stream->buffer_len - (start + len));
+
+	/* And fill the gap with the replacement character */
+	memcpy(stream->buffer + start, buf, replen);
+
+	/* Finally, update length */
+	stream->buffer_len += diff;
+
+	return HUBBUB_OK;
+}
+
+hubbub_streamhandler utf8stream = {
+	hubbub_utf8stream_uses_encoding,	/* uses_encoding */
+	hubbub_utf8stream_create		/* create */
+};
diff --git a/src/parser.c b/src/parser.c
new file mode 100644
index 0000000..e7a4fe8
--- /dev/null
+++ b/src/parser.c
@@ -0,0 +1,237 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <hubbub/parser.h>
+
+#include "input/inputstream.h"
+#include "tokeniser/tokeniser.h"
+
+/**
+ * Hubbub parser object: an input stream plus the tokeniser reading from it
+ */
+struct hubbub_parser {
+	hubbub_inputstream *stream;	/**< Input stream instance */
+	hubbub_tokeniser *tok;		/**< Tokeniser instance */
+
+	hubbub_alloc alloc;		/**< Memory (de)allocation function */
+	void *pw;			/**< Client data passed to alloc */
+};
+
+/**
+ * Create a hubbub parser, along with its input stream and tokeniser
+ *
+ * \param enc      Source document encoding, or NULL to autodetect
+ * \param int_enc  Desired encoding of document
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to parser instance, or NULL on error
+ */
+hubbub_parser *hubbub_parser_create(const char *enc, const char *int_enc,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_parser *parser = NULL;
+	hubbub_inputstream *stream;
+
+	if (alloc != NULL)
+		parser = alloc(NULL, sizeof(hubbub_parser), pw);
+	if (parser == NULL)
+		return NULL;
+
+	stream = hubbub_inputstream_create(enc, int_enc, alloc, pw);
+	if (stream == NULL) {
+		alloc(parser, 0, pw);
+		return NULL;
+	}
+
+	parser->tok = hubbub_tokeniser_create(stream, alloc, pw);
+	if (parser->tok == NULL) {
+		hubbub_inputstream_destroy(stream);
+		alloc(parser, 0, pw);
+		return NULL;
+	}
+
+	parser->stream = stream;
+	parser->alloc = alloc;
+	parser->pw = pw;
+
+	return parser;
+}
+
+/**
+ * Destroy a hubbub parser
+ *
+ * \param parser  Parser instance to destroy (NULL is ignored)
+ */
+void hubbub_parser_destroy(hubbub_parser *parser)
+{
+	if (parser != NULL) {
+		/* Tear down in reverse order of creation */
+		hubbub_tokeniser_destroy(parser->tok);
+		hubbub_inputstream_destroy(parser->stream);
+
+		/* Release the parser itself with the client's allocator */
+		parser->alloc(parser, 0, parser->pw);
+	}
+}
+
+/**
+ * Configure a hubbub parser by setting the matching tokeniser option
+ *
+ * \param parser  Parser instance to configure
+ * \param type    Option to set (unknown values give HUBBUB_BADPARM)
+ * \param params  Option-specific parameters
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_parser_setopt(hubbub_parser *parser,
+		hubbub_parser_opttype type, hubbub_parser_optparams *params)
+{
+	hubbub_tokeniser_opttype toktype;
+
+	if (parser == NULL || params == NULL)
+		return HUBBUB_BADPARM;
+
+	switch (type) {
+	case HUBBUB_PARSER_TOKEN_HANDLER:
+		toktype = HUBBUB_TOKENISER_TOKEN_HANDLER;
+		break;
+	case HUBBUB_PARSER_BUFFER_HANDLER:
+		toktype = HUBBUB_TOKENISER_BUFFER_HANDLER;
+		break;
+	case HUBBUB_PARSER_ERROR_HANDLER:
+		toktype = HUBBUB_TOKENISER_ERROR_HANDLER;
+		break;
+	case HUBBUB_PARSER_CONTENT_MODEL:
+		toktype = HUBBUB_TOKENISER_CONTENT_MODEL;
+		break;
+	default:
+		return HUBBUB_BADPARM;
+	}
+	return hubbub_tokeniser_setopt(parser->tok, toktype,
+			(hubbub_tokeniser_optparams *) params);
+}
+
+/**
+ * Append a chunk of document data to the input and run the tokeniser
+ *
+ * \param parser  Parser instance to use
+ * \param data    Data to parse (encoded in the input charset)
+ * \param len     Length, in bytes, of data
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if parser or data is NULL,
+ *         otherwise the error from the input stream or tokeniser
+ */
+hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
+		uint8_t *data, size_t len)
+{
+	hubbub_error error;
+
+	/* A NULL chunk is how EOF is flagged to the stream; reject it here */
+	if (parser == NULL || data == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Queue the new data, then run the tokeniser */
+	error = hubbub_inputstream_append(parser->stream, data, len);
+	if (error == HUBBUB_OK)
+		error = hubbub_tokeniser_run(parser->tok);
+
+	return error;
+}
+
+/**
+ * Pass a chunk of extraneous data to a hubbub parser for parsing: the
+ * data is inserted into the input stream and the tokeniser is then run
+ *
+ * \param parser  Parser instance to use
+ * \param data    Data to parse (encoded in internal charset)
+ * \param len     Length, in bytes, of data
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if parser or data is NULL,
+ *         otherwise the error from the input stream or tokeniser
+ */
+hubbub_error hubbub_parser_parse_extraneous_chunk(hubbub_parser *parser,
+		uint8_t *data, size_t len)
+{
+	hubbub_error error;
+
+	/** \todo In some cases, we don't actually want script-inserted
+	 * data to be parsed until later. We'll need some way of flagging
+	 * this through the public API, and the inputstream API will need
+	 * some way of marking the insertion point so that, when the
+	 * tokeniser is run, only the inserted chunk is parsed. */
+
+	if (parser == NULL || data == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Splice the data into the stream, then run the tokeniser */
+	error = hubbub_inputstream_insert(parser->stream, data, len);
+	if (error == HUBBUB_OK)
+		error = hubbub_tokeniser_run(parser->tok);
+
+	return error;
+}
+
+/**
+ * Inform the parser that the last chunk of data has been parsed
+ * (EOF is flagged on the input stream and the tokeniser is then run)
+ *
+ * \param parser  Parser to inform
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if parser is NULL,
+ *         otherwise the error from the input stream or tokeniser
+ */
+hubbub_error hubbub_parser_completed(hubbub_parser *parser)
+{
+	hubbub_error error;
+
+	if (parser == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Appending a NULL chunk flags end of input to the stream */
+	error = hubbub_inputstream_append(parser->stream, NULL, 0);
+	if (error == HUBBUB_OK)
+		error = hubbub_tokeniser_run(parser->tok);
+
+	return error;
+}
+
+/**
+ * Read the document charset, as reported by the parser's input stream
+ *
+ * \param parser  Parser instance to query
+ * \param source  Pointer to location to receive charset source
+ * \return Pointer to charset name (constant; do not free), or NULL if unknown
+ */
+const char *hubbub_parser_read_charset(hubbub_parser *parser,
+		hubbub_charset_source *source)
+{
+	if (parser != NULL && source != NULL)
+		return hubbub_inputstream_read_charset(parser->stream, source);
+
+	return NULL;
+}
+
+/**
+ * Claim ownership of the document buffer
+ *
+ * \param parser  Parser whose buffer to claim
+ * \param buffer  Pointer to location to receive buffer pointer
+ * \param len     Pointer to location to receive byte length of buffer
+ * \return HUBBUB_OK on success, appropriate error otherwise.
+ *
+ * Once the buffer has been claimed by a client, the parser disclaims
+ * all ownership rights (and invalidates any internal references it may have
+ * to the buffer). Therefore, the only parser call which may be made
+ * after calling this function is to destroy the parser.
+ */
+hubbub_error hubbub_parser_claim_buffer(hubbub_parser *parser,
+		uint8_t **buffer, size_t *len)
+{
+	if (parser != NULL && buffer != NULL && len != NULL)
+		return hubbub_inputstream_claim_buffer(parser->stream,
+				buffer, len);
+	return HUBBUB_BADPARM;
+}
diff --git a/src/tokeniser/Makefile b/src/tokeniser/Makefile
new file mode 100644
index 0000000..539625f
--- /dev/null
+++ b/src/tokeniser/Makefile
@@ -0,0 +1,53 @@
+# Makefile for libhubbub (tokeniser component)
+#
+# Toolchain is exported by top-level makefile
+#
+# Top-level makefile also exports the following variables:
+#
+# COMPONENT  Name of component
+# EXPORT     Absolute path of export directory
+# TOP        Absolute path of source tree root
+#
+# The top-level makefile requires the following targets to exist:
+#
+# clean      Clean source tree
+# debug      Create a debug binary
+# distclean  Fully clean source tree, back to pristine condition
+# export     Export distributable components to ${EXPORT}
+# release    Create a release binary
+# setup      Perform any setup required prior to compilation
+# test       Execute any test cases
+
+# Manipulate include paths (make this directory's headers findable)
+CFLAGS += -I$(CURDIR)
+
+# Objects (basenames of the C sources built from this directory)
+OBJS = entities tokeniser
+
+.PHONY: clean debug distclean export release setup test
+
+# Targets (objects are written to ../Release and ../Debug)
+release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS)))
+
+debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS)))
+
+clean:
+	-@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS}))
+	-@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS}))  
+
+distclean:
+
+setup:
+
+export:
+
+test:
+
+# Pattern rules (release objects get -DNDEBUG, debug objects get -g)
+../Release/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c ${CFLAGS} -DNDEBUG -o $@ $<
+
+../Debug/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c -g ${CFLAGS} -o $@ $<
diff --git a/src/tokeniser/entities.c b/src/tokeniser/entities.c
new file mode 100644
index 0000000..8a9acf5
--- /dev/null
+++ b/src/tokeniser/entities.c
@@ -0,0 +1,363 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include "utils/dict.h"
+#include "utils/utils.h"
+#include "tokeniser/entities.h"
+
+typedef struct hubbub_entity hubbub_entity;
+
+static const struct hubbub_entity {
+	const char *name;	/* Entity name, without the '&' and ';' */
+	uint32_t ucs4;		/* UCS4 codepoint the entity stands for */
+} entities[] = {
+	{ "AElig", 0x00C6 },
+	{ "Aacute", 0x00C1 },
+	{ "Acirc", 0x00C2 },
+	{ "Agrave", 0x00C0 },
+	{ "Alpha", 0x0391 },
+	{ "Aring", 0x00C5 },
+	{ "Atilde", 0x00C3 },
+	{ "Auml", 0x00C4 },
+	{ "Beta", 0x0392 },
+	{ "Ccedil", 0x00C7 },
+	{ "Chi", 0x03A7 },
+	{ "Dagger", 0x2021 },
+	{ "Delta", 0x0394 },
+	{ "ETH", 0x00D0 },
+	{ "Eacute", 0x00C9 },
+	{ "Ecirc", 0x00CA },
+	{ "Egrave", 0x00C8 },
+	{ "Epsilon", 0x0395 },
+	{ "Eta", 0x0397 },
+	{ "Euml", 0x00CB },
+	{ "Gamma", 0x0393 },
+	{ "Iacute", 0x00CD },
+	{ "Icirc", 0x00CE },
+	{ "Igrave", 0x00CC },
+	{ "Iota", 0x0399 },
+	{ "Iuml", 0x00CF },
+	{ "Kappa", 0x039A },
+	{ "Lambda", 0x039B },
+	{ "Mu", 0x039C },
+	{ "Ntilde", 0x00D1 },
+	{ "Nu", 0x039D },
+	{ "OElig", 0x0152 },
+	{ "Oacute", 0x00D3 },
+	{ "Ocirc", 0x00D4 },
+	{ "Ograve", 0x00D2 },
+	{ "Omega", 0x03A9 },
+	{ "Omicron", 0x039F },
+	{ "Oslash", 0x00D8 },
+	{ "Otilde", 0x00D5 },
+	{ "Ouml", 0x00D6 },
+	{ "Phi", 0x03A6 },
+	{ "Pi", 0x03A0 },
+	{ "Prime", 0x2033 },
+	{ "Psi", 0x03A8 },
+	{ "Rho", 0x03A1 },
+	{ "Scaron", 0x0160 },
+	{ "Sigma", 0x03A3 },
+	{ "THORN", 0x00DE },
+	{ "Tau", 0x03A4 },
+	{ "Theta", 0x0398 },
+	{ "Uacute", 0x00DA },
+	{ "Ucirc", 0x00DB },
+	{ "Ugrave", 0x00D9 },
+	{ "Upsilon", 0x03A5 },
+	{ "Uuml", 0x00DC },
+	{ "Xi", 0x039E },
+	{ "Yacute", 0x00DD },
+	{ "Yuml", 0x0178 },
+	{ "Zeta", 0x0396 },
+	{ "aacute", 0x00E1 },
+	{ "acirc", 0x00E2 },
+	{ "acute", 0x00B4 },
+	{ "aelig", 0x00E6 },
+	{ "agrave", 0x00E0 },
+	{ "alefsym", 0x2135 },
+	{ "alpha", 0x03B1 },
+	{ "amp", 0x0026 },
+	{ "AMP", 0x0026 },
+	{ "and", 0x2227 },
+	{ "ang", 0x2220 },
+	{ "apos", 0x0027 },
+	{ "aring", 0x00E5 },
+	{ "asymp", 0x2248 },
+	{ "atilde", 0x00E3 },
+	{ "auml", 0x00E4 },
+	{ "bdquo", 0x201E },
+	{ "beta", 0x03B2 },
+	{ "brvbar", 0x00A6 },
+	{ "bull", 0x2022 },
+	{ "cap", 0x2229 },
+	{ "ccedil", 0x00E7 },
+	{ "cedil", 0x00B8 },
+	{ "cent", 0x00A2 },
+	{ "chi", 0x03C7 },
+	{ "circ", 0x02C6 },
+	{ "clubs", 0x2663 },
+	{ "cong", 0x2245 },
+	{ "copy", 0x00A9 },
+	{ "COPY", 0x00A9 },
+	{ "crarr", 0x21B5 },
+	{ "cup", 0x222A },
+	{ "curren", 0x00A4 },
+	{ "dArr", 0x21D3 },
+	{ "dagger", 0x2020 },
+	{ "darr", 0x2193 },
+	{ "deg", 0x00B0 },
+	{ "delta", 0x03B4 },
+	{ "diams", 0x2666 },
+	{ "divide", 0x00F7 },
+	{ "eacute", 0x00E9 },
+	{ "ecirc", 0x00EA },
+	{ "egrave", 0x00E8 },
+	{ "empty", 0x2205 },
+	{ "emsp", 0x2003 },
+	{ "ensp", 0x2002 },
+	{ "epsilon", 0x03B5 },
+	{ "equiv", 0x2261 },
+	{ "eta", 0x03B7 },
+	{ "eth", 0x00F0 },
+	{ "euml", 0x00EB },
+	{ "euro", 0x20AC },
+	{ "exist", 0x2203 },
+	{ "fnof", 0x0192 },
+	{ "forall", 0x2200 },
+	{ "frac12", 0x00BD },
+	{ "frac14", 0x00BC },
+	{ "frac34", 0x00BE },
+	{ "frasl", 0x2044 },
+	{ "gamma", 0x03B3 },
+	{ "ge", 0x2265 },
+	{ "gt", 0x003E },
+	{ "GT", 0x003E },
+	{ "hArr", 0x21D4 },
+	{ "harr", 0x2194 },
+	{ "hearts", 0x2665 },
+	{ "hellip", 0x2026 },
+	{ "iacute", 0x00ED },
+	{ "icirc", 0x00EE },
+	{ "iexcl", 0x00A1 },
+	{ "igrave", 0x00EC },
+	{ "image", 0x2111 },
+	{ "infin", 0x221E },
+	{ "int", 0x222B },
+	{ "iota", 0x03B9 },
+	{ "iquest", 0x00BF },
+	{ "isin", 0x2208 },
+	{ "iuml", 0x00EF },
+	{ "kappa", 0x03BA },
+	{ "lArr", 0x21D0 },
+	{ "lambda", 0x03BB },
+	{ "lang", 0x2329 },
+	{ "laquo", 0x00AB },
+	{ "larr", 0x2190 },
+	{ "lceil", 0x2308 },
+	{ "ldquo", 0x201C },
+	{ "le", 0x2264 },
+	{ "lfloor", 0x230A },
+	{ "lowast", 0x2217 },
+	{ "loz", 0x25CA },
+	{ "lrm", 0x200E },
+	{ "lsaquo", 0x2039 },
+	{ "lsquo", 0x2018 },
+	{ "lt", 0x003C },
+	{ "LT", 0x003C },
+	{ "macr", 0x00AF },
+	{ "mdash", 0x2014 },
+	{ "micro", 0x00B5 },
+	{ "middot", 0x00B7 },
+	{ "minus", 0x2212 },
+	{ "mu", 0x03BC },
+	{ "nabla", 0x2207 },
+	{ "nbsp", 0x00A0 },
+	{ "ndash", 0x2013 },
+	{ "ne", 0x2260 },
+	{ "ni", 0x220B },
+	{ "not", 0x00AC },
+	{ "notin", 0x2209 },
+	{ "nsub", 0x2284 },
+	{ "ntilde", 0x00F1 },
+	{ "nu", 0x03BD },
+	{ "oacute", 0x00F3 },
+	{ "ocirc", 0x00F4 },
+	{ "oelig", 0x0153 },
+	{ "ograve", 0x00F2 },
+	{ "oline", 0x203E },
+	{ "omega", 0x03C9 },
+	{ "omicron", 0x03BF },
+	{ "oplus", 0x2295 },
+	{ "or", 0x2228 },
+	{ "ordf", 0x00AA },
+	{ "ordm", 0x00BA },
+	{ "oslash", 0x00F8 },
+	{ "otilde", 0x00F5 },
+	{ "otimes", 0x2297 },
+	{ "ouml", 0x00F6 },
+	{ "para", 0x00B6 },
+	{ "part", 0x2202 },
+	{ "permil", 0x2030 },
+	{ "perp", 0x22A5 },
+	{ "phi", 0x03C6 },
+	{ "pi", 0x03C0 },
+	{ "piv", 0x03D6 },
+	{ "plusmn", 0x00B1 },
+	{ "pound", 0x00A3 },
+	{ "prime", 0x2032 },
+	{ "prod", 0x220F },
+	{ "prop", 0x221D },
+	{ "psi", 0x03C8 },
+	{ "quot", 0x0022 },
+	{ "QUOT", 0x0022 },
+	{ "rArr", 0x21D2 },
+	{ "radic", 0x221A },
+	{ "rang", 0x232A },
+	{ "raquo", 0x00BB },
+	{ "rarr", 0x2192 },
+	{ "rceil", 0x2309 },
+	{ "rdquo", 0x201D },
+	{ "real", 0x211C },
+	{ "reg", 0x00AE },
+	{ "REG", 0x00AE },
+	{ "rfloor", 0x230B },
+	{ "rho", 0x03C1 },
+	{ "rlm", 0x200F },
+	{ "rsaquo", 0x203A },
+	{ "rsquo", 0x2019 },
+	{ "sbquo", 0x201A },
+	{ "scaron", 0x0161 },
+	{ "sdot", 0x22C5 },
+	{ "sect", 0x00A7 },
+	{ "shy", 0x00AD },
+	{ "sigma", 0x03C3 },
+	{ "sigmaf", 0x03C2 },
+	{ "sim", 0x223C },
+	{ "spades", 0x2660 },
+	{ "sub", 0x2282 },
+	{ "sube", 0x2286 },
+	{ "sum", 0x2211 },
+	{ "sup", 0x2283 },
+	{ "sup1", 0x00B9 },
+	{ "sup2", 0x00B2 },
+	{ "sup3", 0x00B3 },
+	{ "supe", 0x2287 },
+	{ "szlig", 0x00DF },
+	{ "tau", 0x03C4 },
+	{ "there4", 0x2234 },
+	{ "theta", 0x03B8 },
+	{ "thetasym", 0x03D1 },
+	{ "thinsp", 0x2009 },
+	{ "thorn", 0x00FE },
+	{ "tilde", 0x02DC },
+	{ "times", 0x00D7 },
+	{ "trade", 0x2122 },
+	{ "uArr", 0x21D1 },
+	{ "uacute", 0x00FA },
+	{ "uarr", 0x2191 },
+	{ "ucirc", 0x00FB },
+	{ "ugrave", 0x00F9 },
+	{ "uml", 0x00A8 },
+	{ "upsih", 0x03D2 },
+	{ "upsilon", 0x03C5 },
+	{ "uuml", 0x00FC },
+	{ "weierp", 0x2118 },
+	{ "xi", 0x03BE },
+	{ "yacute", 0x00FD },
+	{ "yen", 0x00A5 },
+	{ "yuml", 0x00FF },
+	{ "zeta", 0x03B6 },
+	{ "zwj", 0x200D },
+	{ "zwnj", 0x200C },
+};
+
+static hubbub_dict *dict;
+
+/**
+ * Create the entities dictionary and fill it from the entities table
+ *
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_entities_create(hubbub_alloc alloc, void *pw)
+{
+	hubbub_error error;
+	size_t i;
+
+	if (alloc == NULL)
+		return HUBBUB_BADPARM;
+
+	dict = hubbub_dict_create(alloc, pw);
+	if (dict == NULL)
+		return HUBBUB_NOMEM;
+
+	for (i = 0; i < sizeof(entities) / sizeof(entities[0]); i++) {
+		error = hubbub_dict_insert(dict, entities[i].name,
+				&entities[i]);
+		if (error != HUBBUB_OK) {
+			hubbub_dict_destroy(dict);
+			dict = NULL;	/* don't leave a dangling pointer */
+			return error;
+		}
+	}
+	return HUBBUB_OK;
+}
+
+/**
+ * Destroy the entities dictionary, if one exists, and forget about it
+ * \param alloc  Memory (de)allocation function (unused)
+ * \param pw     Pointer to client-specific private data (unused)
+ */
+void hubbub_entities_destroy(hubbub_alloc alloc, void *pw)
+{
+	UNUSED(alloc);
+	UNUSED(pw);
+	if (dict != NULL)
+		hubbub_dict_destroy(dict);
+	dict = NULL;
+}
+
+/**
+ * Advance a step-wise entity search in the dictionary by one character
+ *
+ * \param c        Character to look for
+ * \param result   Pointer to location to receive the entity's UCS4 value
+ * \param context  Pointer to location for search context
+ * \return HUBBUB_OK if key found,
+ *         HUBBUB_NEEDDATA if more steps are required,
+ *         HUBBUB_INVALID if nothing matches,
+ *         HUBBUB_BADPARM if result or context is NULL
+ *
+ * The value pointed to by ::context should be NULL for the first call.
+ * Thereafter, pass in the same value as returned by the previous call.
+ * The context is opaque to the caller and should not be inspected.
+ *
+ * Whenever the dictionary search returns anything other than
+ * HUBBUB_OK, the location pointed to by ::result is set to
+ * U+FFFD.
+ */
+hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result,
+		void **context)
+{
+	const void *match;
+	hubbub_error error;
+
+	if (result == NULL || context == NULL)
+		return HUBBUB_BADPARM;
+
+	error = hubbub_dict_search_step(dict, c, &match, context);
+
+	/* Anything other than a complete match yields U+FFFD */
+	if (error == HUBBUB_OK)
+		*result = ((const hubbub_entity *) match)->ucs4;
+	else
+		*result = 0xFFFD;
+
+	return error;
+}
diff --git a/src/tokeniser/entities.h b/src/tokeniser/entities.h
new file mode 100644
index 0000000..efd1987
--- /dev/null
+++ b/src/tokeniser/entities.h
@@ -0,0 +1,25 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_tokeniser_entities_h_
+#define hubbub_tokeniser_entities_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+
+/* Create the entities dictionary */
+hubbub_error hubbub_entities_create(hubbub_alloc alloc, void *pw);
+/* Destroy the entities dictionary */
+void hubbub_entities_destroy(hubbub_alloc alloc, void *pw);
+
+/* Step-wise search for an entity in the dictionary */
+hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result,
+		void **context);
+
+#endif
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
new file mode 100644
index 0000000..f8b6bb3
--- /dev/null
+++ b/src/tokeniser/tokeniser.c
@@ -0,0 +1,2282 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "utils/utils.h"
+
+#include "tokeniser/entities.h"
+#include "tokeniser/tokeniser.h"
+
+/**
+ * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
+ *
+ * Entry n holds the UCS4 value for codepoint 128 + n. Several entries map
+ * to U+FFFD (the replacement character).
+ */
+static const uint32_t cp1252Table[32] = {
+	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
+	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
+};
+
+/**
+ * Tokeniser states
+ *
+ * hubbub_tokeniser_run() dispatches on the current state: each enumerator
+ * has a correspondingly named hubbub_tokeniser_handle_*() function. A newly
+ * created tokeniser starts in HUBBUB_TOKENISER_STATE_DATA.
+ */
+typedef enum hubbub_tokeniser_state {
+	HUBBUB_TOKENISER_STATE_DATA,
+	HUBBUB_TOKENISER_STATE_ENTITY_DATA,
+	HUBBUB_TOKENISER_STATE_TAG_OPEN,
+	HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN,
+	HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH,
+	HUBBUB_TOKENISER_STATE_TAG_NAME,
+	HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME,
+	HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME,
+	HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME,
+	HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE,
+	HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ,
+	HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ,
+	HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ,
+	HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE,
+	HUBBUB_TOKENISER_STATE_BOGUS_COMMENT,
+	HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN,
+	HUBBUB_TOKENISER_STATE_COMMENT_START,
+	HUBBUB_TOKENISER_STATE_COMMENT,
+	HUBBUB_TOKENISER_STATE_COMMENT_DASH,
+	HUBBUB_TOKENISER_STATE_COMMENT_END,
+	HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE,
+	HUBBUB_TOKENISER_STATE_DOCTYPE,
+	HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME,
+	HUBBUB_TOKENISER_STATE_DOCTYPE_NAME,
+	HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME,
+	HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE,
+	HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY,
+	HUBBUB_TOKENISER_STATE_NAMED_ENTITY
+} hubbub_tokeniser_state;
+
+/**
+ * Context for tokeniser
+ *
+ * Zero-filled by hubbub_tokeniser_create(). The hubbub_string members
+ * describe a span of input by offset (data_off) and length (len).
+ */
+typedef struct hubbub_tokeniser_context {
+	hubbub_token_type current_tag_type;	/**< Type of current_tag */
+	hubbub_tag current_tag;			/**< Current tag; its
+						 * attributes array is freed
+						 * when the tokeniser is
+						 * destroyed */
+
+	hubbub_string current_comment;		/**< Current comment */
+
+	hubbub_doctype current_doctype;		/**< Current doctype */
+
+	hubbub_string current_chars;		/**< Pending characters */
+
+	hubbub_tokeniser_state prev_state;	/**< Previous state */
+
+	struct {
+		hubbub_string tag;		/**< Pending close tag */
+	} close_tag_match;
+
+	struct {
+		uint32_t count;			/**< Index into "DOCTYPE" */
+	} match_doctype;
+
+	struct {
+		hubbub_string str;		/**< Pending string */
+		uint8_t base;			/**< Base for numeric
+						 * entities */
+		uint32_t codepoint;		/**< UCS4 codepoint */
+		bool had_data;			/**< Whether we read
+						 * anything after &#(x)? */
+		hubbub_tokeniser_state return_state;	/**< State we were
+							 * called from */
+		bool complete;			/**< Flag that entity
+						 * matching completed */
+		bool done_setup;		/**< Flag that match setup
+						 * has completed */
+		void *context;			/**< Context for named
+						 * entity search */
+		size_t prev_len;		/**< Previous byte length
+						 * of str */
+	} match_entity;
+
+	struct {
+		uint32_t line;			/**< Current line of input */
+		uint32_t col;			/**< Current character in
+						 * line */
+	} position;
+} hubbub_tokeniser_context;
+
+/**
+ * Tokeniser data structure
+ *
+ * The three handler/pw pairs and the content model are configured through
+ * hubbub_tokeniser_setopt(); all handlers are NULL after creation.
+ */
+struct hubbub_tokeniser {
+	hubbub_tokeniser_state state;	/**< Current tokeniser state */
+	hubbub_content_model content_model;	/**< Current content
+						 * model flag */
+
+	hubbub_inputstream *input;	/**< Input stream */
+
+	const uint8_t *input_buffer;	/**< Start of input stream's buffer */
+	size_t input_buffer_len;	/**< Length of input buffer */
+
+	hubbub_tokeniser_context context;	/**< Tokeniser context */
+
+	hubbub_token_handler token_handler;	/**< Token handler */
+	void *token_pw;			/**< Client data for token handler */
+
+	hubbub_buffer_handler buffer_handler;	/**< Buffer handler */
+	void *buffer_pw;		/**< Client data for buffer handler */
+
+	hubbub_error_handler error_handler;	/**< Error handler */
+	void *error_pw;			/**< Client data for error handler */
+
+	hubbub_alloc alloc;		/**< Memory (de)allocation function */
+	void *alloc_pw;			/**< Client private data */
+};
+
+static bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_close_tag_open(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_close_tag_match(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_before_attribute_name(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_attribute_name(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_after_attribute_name(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_before_attribute_value(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_attribute_value_dq(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_attribute_value_sq(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_attribute_value_uq(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_entity_in_attribute_value(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_bogus_comment(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_markup_declaration_open(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_comment_start(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_comment_dash(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_match_doctype(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_before_doctype_name(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_doctype_name(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_after_doctype_name(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_bogus_doctype(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_numbered_entity(
+		hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_named_entity(
+		hubbub_tokeniser *tokeniser);
+static void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer,
+		size_t len, void *pw);
+static void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
+		hubbub_token *token);
+
+/**
+ * Create a hubbub tokeniser
+ *
+ * The new tokeniser starts in the data state with the PCDATA content
+ * model, has no handlers installed, and is registered with the input
+ * stream so that it is told when the stream's buffer moves.
+ *
+ * \param input  Input stream instance
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return Pointer to tokeniser instance, or NULL on failure
+ */
+hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_tokeniser *tokeniser;
+
+	if (alloc == NULL || input == NULL)
+		return NULL;
+
+	tokeniser = alloc(NULL, sizeof(*tokeniser), pw);
+	if (tokeniser == NULL)
+		return NULL;
+
+	/* Allocator details, needed again at destruction time */
+	tokeniser->alloc = alloc;
+	tokeniser->alloc_pw = pw;
+
+	/* Initial tokenisation state */
+	tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
+
+	/* Input stream; its buffer location is not yet known */
+	tokeniser->input = input;
+	tokeniser->input_buffer = NULL;
+	tokeniser->input_buffer_len = 0;
+
+	/* No client handlers until hubbub_tokeniser_setopt() is called */
+	tokeniser->token_handler = NULL;
+	tokeniser->token_pw = NULL;
+	tokeniser->buffer_handler = NULL;
+	tokeniser->buffer_pw = NULL;
+	tokeniser->error_handler = NULL;
+	tokeniser->error_pw = NULL;
+
+	if (hubbub_inputstream_register_movehandler(input,
+			hubbub_tokeniser_buffer_moved_handler, tokeniser) ==
+			HUBBUB_OK) {
+		memset(&tokeniser->context, 0, sizeof(tokeniser->context));
+
+		return tokeniser;
+	}
+
+	/* Registration failed: give the memory back */
+	alloc(tokeniser, 0, pw);
+
+	return NULL;
+}
+
+/**
+ * Destroy a hubbub tokeniser
+ *
+ * Deregisters the tokeniser from its input stream, then releases the
+ * current tag's attribute array (if any) and the tokeniser itself.
+ * Passing NULL is a no-op.
+ *
+ * \param tokeniser  The tokeniser instance to destroy
+ */
+void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
+{
+	hubbub_attribute *attrs;
+
+	if (tokeniser == NULL)
+		return;
+
+	/* Stop the input stream notifying us of buffer moves */
+	hubbub_inputstream_deregister_movehandler(tokeniser->input,
+			hubbub_tokeniser_buffer_moved_handler, tokeniser);
+
+	/* Release the attribute array of the tag under construction */
+	attrs = tokeniser->context.current_tag.attributes;
+	if (attrs != NULL)
+		tokeniser->alloc(attrs, 0, tokeniser->alloc_pw);
+
+	/* Finally, release the tokeniser itself */
+	tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw);
+}
+
+/**
+ * Configure a hubbub tokeniser
+ *
+ * \param tokeniser  The tokeniser instance to configure
+ * \param type       The option type to set
+ * \param params     Option-specific parameters
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * Installing a buffer handler causes it to be called immediately with the
+ * tokeniser's current view of the input buffer. A NULL buffer handler is
+ * stored but never called.
+ *
+ * An option type that is not recognised is ignored and HUBBUB_OK is
+ * returned.
+ */
+hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
+		hubbub_tokeniser_opttype type,
+		hubbub_tokeniser_optparams *params)
+{
+	if (tokeniser == NULL || params == NULL)
+		return HUBBUB_BADPARM;
+
+	switch (type) {
+	case HUBBUB_TOKENISER_TOKEN_HANDLER:
+		tokeniser->token_handler = params->token_handler.handler;
+		tokeniser->token_pw = params->token_handler.pw;
+		break;
+	case HUBBUB_TOKENISER_BUFFER_HANDLER:
+		tokeniser->buffer_handler = params->buffer_handler.handler;
+		tokeniser->buffer_pw = params->buffer_handler.pw;
+		/* Bring the new handler up to date with the buffer's
+		 * current location. The handler may legitimately be NULL
+		 * (nothing installed), in which case there is nobody to
+		 * inform and calling through it would crash. */
+		if (tokeniser->buffer_handler != NULL) {
+			tokeniser->buffer_handler(tokeniser->input_buffer,
+					tokeniser->input_buffer_len,
+					tokeniser->buffer_pw);
+		}
+		break;
+	case HUBBUB_TOKENISER_ERROR_HANDLER:
+		tokeniser->error_handler = params->error_handler.handler;
+		tokeniser->error_pw = params->error_handler.pw;
+		break;
+	case HUBBUB_TOKENISER_CONTENT_MODEL:
+		tokeniser->content_model = params->content_model.model;
+		break;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Process remaining data in the input stream
+ *
+ * Repeatedly invokes the handler for the tokeniser's current state. Each
+ * handler returns whether tokenising should continue; the loop ends as soon
+ * as one of them returns false.
+ *
+ * \param tokeniser  The tokeniser instance to invoke
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *         (HUBBUB_BADPARM if ::tokeniser is NULL)
+ */
+hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
+{
+	bool cont = true;
+
+	if (tokeniser == NULL)
+		return HUBBUB_BADPARM;
+
+	while (cont) {
+		/* One case per hubbub_tokeniser_state enumerator; handlers
+		 * change tokeniser->state themselves */
+		switch (tokeniser->state) {
+		case HUBBUB_TOKENISER_STATE_DATA:
+			cont = hubbub_tokeniser_handle_data(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_ENTITY_DATA:
+			cont = hubbub_tokeniser_handle_entity_data(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_TAG_OPEN:
+			cont = hubbub_tokeniser_handle_tag_open(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN:
+			cont = hubbub_tokeniser_handle_close_tag_open(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH:
+			cont = hubbub_tokeniser_handle_close_tag_match(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_TAG_NAME:
+			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME:
+			cont = hubbub_tokeniser_handle_before_attribute_name(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME:
+			cont = hubbub_tokeniser_handle_attribute_name(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME:
+			cont = hubbub_tokeniser_handle_after_attribute_name(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE:
+			cont = hubbub_tokeniser_handle_before_attribute_value(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ:
+			cont = hubbub_tokeniser_handle_attribute_value_dq(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ:
+			cont = hubbub_tokeniser_handle_attribute_value_sq(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ:
+			cont = hubbub_tokeniser_handle_attribute_value_uq(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE:
+			cont = hubbub_tokeniser_handle_entity_in_attribute_value(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_BOGUS_COMMENT:
+			cont = hubbub_tokeniser_handle_bogus_comment(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN:
+			cont = hubbub_tokeniser_handle_markup_declaration_open(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_COMMENT_START:
+			cont = hubbub_tokeniser_handle_comment_start(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_COMMENT:
+			cont = hubbub_tokeniser_handle_comment(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_COMMENT_DASH:
+			cont = hubbub_tokeniser_handle_comment_dash(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_COMMENT_END:
+			cont = hubbub_tokeniser_handle_comment_end(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE:
+			cont = hubbub_tokeniser_handle_match_doctype(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_DOCTYPE:
+			cont = hubbub_tokeniser_handle_doctype(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME:
+			cont = hubbub_tokeniser_handle_before_doctype_name(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_DOCTYPE_NAME:
+			cont = hubbub_tokeniser_handle_doctype_name(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME:
+			cont = hubbub_tokeniser_handle_after_doctype_name(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE:
+			cont = hubbub_tokeniser_handle_bogus_doctype(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY:
+			cont = hubbub_tokeniser_handle_numbered_entity(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_NAMED_ENTITY:
+			cont = hubbub_tokeniser_handle_named_entity(
+					tokeniser);
+			break;
+		}
+	}
+
+	/* Reached whenever a handler asked to stop; no error is reported
+	 * from here */
+	return HUBBUB_OK;
+}
+
+/**
+ * Handle the data state
+ *
+ * Accumulates ordinary characters into a single pending character run,
+ * stopping at an '&' (PCDATA and RCDATA content models only), at a '<' (any
+ * content model other than PLAINTEXT), or when the input stream has nothing
+ * more to offer. Pending characters are emitted as a character token; at
+ * end of file an EOF token is emitted as well.
+ *
+ * \param tokeniser  The tokeniser instance
+ * \return true to continue tokenising, false if the input stream reported
+ *         end-of-file or out-of-data
+ */
+bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
+{
+	hubbub_token token;
+	uint32_t c;
+
+	/* Clear current characters */
+	tokeniser->context.current_chars.data_off = 0;
+	tokeniser->context.current_chars.len = 0;
+
+	while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
+			HUBBUB_INPUTSTREAM_EOF &&
+			c != HUBBUB_INPUTSTREAM_OOD) {
+		if (c == '&' && (tokeniser->content_model ==
+					HUBBUB_CONTENT_MODEL_PCDATA ||
+				tokeniser->content_model ==
+					HUBBUB_CONTENT_MODEL_RCDATA)) {
+			tokeniser->state =
+					HUBBUB_TOKENISER_STATE_ENTITY_DATA;
+			/* Don't eat the '&'; it'll be handled by
+			 * entity consumption */
+			break;
+		} else if (c == '<' && tokeniser->content_model !=
+				HUBBUB_CONTENT_MODEL_PLAINTEXT) {
+			if (tokeniser->context.current_chars.len > 0) {
+				/* Emit any pending characters */
+				token.type = HUBBUB_TOKEN_CHARACTER;
+				token.data.character =
+					tokeniser->context.current_chars;
+
+				hubbub_tokeniser_emit_token(tokeniser,
+						&token);
+			}
+
+			/* Buffer '<' */
+			tokeniser->context.current_chars.data_off =
+				hubbub_inputstream_cur_pos(tokeniser->input,
+					&tokeniser->context.current_chars.len);
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_OPEN;
+			hubbub_inputstream_advance(tokeniser->input);
+			break;
+		} else {
+			uint32_t pos;
+			size_t len;
+
+			/* Accumulate characters into buffer */
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+
+			if (tokeniser->context.current_chars.len == 0) {
+				tokeniser->context.current_chars.data_off =
+						pos;
+			}
+			/* Grow by the length of this character as reported
+			 * by the input stream (which need not be 1), in
+			 * line with how every other state extends a string */
+			tokeniser->context.current_chars.len += len;
+
+			hubbub_inputstream_advance(tokeniser->input);
+		}
+	}
+
+	/* In the tag open state, current_chars holds the buffered '<',
+	 * which that state will deal with; don't emit it here */
+	if (tokeniser->state != HUBBUB_TOKENISER_STATE_TAG_OPEN &&
+			tokeniser->context.current_chars.len > 0) {
+		/* Emit any pending characters */
+		token.type = HUBBUB_TOKEN_CHARACTER;
+		token.data.character = tokeniser->context.current_chars;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->context.current_chars.data_off = 0;
+		tokeniser->context.current_chars.len = 0;
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_EOF) {
+		token.type = HUBBUB_TOKEN_EOF;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+	}
+
+	return (c != HUBBUB_INPUTSTREAM_EOF && c != HUBBUB_INPUTSTREAM_OOD);
+}
+
+/**
+ * Handle the entity data state
+ *
+ * While entity matching is incomplete, defers to
+ * hubbub_tokeniser_consume_entity(). Once it has completed, emits the
+ * character at the current input position as a character token, consumes
+ * it, and returns to the data state.
+ *
+ * \param tokeniser  The tokeniser instance
+ * \return Whether tokenising should continue
+ */
+bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser)
+{
+	hubbub_token token;
+	uint32_t next;
+
+	/* Matching still in progress: carry on consuming the entity */
+	if (!tokeniser->context.match_entity.complete)
+		return hubbub_tokeniser_consume_entity(tokeniser);
+
+	next = hubbub_inputstream_peek(tokeniser->input);
+	if (next == HUBBUB_INPUTSTREAM_EOF ||
+			next == HUBBUB_INPUTSTREAM_OOD) {
+		/* Should never happen */
+		abort();
+	}
+
+	/* Emit character */
+	token.type = HUBBUB_TOKEN_CHARACTER;
+	token.data.character.data_off = hubbub_inputstream_cur_pos(
+			tokeniser->input, &token.data.character.len);
+
+	hubbub_tokeniser_emit_token(tokeniser, &token);
+
+	/* Reset the flag ready for the next entity */
+	tokeniser->context.match_entity.complete = false;
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the tag open state
+ *
+ * Entered from the data state after a '<' has been consumed and recorded in
+ * context.current_chars. Decides, from the next character and the content
+ * model, whether this is a start tag, a close tag, a markup declaration, a
+ * bogus comment, or just literal text.
+ *
+ * \param tokeniser  The tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t pos;
+	size_t len;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
+			tokeniser->content_model ==
+					HUBBUB_CONTENT_MODEL_CDATA) {
+		/* In RCDATA/CDATA only a close tag is of interest */
+		if (c == '/') {
+			/* Extend the pending "<" to "</" (pos is unused) */
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			tokeniser->context.current_chars.len += len;
+
+			tokeniser->state =
+				HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN;
+
+			hubbub_inputstream_advance(tokeniser->input);
+		} else {
+			hubbub_token token;
+
+			/* Emit '<' */
+			token.type = HUBBUB_TOKEN_CHARACTER;
+			token.data.character =
+					tokeniser->context.current_chars;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			/* c is left unconsumed for the data state */
+			tokeniser->state =
+				HUBBUB_TOKENISER_STATE_DATA;
+		}
+	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
+		if (c == '!') {
+			/* "<!": markup declaration */
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+
+			tokeniser->context.current_chars.len += len;
+
+			tokeniser->state =
+				HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (c == '/') {
+			/* "</": close tag */
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+
+			tokeniser->context.current_chars.len += len;
+
+			tokeniser->state =
+				HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if ('A' <= c && c <= 'Z') {
+			/* Start tag; the name is lowercased in the input
+			 * stream before its position is recorded */
+			hubbub_inputstream_lowercase(tokeniser->input);
+
+			tokeniser->context.current_tag_type =
+					HUBBUB_TOKEN_START_TAG;
+
+			ctag->name.data_off =
+				hubbub_inputstream_cur_pos(tokeniser->input,
+				&ctag->name.len);
+			ctag->n_attributes = 0;
+
+			tokeniser->state =
+				HUBBUB_TOKENISER_STATE_TAG_NAME;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if ('a' <= c && c <= 'z') {
+			/* Start tag whose name already starts lowercase */
+			tokeniser->context.current_tag_type =
+					HUBBUB_TOKEN_START_TAG;
+
+			ctag->name.data_off =
+				hubbub_inputstream_cur_pos(tokeniser->input,
+				&ctag->name.len);
+			ctag->n_attributes = 0;
+
+			tokeniser->state =
+				HUBBUB_TOKENISER_STATE_TAG_NAME;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (c == '>') {
+			hubbub_token token;
+
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			tokeniser->context.current_chars.len += len;
+
+			/* Emit "<>" */
+			token.type = HUBBUB_TOKEN_CHARACTER;
+			token.data.character =
+					tokeniser->context.current_chars;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			tokeniser->state =
+				HUBBUB_TOKENISER_STATE_DATA;
+
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (c == '?') {
+			/* "<?": treated as a bogus comment whose content
+			 * starts at the '?' */
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			tokeniser->context.current_chars.len += len;
+
+			tokeniser->context.current_comment.data_off = pos;
+			tokeniser->context.current_comment.len = len;
+			tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else {
+			hubbub_token token;
+
+			/* Emit '<' */
+			token.type = HUBBUB_TOKEN_CHARACTER;
+			token.data.character =
+					tokeniser->context.current_chars;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			/* c is left unconsumed for the data state */
+			tokeniser->state =
+				HUBBUB_TOKENISER_STATE_DATA;
+		}
+	}
+	/* NOTE(review): any other content model falls through with no state
+	 * change; the data state never enters this state for PLAINTEXT */
+
+	return true;
+}
+
+/**
+ * Handle the close tag open state
+ *
+ * Entered after "</" has been consumed. In the RCDATA and CDATA content
+ * models the work is handed to the close tag match state without reading
+ * any input; in PCDATA the next character decides what follows.
+ *
+ * \param tokeniser  The tokeniser instance
+ * \return false if the input stream is out of data (PCDATA only),
+ *         true otherwise
+ */
+bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
+{
+	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
+			tokeniser->content_model ==
+					HUBBUB_CONTENT_MODEL_CDATA) {
+		/* Start a fresh match against the current tag's name */
+		tokeniser->context.close_tag_match.tag.len = 0;
+		tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH;
+	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
+		hubbub_tag *ctag = &tokeniser->context.current_tag;
+		uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+		uint32_t pos;
+		size_t len;
+
+		if ('A' <= c && c <= 'Z') {
+			/* End tag; lowercase the name's first character in
+			 * the input stream before recording it */
+			hubbub_inputstream_lowercase(tokeniser->input);
+
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+
+			tokeniser->context.current_tag_type =
+					HUBBUB_TOKEN_END_TAG;
+			ctag->name.data_off = pos;
+			ctag->name.len = len;
+			ctag->n_attributes = 0;
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if ('a' <= c && c <= 'z') {
+			/* End tag whose name already starts lowercase */
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+
+			tokeniser->context.current_tag_type =
+					HUBBUB_TOKEN_END_TAG;
+			ctag->name.data_off = pos;
+			ctag->name.len = len;
+			ctag->n_attributes = 0;
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (c == '>') {
+			/* "</>": consumed without emitting anything */
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+			hubbub_token token;
+
+			/* Emit "</" */
+			token.type = HUBBUB_TOKEN_CHARACTER;
+			token.data.character =
+					tokeniser->context.current_chars;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		} else if (c != HUBBUB_INPUTSTREAM_OOD) {
+			/* Anything else starts a bogus comment whose content
+			 * begins at this character */
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+
+			tokeniser->context.current_comment.data_off = pos;
+			tokeniser->context.current_comment.len = len;
+
+			tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else {
+			/* Out of data */
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/**
+ * Handle the close tag match state
+ *
+ * Reached from the close tag open state in the RCDATA and CDATA content
+ * models. Reads as much input as the current tag's name is long and
+ * compares the two case-insensitively; the character after the candidate
+ * name must then be whitespace, '>', '/', '<' or end of file.
+ *
+ * On success the input is rewound to the start of the name, the content
+ * model is switched to PCDATA and the close tag open state is re-entered.
+ * On failure the input is rewound likewise, the pending characters ("</")
+ * are emitted and the data state is re-entered.
+ *
+ * Progress is kept in context.close_tag_match.tag, so the function can be
+ * re-entered after running out of data.
+ *
+ * \param tokeniser  The tokeniser instance
+ * \return false if more input is needed, true otherwise
+ */
+bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = 0;
+
+	/* c stays 0 if the loop body never runs (candidate already as long
+	 * as the tag name), so neither check after the loop fires */
+	while (ctx->close_tag_match.tag.len < ctag->name.len &&
+			(c = hubbub_inputstream_peek(tokeniser->input)) !=
+			HUBBUB_INPUTSTREAM_EOF &&
+			c != HUBBUB_INPUTSTREAM_OOD) {
+		/* Match last open tag */
+		uint32_t off;
+		size_t len;
+
+		off = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		if (ctx->close_tag_match.tag.len == 0) {
+			ctx->close_tag_match.tag.data_off = off;
+			ctx->close_tag_match.tag.len = len;
+		} else {
+			ctx->close_tag_match.tag.len += len;
+		}
+
+		hubbub_inputstream_advance(tokeniser->input);
+
+		/* Mismatch: candidate has overshot the name's length, or
+		 * is the same length but differs */
+		if (ctx->close_tag_match.tag.len > ctag->name.len ||
+			(ctx->close_tag_match.tag.len == ctag->name.len &&
+				hubbub_inputstream_compare_range_ci(
+					tokeniser->input,
+					ctag->name.data_off,
+					ctx->close_tag_match.tag.data_off,
+					ctag->name.len) != 0)) {
+			hubbub_token token;
+
+			/* Rewind input stream to start of tag name */
+			if (hubbub_inputstream_rewind(tokeniser->input,
+					ctx->close_tag_match.tag.len) !=
+					HUBBUB_OK)
+				abort();
+
+			/* Emit "</" */
+			token.type = HUBBUB_TOKEN_CHARACTER;
+			token.data.character =
+					tokeniser->context.current_chars;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+			return true;
+		} else if (ctx->close_tag_match.tag.len == ctag->name.len &&
+				hubbub_inputstream_compare_range_ci(
+					tokeniser->input,
+					ctag->name.data_off,
+					ctx->close_tag_match.tag.data_off,
+					ctag->name.len) == 0) {
+			/* Matched => stop searching */
+			break;
+		}
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_OOD) {
+		/* Need more data */
+		return false;
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_EOF) {
+		/* Ran out of data - parse error */
+		hubbub_token token;
+
+		/* Rewind input stream to start of tag name */
+		if (hubbub_inputstream_rewind(tokeniser->input,
+				ctx->close_tag_match.tag.len) != HUBBUB_OK)
+			abort();
+
+		/* Emit "</" */
+		token.type = HUBBUB_TOKEN_CHARACTER;
+		token.data.character = tokeniser->context.current_chars;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		return true;
+	}
+
+	/* Match following char */
+	c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD) {
+		/* Need more data */
+		return false;
+	}
+
+	/* Rewind input stream to start of tag name */
+	if (hubbub_inputstream_rewind(tokeniser->input,
+			ctx->close_tag_match.tag.len) != HUBBUB_OK)
+		abort();
+
+	/* Check that following char was valid */
+	if (c != '\t' && c != '\n' && c != '\v' && c != '\f' &&
+			c != ' ' && c != '>' && c != '/' && c != '<' &&
+			c != HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit "</" */
+		token.type = HUBBUB_TOKEN_CHARACTER;
+		token.data.character = tokeniser->context.current_chars;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		return true;
+	}
+
+	/* Switch the content model back to PCDATA */
+	tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
+
+	/* Finally, transition back to close tag open state */
+	tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN;
+
+	return true;
+}
+
+/**
+ * Handle the tag name state
+ *
+ * Looks at one character: whitespace or '/' ends the name and moves on to
+ * attributes; '>' (consumed), or '<' / end of file (left in place), emit the
+ * current tag and return to the data state; anything else extends the tag
+ * name, uppercase ASCII letters being lowercased in the input stream first.
+ *
+ * \param tokeniser  The tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *tag = &tokeniser->context.current_tag;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	bool extend_name = false;	/* Append current char to name */
+	bool emit_tag = false;		/* Emit tag, go to data state */
+	bool consume = true;		/* Advance past current char */
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Classify the character */
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		tokeniser->state =
+			HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+	} else if (c == '>') {
+		emit_tag = true;
+	} else if ('A' <= c && c <= 'Z') {
+		hubbub_inputstream_lowercase(tokeniser->input);
+		extend_name = true;
+	} else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) {
+		emit_tag = true;
+		consume = false;
+	} else if (c == '/') {
+		/** \todo permitted slash */
+		tokeniser->state =
+			HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+	} else {
+		extend_name = true;
+	}
+
+	/* Act on the classification */
+	if (extend_name) {
+		size_t len;
+
+		(void) hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		tag->name.len += len;
+	}
+
+	if (emit_tag) {
+		hubbub_token token;
+
+		/* Emit current tag */
+		token.type = tokeniser->context.current_tag_type;
+		token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	}
+
+	if (consume)
+		hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Append a new attribute to the current tag
+ *
+ * The attribute's name starts out as the character at the current input
+ * position; its value starts out empty.
+ *
+ * \param tokeniser  The tokeniser instance
+ * \return true on success, false if the attribute array could not be
+ *         grown (in which case the current tag is left untouched)
+ */
+static bool hubbub_tokeniser_begin_attribute(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	hubbub_attribute *attr;
+	uint32_t pos;
+	size_t len;
+
+	pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+	attr = tokeniser->alloc(ctag->attributes,
+			(ctag->n_attributes + 1) *
+				sizeof(hubbub_attribute),
+			tokeniser->alloc_pw);
+	if (attr == NULL) {
+		/* Keep ctag->attributes pointing at the existing array
+		 * (assumes the allocator, like realloc, leaves the old
+		 * block intact on failure -- TODO confirm) */
+		return false;
+	}
+
+	ctag->attributes = attr;
+
+	attr[ctag->n_attributes].name.data_off = pos;
+	attr[ctag->n_attributes].name.len = len;
+	attr[ctag->n_attributes].value.data_off = 0;
+	attr[ctag->n_attributes].value.len = 0;
+
+	ctag->n_attributes++;
+
+	return true;
+}
+
+/**
+ * Handle the before attribute name state
+ *
+ * Skips whitespace and '/'; '>' (consumed), or '<' / end of file (left in
+ * place), emit the current tag and return to the data state; any other
+ * character starts a new attribute, uppercase ASCII letters being
+ * lowercased in the input stream first.
+ *
+ * \param tokeniser  The tokeniser instance
+ * \return false if the input stream is out of data or memory for a new
+ *         attribute could not be obtained, true otherwise
+ *
+ * On memory exhaustion neither the state nor the input position changes,
+ * so a later run of the tokeniser retries the same character.
+ */
+bool hubbub_tokeniser_handle_before_attribute_name(
+		hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>') {
+		hubbub_token token;
+
+		/* Emit current tag */
+		token.type = tokeniser->context.current_tag_type;
+		token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if ('A' <= c && c <= 'Z') {
+		/* Lowercasing an already lowercased character on a retry
+		 * is harmless */
+		hubbub_inputstream_lowercase(tokeniser->input);
+
+		if (hubbub_tokeniser_begin_attribute(tokeniser) == false) {
+			/* Memory exhausted: stop processing for now */
+			return false;
+		}
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '/') {
+		/** \todo permitted slash */
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit current tag */
+		token.type = tokeniser->context.current_tag_type;
+		token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else {
+		if (hubbub_tokeniser_begin_attribute(tokeniser) == false) {
+			/* Memory exhausted: stop processing for now */
+			return false;
+		}
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+/**
+ * Handle the attribute name state
+ *
+ * Looks at one character: whitespace moves to the after attribute name
+ * state; '=' moves to the before attribute value state; '/' moves back to
+ * the before attribute name state; '>' (consumed), or '<' / end of file
+ * (left in place), emit the current tag and return to the data state;
+ * anything else extends the name of the tag's most recently added
+ * attribute, uppercase ASCII letters being lowercased in the input stream
+ * first.
+ *
+ * \param tokeniser  The tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *tag = &tokeniser->context.current_tag;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	bool extend_name = false;	/* Append current char to name */
+	bool emit_tag = false;		/* Emit tag, go to data state */
+	bool consume = true;		/* Advance past current char */
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Classify the character */
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME;
+	} else if (c == '=') {
+		tokeniser->state =
+			HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE;
+	} else if (c == '>') {
+		emit_tag = true;
+	} else if ('A' <= c && c <= 'Z') {
+		hubbub_inputstream_lowercase(tokeniser->input);
+		extend_name = true;
+	} else if (c == '/') {
+		/** \todo permitted slash */
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+	} else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) {
+		emit_tag = true;
+		consume = false;
+	} else {
+		extend_name = true;
+	}
+
+	/* Act on the classification */
+	if (extend_name) {
+		size_t len;
+
+		(void) hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		tag->attributes[tag->n_attributes - 1].name.len += len;
+	}
+
+	if (emit_tag) {
+		hubbub_token token;
+
+		/* Emit current tag */
+		token.type = tokeniser->context.current_tag_type;
+		token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	}
+
+	if (consume)
+		hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "after attribute name" tokeniser state
+ *
+ * Skips whitespace, reacts to '=', '>', '/', '<' and EOF, and treats any
+ * other character as the first character of a new attribute, which is
+ * appended to the current tag's attribute array.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_after_attribute_name(
+		hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	hubbub_attribute *grown;
+	uint32_t start;
+	size_t clen;
+	bool upper;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		/* Still between name and whatever follows */
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '=') {
+		tokeniser->state =
+			HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '>') {
+		/* Tag is finished: emit it and consume the '>' */
+		hubbub_token tag_token;
+
+		tag_token.type = tokeniser->context.current_tag_type;
+		tag_token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tag_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	/* Uppercase letters take priority over the checks below */
+	upper = ('A' <= c && c <= 'Z');
+
+	if (!upper && c == '/') {
+		/** \todo permitted slash */
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (!upper && (c == '<' || c == HUBBUB_INPUTSTREAM_EOF)) {
+		/* Emit the tag as it stands; the character is not consumed */
+		hubbub_token tag_token;
+
+		tag_token.type = tokeniser->context.current_tag_type;
+		tag_token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tag_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* Anything else begins a new attribute */
+	hubbub_inputstream_lowercase(tokeniser->input);
+
+	start = hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+
+	/* Grow the attribute array by one entry */
+	grown = tokeniser->alloc(ctag->attributes,
+			(ctag->n_attributes + 1) *
+				sizeof(hubbub_attribute),
+			tokeniser->alloc_pw);
+	if (grown == NULL) {
+		/** \todo handle memory exhaustion */
+	}
+
+	ctag->attributes = grown;
+
+	/* New entry: name is the current character, value is empty */
+	grown[ctag->n_attributes].name.data_off = start;
+	grown[ctag->n_attributes].name.len = clen;
+	grown[ctag->n_attributes].value.data_off = 0;
+	grown[ctag->n_attributes].value.len = 0;
+
+	ctag->n_attributes++;
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "before attribute value" tokeniser state
+ *
+ * Skips whitespace and selects the quoted or unquoted value state
+ * according to the first significant character.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_before_attribute_value(
+		hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	hubbub_attribute *cur;
+	uint32_t start;
+	size_t clen;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '"') {
+		/* Opening double quote is consumed, not stored */
+		tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '&') {
+		/* Leave the '&' in place for the unquoted value state */
+		tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ;
+		return true;
+	}
+
+	if (c == '\'') {
+		/* Opening single quote is consumed, not stored */
+		tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '>') {
+		/* Tag is finished: emit it and consume the '>' */
+		hubbub_token tag_token;
+
+		tag_token.type = tokeniser->context.current_tag_type;
+		tag_token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tag_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) {
+		/* Emit the tag as it stands; the character is not consumed */
+		hubbub_token tag_token;
+
+		tag_token.type = tokeniser->context.current_tag_type;
+		tag_token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tag_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* First character of an unquoted value */
+	start = hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+
+	cur = &ctag->attributes[ctag->n_attributes - 1];
+	cur->value.data_off = start;
+	cur->value.len = clen;
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "attribute value (double-quoted)" tokeniser state
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	hubbub_attribute *cur;
+	uint32_t start;
+	size_t clen;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '"') {
+		/* Closing quote: value is complete */
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '&') {
+		/* Remember where to come back to after the entity */
+		tokeniser->context.prev_state = tokeniser->state;
+		tokeniser->state =
+			HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE;
+		/* Don't eat the '&'; entity consumption handles this */
+		return true;
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_EOF) {
+		/* Input ended inside the value: emit the tag as it stands */
+		hubbub_token tag_token;
+
+		tag_token.type = tokeniser->context.current_tag_type;
+		tag_token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tag_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* Ordinary value character */
+	start = hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+
+	cur = &ctag->attributes[ctag->n_attributes - 1];
+
+	/* An empty value has no start offset yet */
+	if (cur->value.len == 0)
+		cur->value.data_off = start;
+
+	cur->value.len += clen;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "attribute value (single-quoted)" tokeniser state
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	hubbub_attribute *cur;
+	uint32_t start;
+	size_t clen;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\'') {
+		/* Closing quote: value is complete */
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '&') {
+		/* Remember where to come back to after the entity */
+		tokeniser->context.prev_state = tokeniser->state;
+		tokeniser->state =
+			HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE;
+		/* Don't eat the '&'; entity consumption handles this */
+		return true;
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_EOF) {
+		/* Input ended inside the value: emit the tag as it stands */
+		hubbub_token tag_token;
+
+		tag_token.type = tokeniser->context.current_tag_type;
+		tag_token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tag_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* Ordinary value character */
+	start = hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+
+	cur = &ctag->attributes[ctag->n_attributes - 1];
+
+	/* An empty value has no start offset yet */
+	if (cur->value.len == 0)
+		cur->value.data_off = start;
+
+	cur->value.len += clen;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "attribute value (unquoted)" tokeniser state
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	hubbub_attribute *cur;
+	uint32_t start;
+	size_t clen;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		/* Whitespace ends an unquoted value */
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '&') {
+		/* Remember where to come back to after the entity */
+		tokeniser->context.prev_state = tokeniser->state;
+		tokeniser->state =
+			HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE;
+		/* Don't eat the '&'; entity consumption handles this */
+		return true;
+	}
+
+	if (c == '>') {
+		/* Tag is finished: emit it and consume the '>' */
+		hubbub_token tag_token;
+
+		tag_token.type = tokeniser->context.current_tag_type;
+		tag_token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tag_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) {
+		/* Emit the tag as it stands; the character is not consumed */
+		hubbub_token tag_token;
+
+		tag_token.type = tokeniser->context.current_tag_type;
+		tag_token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tag_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* Ordinary value character */
+	start = hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+
+	cur = &ctag->attributes[ctag->n_attributes - 1];
+
+	/* An empty value has no start offset yet */
+	if (cur->value.len == 0)
+		cur->value.data_off = start;
+
+	cur->value.len += clen;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "entity in attribute value" tokeniser state
+ *
+ * Until entity matching has been flagged complete this defers to
+ * hubbub_tokeniser_consume_entity(). Once complete, the character at the
+ * current stream position is appended to the current attribute's value and
+ * the state saved in context.prev_state is restored.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return Result of hubbub_tokeniser_consume_entity() while matching is
+ *         incomplete, true afterwards
+ */
+bool hubbub_tokeniser_handle_entity_in_attribute_value(
+		hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	hubbub_attribute *cur;
+	uint32_t c;
+	uint32_t start;
+	size_t clen;
+
+	if (!tokeniser->context.match_entity.complete)
+		return hubbub_tokeniser_consume_entity(tokeniser);
+
+	c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD || c == HUBBUB_INPUTSTREAM_EOF) {
+		/* Should never happen */
+		abort();
+	}
+
+	start = hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+
+	cur = &ctag->attributes[ctag->n_attributes - 1];
+
+	/* An empty value has no start offset yet */
+	if (cur->value.len == 0)
+		cur->value.data_off = start;
+
+	cur->value.len += clen;
+
+	/* Reset for next time */
+	tokeniser->context.match_entity.complete = false;
+
+	/* And back to the previous state */
+	tokeniser->state = tokeniser->context.prev_state;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "bogus comment" tokeniser state
+ *
+ * Accumulates characters into the current comment until '>' or EOF is
+ * seen, then emits a comment token and returns to the data state.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the stream reported HUBBUB_INPUTSTREAM_OOD before the
+ *         comment ended, true once the comment has been emitted
+ */
+bool hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
+{
+	hubbub_token comment_token;
+	uint32_t c;
+
+	for (;;) {
+		uint32_t start;
+		size_t clen;
+
+		c = hubbub_inputstream_peek(tokeniser->input);
+		if (c == HUBBUB_INPUTSTREAM_EOF ||
+				c == HUBBUB_INPUTSTREAM_OOD)
+			break;
+
+		if (c == '>') {
+			/* Terminator is consumed but not stored */
+			hubbub_inputstream_advance(tokeniser->input);
+			break;
+		}
+
+		start = hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+
+		/* First stored character fixes the comment's offset */
+		if (tokeniser->context.current_comment.len == 0)
+			tokeniser->context.current_comment.data_off = start;
+		tokeniser->context.current_comment.len += clen;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	/* What has been accumulated stays in the context for the retry */
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Emit comment */
+	comment_token.type = HUBBUB_TOKEN_COMMENT;
+	comment_token.data.comment = tokeniser->context.current_comment;
+
+	hubbub_tokeniser_emit_token(tokeniser, &comment_token);
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+	return true;
+}
+
+/**
+ * Handle the "markup declaration open" tokeniser state
+ *
+ * Chooses between the start of a comment ('-'), the start of a DOCTYPE
+ * keyword ('D' or 'd') and a bogus comment (anything else).
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_markup_declaration_open(
+		hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '-') {
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_START;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	/* Clearing bit 0x20 makes the comparison case-insensitive */
+	if ((c & ~0x20) == 'D') {
+		hubbub_inputstream_uppercase(tokeniser->input);
+		/* One character of the keyword has been matched */
+		tokeniser->context.match_doctype.count = 1;
+		tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	/* Anything else: start an empty bogus comment, consuming nothing */
+	tokeniser->context.current_comment.data_off = 0;
+	tokeniser->context.current_comment.len = 0;
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+
+	return true;
+}
+
+/**
+ * Handle the "comment start" tokeniser state
+ *
+ * Entered after a single '-' has been consumed. A second '-' opens a real
+ * comment; anything else pushes the '-' back and falls into the bogus
+ * comment state.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_comment_start(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Either way the comment starts out empty */
+	tokeniser->context.current_comment.len = 0;
+	tokeniser->context.current_comment.data_off = 0;
+
+	if (c != '-') {
+		hubbub_inputstream_push_back(tokeniser->input, '-');
+		tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+		return true;
+	}
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT;
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "comment" tokeniser state
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	uint32_t start;
+	size_t clen;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '-') {
+		/* Possibly the start of the closing "--" */
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_DASH;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_EOF) {
+		/* Input ended inside the comment: emit what we have */
+		hubbub_token comment_token;
+
+		comment_token.type = HUBBUB_TOKEN_COMMENT;
+		comment_token.data.comment =
+				tokeniser->context.current_comment;
+
+		hubbub_tokeniser_emit_token(tokeniser, &comment_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* Ordinary comment character */
+	start = hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+
+	/* First stored character fixes the comment's offset */
+	if (tokeniser->context.current_comment.len == 0)
+		tokeniser->context.current_comment.data_off = start;
+	tokeniser->context.current_comment.len += clen;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "comment dash" tokeniser state
+ *
+ * Entered after one '-' has been consumed inside a comment.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_comment_dash(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	uint32_t start;
+	size_t clen;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '-') {
+		/* Second dash in a row */
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_EOF) {
+		/* Input ended inside the comment: emit what we have */
+		hubbub_token comment_token;
+
+		comment_token.type = HUBBUB_TOKEN_COMMENT;
+		comment_token.data.comment =
+				tokeniser->context.current_comment;
+
+		hubbub_tokeniser_emit_token(tokeniser, &comment_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* The dash was ordinary comment text after all */
+	start = hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+
+	if (tokeniser->context.current_comment.len != 0) {
+		/* Measure from the comment's start so that the skipped
+		 * '-' is included in the length */
+		clen += start - tokeniser->context.current_comment.data_off;
+	} else {
+		tokeniser->context.current_comment.data_off = start;
+	}
+
+	tokeniser->context.current_comment.len = clen;
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "comment end" tokeniser state
+ *
+ * Entered after "--" has been consumed inside a comment. A '>' closes the
+ * comment; anything else extends it.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	uint32_t start;
+	size_t clen;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '>') {
+		/* Comment is closed: emit it and consume the '>' */
+		hubbub_token comment_token;
+
+		comment_token.type = HUBBUB_TOKEN_COMMENT;
+		comment_token.data.comment =
+				tokeniser->context.current_comment;
+
+		hubbub_tokeniser_emit_token(tokeniser, &comment_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '-') {
+		/* Yet another dash: remain in this state */
+		start = hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+
+		if (tokeniser->context.current_comment.len != 0) {
+			/* NOTE(review): this is the distance up to, but not
+			 * including, the current '-'; the other branches
+			 * add the current character's length as well.
+			 * Confirm that this is intended. */
+			clen = start -
+				tokeniser->context.current_comment.data_off;
+		} else {
+			tokeniser->context.current_comment.data_off = start;
+		}
+
+		tokeniser->context.current_comment.len = clen;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_EOF) {
+		/* Input ended inside the comment: emit what we have */
+		hubbub_token comment_token;
+
+		comment_token.type = HUBBUB_TOKEN_COMMENT;
+		comment_token.data.comment =
+				tokeniser->context.current_comment;
+
+		hubbub_tokeniser_emit_token(tokeniser, &comment_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* The "--" was ordinary comment text after all */
+	start = hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+
+	if (tokeniser->context.current_comment.len != 0) {
+		/* Measure from the comment's start so that the skipped
+		 * '--' is included in the length */
+		clen += start - tokeniser->context.current_comment.data_off;
+	} else {
+		tokeniser->context.current_comment.data_off = start;
+	}
+
+	tokeniser->context.current_comment.len = clen;
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "match doctype" tokeniser state
+ *
+ * Matches the keyword "DOCTYPE" case-insensitively, one character per
+ * call, uppercasing each matched character in the stream. On a mismatch
+ * the characters matched so far are pushed back and the input is treated
+ * as a bogus comment.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
+{
+	/* keyword[n] is the character expected after n matched characters */
+	static const char keyword[] = "DOCTYPE";
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	bool in_range;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Only counts 1..6 take part in matching or push-back */
+	in_range = (ctx->match_doctype.count >= 1 &&
+			ctx->match_doctype.count <= 6);
+
+	/* Clearing bit 0x20 makes the comparison case-insensitive */
+	if (in_range &&
+			(c & ~0x20) == keyword[ctx->match_doctype.count]) {
+		hubbub_inputstream_uppercase(tokeniser->input);
+
+		if (ctx->match_doctype.count == 6) {
+			/* Final 'E': the whole keyword has been seen */
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE;
+		} else {
+			ctx->match_doctype.count++;
+		}
+
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (in_range) {
+		uint32_t i;
+
+		/* Push the matched prefix back, last character first */
+		for (i = ctx->match_doctype.count; i > 0; i--) {
+			hubbub_inputstream_push_back(tokeniser->input,
+					keyword[i - 1]);
+		}
+	}
+
+	ctx->current_comment.data_off = 0;
+	ctx->current_comment.len = 0;
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+
+	return true;
+}
+
+/**
+ * Handle the "doctype" tokeniser state
+ *
+ * Consumes at most one whitespace character and moves on to the
+ * "before doctype name" state.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	bool is_space;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	is_space = (c == '\t' || c == '\n' || c == '\v' ||
+			c == '\f' || c == ' ');
+
+	/* A non-space character is left for the next state to look at */
+	if (is_space)
+		hubbub_inputstream_advance(tokeniser->input);
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME;
+
+	return true;
+}
+
+/**
+ * Handle the "before doctype name" tokeniser state
+ *
+ * Skips whitespace. The first other character starts the DOCTYPE name
+ * (lowercase ASCII letters are uppercased in the stream first). If '>' or
+ * EOF arrives before any name character, a DOCTYPE token with an empty
+ * name and correct == false is emitted.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_before_doctype_name(
+		hubbub_tokeniser *tokeniser)
+{
+	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	uint32_t pos;
+	size_t len;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '>' || c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* No name was seen. Clear the context's doctype first so
+		 * that fields left over from an earlier DOCTYPE in the same
+		 * document are not emitted with this one. */
+		cdoc->name.data_off = 0;
+		cdoc->name.len = 0;
+		cdoc->correct = false;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		/* Only the '>' is consumed; EOF is left in place */
+		if (c == '>')
+			hubbub_inputstream_advance(tokeniser->input);
+
+		return true;
+	}
+
+	/* First character of the name; fold lowercase ASCII to uppercase */
+	if ('a' <= c && c <= 'z')
+		hubbub_inputstream_uppercase(tokeniser->input);
+
+	pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+	cdoc->name.data_off = pos;
+	cdoc->name.len = len;
+	/* The emitting states work out correctness when '>' is seen */
+	cdoc->correct = false;
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_NAME;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "doctype name" tokeniser state
+ *
+ * Extends the DOCTYPE name one character per call. When '>' is seen the
+ * emitted token's "correct" flag is set according to whether the name
+ * compares equal to "HTML".
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	size_t clen;
+	bool lower;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		/* Whitespace ends the name */
+		tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '>') {
+		hubbub_token doctype_token;
+
+		/* Emit doctype */
+		doctype_token.type = HUBBUB_TOKEN_DOCTYPE;
+		doctype_token.data.doctype =
+				tokeniser->context.current_doctype;
+		doctype_token.data.doctype.correct =
+			(hubbub_inputstream_compare_range_ascii(
+				tokeniser->input,
+				doctype_token.data.doctype.name.data_off,
+				doctype_token.data.doctype.name.len,
+				"HTML", SLEN("HTML")) == 0);
+
+		hubbub_tokeniser_emit_token(tokeniser, &doctype_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	/* Lowercase letters take priority over the EOF check */
+	lower = ('a' <= c && c <= 'z');
+
+	if (!lower && c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token doctype_token;
+
+		/* Emit doctype */
+		doctype_token.type = HUBBUB_TOKEN_DOCTYPE;
+		doctype_token.data.doctype =
+				tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &doctype_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* Name character; fold lowercase ASCII to uppercase first */
+	if (lower)
+		hubbub_inputstream_uppercase(tokeniser->input);
+
+	/* Only the character's length is needed, not its offset */
+	(void) hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+
+	cdoc->name.len += clen;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "after doctype name" tokeniser state
+ *
+ * Skips whitespace, emits the DOCTYPE on '>' or EOF, and treats anything
+ * else as the start of a bogus DOCTYPE.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_after_doctype_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == '>') {
+		hubbub_token doctype_token;
+
+		/* Emit doctype; "correct" means the name is "HTML" */
+		doctype_token.type = HUBBUB_TOKEN_DOCTYPE;
+		doctype_token.data.doctype =
+				tokeniser->context.current_doctype;
+		doctype_token.data.doctype.correct =
+			(hubbub_inputstream_compare_range_ascii(
+				tokeniser->input,
+				doctype_token.data.doctype.name.data_off,
+				doctype_token.data.doctype.name.len,
+				"HTML", SLEN("HTML")) == 0);
+
+		hubbub_tokeniser_emit_token(tokeniser, &doctype_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token doctype_token;
+
+		/* Emit doctype */
+		doctype_token.type = HUBBUB_TOKEN_DOCTYPE;
+		doctype_token.data.doctype =
+				tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &doctype_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* Unexpected content after the name */
+	cdoc->correct = false;
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "bogus doctype" tokeniser state
+ *
+ * Discards characters until '>' or EOF, then emits the current DOCTYPE.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '>') {
+		hubbub_token doctype_token;
+
+		/* Emit doctype and consume the '>' */
+		doctype_token.type = HUBBUB_TOKEN_DOCTYPE;
+		doctype_token.data.doctype =
+				tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &doctype_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token doctype_token;
+
+		/* Emit doctype; EOF is not consumed */
+		doctype_token.type = HUBBUB_TOKEN_DOCTYPE;
+		doctype_token.data.doctype =
+				tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &doctype_token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* Everything else is skipped */
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Begin consuming an entity at the current stream position
+ *
+ * On first entry the entity-matching context is initialised from the
+ * current character and that character is consumed. The following
+ * character then selects the numbered ('#') or named entity state.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the peek reported HUBBUB_INPUTSTREAM_OOD, true otherwise
+ */
+bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	uint32_t c;
+	size_t clen;
+
+	if (!ctx->match_entity.done_setup) {
+		uint32_t start = hubbub_inputstream_cur_pos(
+				tokeniser->input, &clen);
+
+		/* The matched range starts at the current character */
+		ctx->match_entity.str.data_off = start;
+		ctx->match_entity.str.len = clen;
+		ctx->match_entity.base = 0;
+		ctx->match_entity.codepoint = 0;
+		ctx->match_entity.had_data = false;
+		/* State to resume once the entity has been handled */
+		ctx->match_entity.return_state = tokeniser->state;
+		ctx->match_entity.complete = false;
+		ctx->match_entity.done_setup = true;
+		ctx->match_entity.context = NULL;
+		ctx->match_entity.prev_len = clen;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	c = hubbub_inputstream_peek(tokeniser->input);
+
+	/* done_setup stays set, so a retry skips the setup above */
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c != '#') {
+		/* Not consumed: the named entity state examines it */
+		tokeniser->state = HUBBUB_TOKENISER_STATE_NAMED_ENTITY;
+		return true;
+	}
+
+	/* '#' joins the matched range; only its length is needed */
+	(void) hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+
+	ctx->match_entity.str.len += clen;
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY;
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the "numbered entity" tokeniser state
+ *
+ * Parses a decimal or hexadecimal numeric character reference, rewinds
+ * the stream to the start of the matched sequence and, if any digits were
+ * seen, replaces the matched range with the resulting codepoint.
+ *
+ * Values 0x80..0x9F are mapped through cp1252Table; zero and values above
+ * 0x10FFFF become U+FFFD. Accumulation stops growing once the value has
+ * passed 0x10FFFF, so an over-long digit run cannot wrap round to a
+ * valid-looking codepoint.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the stream reported HUBBUB_INPUTSTREAM_OOD, true once
+ *         the entity has been processed
+ */
+bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	size_t len;
+	hubbub_error error;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Base is still zero on first entry; decide it exactly once */
+	if (ctx->match_entity.base == 0) {
+		if ((c & ~0x20) == 'X') {
+			ctx->match_entity.base = 16;
+
+			(void) hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->match_entity.str.len += len;
+
+			hubbub_inputstream_advance(tokeniser->input);
+		} else {
+			ctx->match_entity.base = 10;
+		}
+	}
+
+	while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
+			HUBBUB_INPUTSTREAM_EOF &&
+			c != HUBBUB_INPUTSTREAM_OOD) {
+		uint32_t digit;
+
+		if (ctx->match_entity.base == 10 &&
+				('0' <= c && c <= '9')) {
+			digit = c - '0';
+		} else if (ctx->match_entity.base == 16 &&
+				('0' <= c && c <= '9')) {
+			digit = c - '0';
+		} else if (ctx->match_entity.base == 16 &&
+				('A' <= (c & ~0x20) &&
+						(c & ~0x20) <= 'F')) {
+			digit = (c & ~0x20) - 'A' + 10;
+		} else {
+			break;
+		}
+
+		ctx->match_entity.had_data = true;
+
+		/* Once the value has left the Unicode range it is replaced
+		 * by U+FFFD below, so stop accumulating to avoid integer
+		 * wrap-around. The largest value reached here is
+		 * 0x10FFFF * 16 + 15, which fits in 32 bits. */
+		if (ctx->match_entity.codepoint <= 0x10FFFF) {
+			ctx->match_entity.codepoint =
+				ctx->match_entity.codepoint *
+					ctx->match_entity.base + digit;
+		}
+
+		(void) hubbub_inputstream_cur_pos(tokeniser->input, &len);
+		ctx->match_entity.str.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	/* Progress so far is kept in the context for the retry */
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Eat trailing semicolon, if any */
+	if (c == ';') {
+		(void) hubbub_inputstream_cur_pos(tokeniser->input, &len);
+		ctx->match_entity.str.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	/* Rewind the inputstream to start of matched sequence */
+	hubbub_inputstream_rewind(tokeniser->input,
+			ctx->match_entity.str.len);
+
+	if (ctx->match_entity.had_data) {
+		/* Had data, so calculate final codepoint */
+		if (0x80 <= ctx->match_entity.codepoint &&
+				ctx->match_entity.codepoint <= 0x9F) {
+			ctx->match_entity.codepoint =
+				cp1252Table[ctx->match_entity.codepoint -
+						0x80];
+		} else if (ctx->match_entity.codepoint == 0 ||
+				ctx->match_entity.codepoint > 0x10FFFF) {
+			ctx->match_entity.codepoint = 0xFFFD;
+		}
+
+		/* And replace the matched range with it */
+		error = hubbub_inputstream_replace_range(tokeniser->input,
+				ctx->match_entity.str.data_off,
+				ctx->match_entity.str.len,
+				ctx->match_entity.codepoint);
+		if (error != HUBBUB_OK) {
+			/** \todo handle memory exhaustion */
+		}
+	}
+
+	/* Reset for next time */
+	ctx->match_entity.done_setup = false;
+
+	/* Flag completion */
+	ctx->match_entity.complete = true;
+
+	/* And back to the state we were entered in */
+	tokeniser->state = ctx->match_entity.return_state;
+
+	return true;
+}
+
+/**
+ * Handle the "named entity" tokeniser state
+ *
+ * Feeds characters to hubbub_entities_search_step() until it reports that
+ * no further match is possible, remembering the most recent successful
+ * match. The stream is then rewound to the start of the processed
+ * sequence and, if a match was found, the matched range is replaced with
+ * its codepoint.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the stream reported HUBBUB_INPUTSTREAM_OOD, true once
+ *         the entity has been processed
+ */
+bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	uint32_t c;
+	size_t clen;
+	hubbub_error error;
+
+	for (;;) {
+		uint32_t cp;
+
+		c = hubbub_inputstream_peek(tokeniser->input);
+		if (c == HUBBUB_INPUTSTREAM_EOF ||
+				c == HUBBUB_INPUTSTREAM_OOD)
+			break;
+
+		/* Entity names are ASCII only */
+		if (c > 0x7F)
+			break;
+
+		error = hubbub_entities_search_step((uint8_t) c,
+				&cp,
+				&ctx->match_entity.context);
+
+		/* No further matches - use last found */
+		if (error == HUBBUB_INVALID)
+			break;
+
+		/* The character is part of the processed sequence */
+		(void) hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+		ctx->match_entity.str.len += clen;
+
+		if (error == HUBBUB_OK) {
+			/* Had a match - store it, and the length matched
+			 * so far, for the replacement below */
+			ctx->match_entity.codepoint = cp;
+			ctx->match_entity.prev_len =
+					ctx->match_entity.str.len;
+		}
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	/* Progress so far is kept in the context for the retry */
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* A semicolon directly after the last match joins the replaced
+	 * range; the stream is not advanced over it here */
+	if (ctx->match_entity.codepoint != 0 && c == ';' &&
+			ctx->match_entity.prev_len ==
+				ctx->match_entity.str.len) {
+		(void) hubbub_inputstream_cur_pos(tokeniser->input, &clen);
+		ctx->match_entity.prev_len += clen;
+	}
+
+	/* Rewind the inputstream to start of processed sequence */
+	hubbub_inputstream_rewind(tokeniser->input,
+			ctx->match_entity.str.len);
+
+	/* Now, replace range, if we found a named entity */
+	if (ctx->match_entity.codepoint != 0) {
+		error = hubbub_inputstream_replace_range(tokeniser->input,
+				ctx->match_entity.str.data_off,
+				ctx->match_entity.prev_len,
+				ctx->match_entity.codepoint);
+		if (error != HUBBUB_OK) {
+			/** \todo handle memory exhaustion */
+		}
+	}
+
+	/* Reset for next time */
+	ctx->match_entity.done_setup = false;
+
+	/* Flag completion */
+	ctx->match_entity.complete = true;
+
+	/* And back to the state from whence we came */
+	tokeniser->state = ctx->match_entity.return_state;
+
+	return true;
+}
+
+/**
+ * Input stream "buffer moved" callback
+ *
+ * Records the new buffer location and length in the tokeniser, then
+ * forwards the notification to the client's buffer handler, if one is
+ * registered.
+ *
+ * \param buffer  Pointer to buffer
+ * \param len     Length of data in buffer (bytes)
+ * \param pw      Pointer to our context (the hubbub_tokeniser)
+ */
+void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer,
+		size_t len, void *pw)
+{
+	hubbub_tokeniser *tok = pw;
+
+	tok->input_buffer_len = len;
+	tok->input_buffer = buffer;
+
+	/* Nobody to notify */
+	if (tok->buffer_handler == NULL)
+		return;
+
+	tok->buffer_handler(buffer, len, tok->buffer_pw);
+}
+
+/**
+ * Emit a token, performing sanity checks if necessary
+ *
+ * For start and end tag tokens, attributes whose names duplicate an
+ * earlier attribute are removed in place from the token's attribute
+ * array before the token is passed on; the first occurrence of each name
+ * is the one kept. token->data.tag.n_attributes is updated to match.
+ *
+ * Nothing happens if either argument is NULL or if no token handler has
+ * been registered.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \param token      Token to emit
+ */
+void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
+		hubbub_token *token)
+{
+	if (tokeniser == NULL || token == NULL)
+		return;
+
+	/* Nothing to do if there's no registered handler */
+	if (tokeniser->token_handler == NULL)
+		return;
+
+	if (token->type == HUBBUB_TOKEN_START_TAG ||
+			token->type == HUBBUB_TOKEN_END_TAG) {
+		uint32_t i, j;
+		uint32_t n_attributes = token->data.tag.n_attributes;
+		hubbub_attribute *attrs =
+				token->data.tag.attributes;
+
+		/* Discard duplicate attributes: for each attribute, remove
+		 * every later attribute with the same name */
+		for (i = 0; i < n_attributes; i++) {
+			j = i + 1;
+
+			while (j < n_attributes) {
+				if (attrs[i].name.len !=
+							attrs[j].name.len ||
+					hubbub_inputstream_compare_range_cs(
+						tokeniser->input,
+						attrs[i].name.data_off,
+						attrs[j].name.data_off,
+						attrs[i].name.len) != 0) {
+					/* Attributes don't match */
+					j++;
+					continue;
+				}
+
+				/* Close the gap left by the duplicate */
+				if (j + 1 < n_attributes) {
+					memmove(&attrs[j], &attrs[j + 1],
+						(n_attributes - 1 - j) *
+						sizeof(hubbub_attribute));
+				}
+
+				/* And reduce the number of attributes.
+				 * j is left alone, so that the entry which
+				 * has just moved into this slot is compared
+				 * as well. */
+				n_attributes--;
+			}
+		}
+
+		token->data.tag.n_attributes = n_attributes;
+	}
+
+	/* Finally, emit token */
+	tokeniser->token_handler(token, tokeniser->token_pw);
+}
diff --git a/src/tokeniser/tokeniser.h b/src/tokeniser/tokeniser.h
new file mode 100644
index 0000000..20bbe20
--- /dev/null
+++ b/src/tokeniser/tokeniser.h
@@ -0,0 +1,71 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_tokeniser_tokeniser_h_
+#define hubbub_tokeniser_tokeniser_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+#include "input/inputstream.h"
+
+typedef struct hubbub_tokeniser hubbub_tokeniser;
+
+/**
+ * Hubbub tokeniser option types (the type passed to hubbub_tokeniser_setopt)
+ */
+typedef enum hubbub_tokeniser_opttype {
+	HUBBUB_TOKENISER_TOKEN_HANDLER,
+	HUBBUB_TOKENISER_BUFFER_HANDLER,
+	HUBBUB_TOKENISER_ERROR_HANDLER,
+	HUBBUB_TOKENISER_CONTENT_MODEL,
+} hubbub_tokeniser_opttype;
+
+/**
+ * Hubbub tokeniser option parameters (params of hubbub_tokeniser_setopt)
+ */
+typedef union hubbub_tokeniser_optparams {
+	struct {
+		hubbub_token_handler handler;	/**< Callback function */
+		void *pw;			/**< Callback private data */
+	} token_handler;
+
+	struct {
+		hubbub_buffer_handler handler;	/**< Callback function */
+		void *pw;			/**< Callback private data */
+	} buffer_handler;
+
+	struct {
+		hubbub_error_handler handler;	/**< Callback function */
+		void *pw;			/**< Callback private data */
+	} error_handler;
+
+	struct {
+		hubbub_content_model model;	/**< Content model value */
+	} content_model;
+} hubbub_tokeniser_optparams;
+
+/* Create a hubbub tokeniser */
+hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
+		hubbub_alloc alloc, void *pw);
+/* Destroy a hubbub tokeniser */
+void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser);
+
+/* Configure a hubbub tokeniser */
+hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
+		hubbub_tokeniser_opttype type,
+		hubbub_tokeniser_optparams *params);
+
+/* Process remaining data in the input stream */
+hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser);
+
+#endif
+
diff --git a/src/utils/Makefile b/src/utils/Makefile
new file mode 100644
index 0000000..59b5512
--- /dev/null
+++ b/src/utils/Makefile
@@ -0,0 +1,53 @@
+# Makefile for libhubbub
+#
+# Toolchain is exported by top-level makefile
+#
+# Top-level makefile also exports the following variables:
+#
+# COMPONENT  Name of component
+# EXPORT     Absolute path of export directory
+# TOP        Absolute path of source tree root
+#
+# The top-level makefile requires the following targets to exist:
+#
+# clean      Clean source tree
+# debug      Create a debug binary
+# distclean  Fully clean source tree, back to pristine condition
+# export     Export distributable components to ${EXPORT}
+# release    Create a release binary
+# setup      Perform any setup required prior to compilation
+# test       Execute any test cases
+
+# Manipulate include paths (adds make's current working directory)
+CFLAGS += -I$(CURDIR)
+
+# Objects (basenames only; output directory and .o suffix are added below)
+OBJS = dict errors utf8
+
+.PHONY: clean debug distclean export release setup test
+
+# Targets
+release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS)))
+
+debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS)))
+
+clean:
+	-@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS}))
+	-@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS}))
+
+distclean:
+
+setup:
+
+export:
+
+test:
+
+# Pattern rules (release objects get -DNDEBUG; debug objects get -g)
+../Release/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c ${CFLAGS} -DNDEBUG -o $@ $<
+
+../Debug/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c -g ${CFLAGS} -o $@ $<
diff --git a/src/utils/dict.c b/src/utils/dict.c
new file mode 100644
index 0000000..f50ffab
--- /dev/null
+++ b/src/utils/dict.c
@@ -0,0 +1,219 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdbool.h>
+
+#include "utils/dict.h"
+
+/** Node in a dictionary tree (a ternary search tree keyed on bytes) */
+typedef struct hubbub_dict_node {
+	uint8_t split;			/**< Data to split on */
+	struct hubbub_dict_node *lt;	/**< Subtree for data less than
+					 * split */
+	struct hubbub_dict_node *eq;	/**< Subtree for data equal to split
+					 * If split == '\0', this stores the
+					 * pointer to the actual data, not a
+					 * subtree */
+	struct hubbub_dict_node *gt;	/**< Subtree for data greater than
+					 * split */
+} hubbub_dict_node;
+
+/** Dictionary object */
+struct hubbub_dict {
+	hubbub_dict_node *dict;		/**< Root of tree */
+
+	hubbub_alloc alloc;		/**< Memory (de)allocation function */
+	void *pw;			/**< Pointer to client data */
+};
+
+static void hubbub_dict_destroy_internal(hubbub_dict *dict,
+		hubbub_dict_node *root);
+static hubbub_error hubbub_dict_insert_internal(hubbub_dict *dict,
+		hubbub_dict_node **slot, const char *key,
+		const void *value);
+
+
+/**
+ * Create a dictionary
+ *
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return Pointer to dictionary instance, or NULL on error
+ */
+hubbub_dict *hubbub_dict_create(hubbub_alloc alloc, void *pw)
+{
+	hubbub_dict *dict;
+
+	if (alloc == NULL)
+		return NULL;
+
+	dict = alloc(NULL, sizeof(hubbub_dict), pw);
+	if (dict == NULL)
+		return NULL;
+
+	dict->dict = NULL;
+
+	dict->alloc = alloc;
+	dict->pw = pw;
+
+	return dict;
+}
+
+/**
+ * Destroy a dictionary
+ *
+ * \param dict  Dictionary to destroy (NULL is a no-op; values aren't freed)
+ */
+void hubbub_dict_destroy(hubbub_dict *dict)
+{
+	if (dict == NULL)
+		return;
+
+	hubbub_dict_destroy_internal(dict, dict->dict);
+
+	dict->alloc(dict, 0, dict->pw);
+}
+
+/**
+ * Helper routine for dictionary destruction
+ *
+ * \param dict  Dictionary being destroyed
+ * \param root  Root node of dictionary (sub)tree to destroy
+ */
+void hubbub_dict_destroy_internal(hubbub_dict *dict, hubbub_dict_node *root)
+{
+	if (root == NULL)
+		return;
+
+	hubbub_dict_destroy_internal(dict, root->lt);
+	if (root->split != '\0')
+		hubbub_dict_destroy_internal(dict, root->eq);
+	hubbub_dict_destroy_internal(dict, root->gt);
+
+	dict->alloc(root, 0, dict->pw);
+}
+
+/**
+ * Insert a key-value pair into a dictionary
+ *
+ * \param dict   Dictionary to insert into
+ * \param key    Key string
+ * \param value  Value to associate with key (may be NULL)
+ * \return HUBBUB_OK on success, HUBBUB_BADPARM if dict or key is NULL,
+ *         HUBBUB_NOMEM if a tree node could not be allocated
+ */
+hubbub_error hubbub_dict_insert(hubbub_dict *dict, const char *key,
+		const void *value)
+{
+	if (dict == NULL || key == NULL)
+		return HUBBUB_BADPARM;
+
+	return hubbub_dict_insert_internal(dict, &dict->dict, key, value);
+}
+
+/**
+ * Helper routine for insertion into dictionary
+ *
+ * \param dict   Dictionary being inserted into
+ * \param slot   Location of pointer to root of (sub)tree to insert into
+ * \param key    Key string
+ * \param value  Value to associate with key
+ * \return HUBBUB_OK on success, HUBBUB_NOMEM on memory exhaustion (any
+ *         nodes already created stay linked in and are freed on destroy)
+ */
+hubbub_error hubbub_dict_insert_internal(hubbub_dict *dict,
+		hubbub_dict_node **slot, const char *key, const void *value)
+{
+	for (;;) {
+		hubbub_dict_node *node = *slot;
+		uint8_t c = (uint8_t) key[0];
+
+		if (node == NULL) {
+			node = dict->alloc(NULL, sizeof(*node), dict->pw);
+			if (node == NULL)
+				return HUBBUB_NOMEM;
+			node->split = c;
+			node->lt = node->eq = node->gt = NULL;
+			*slot = node;
+		}
+
+		if (c < node->split) {
+			slot = &node->lt;
+		} else if (c > node->split) {
+			slot = &node->gt;
+		} else if (c == '\0') {
+			/* Terminator node: eq holds the value itself */
+			node->eq = (hubbub_dict_node *) value;
+			return HUBBUB_OK;
+		} else {
+			slot = &node->eq;
+			key++;
+		}
+	}
+}
+
+/**
+ * Step-wise search for a key in a dictionary
+ *
+ * \param dict     Dictionary to search
+ * \param c        Character to look for
+ * \param result   Pointer to location for result
+ * \param context  Pointer to location for search context
+ * \return HUBBUB_OK if key found,
+ *         HUBBUB_NEEDDATA if more steps are required
+ *         HUBBUB_INVALID if nothing matches (BADPARM on NULL arguments)
+ *
+ * The value pointed to by ::context must be NULL for the first call.
+ * Thereafter, pass in the same value as returned by the previous call.
+ * The context is opaque to the caller and should not be inspected.
+ *
+ * The location pointed to by ::result will be set to NULL unless a match
+ * is found, in which case it receives the value stored for the key. A
+ * match does not end the search: further steps may find a longer key
+ * (unless c was '\0', which resets the context).
+ */
+hubbub_error hubbub_dict_search_step(hubbub_dict *dict, uint8_t c,
+		const void **result, void **context)
+{
+	bool match = false;
+	hubbub_dict_node *p, *term;
+
+	if (dict == NULL || result == NULL || context == NULL)
+		return HUBBUB_BADPARM;
+
+	*result = NULL;
+
+	p = (*context == NULL) ? dict->dict : (hubbub_dict_node *) *context;
+
+	/* Find the node for c amongst the alternatives at this level */
+	while (p != NULL && c != p->split)
+		p = (c < p->split) ? p->lt : p->gt;
+
+	if (p != NULL && c == '\0') {
+		/* Explicit terminator: p->eq holds the value, not a subtree */
+		match = true;
+		*result = (const void *) p->eq;
+		p = NULL;
+	} else if (p != NULL) {
+		/* Descend; the bytes consumed so far form a complete key if
+		 * the next level contains a terminator node. As '\0' sorts
+		 * before every other byte, it can only lie on the lt chain. */
+		p = p->eq;
+		for (term = p; term != NULL; term = term->lt) {
+			if (term->split == '\0') {
+				match = true;
+				*result = (const void *) term->eq;
+				break;
+			}
+		}
+	}
+
+	*context = (void *) p;
+
+	return (match) ? HUBBUB_OK :
+			(p == NULL) ? HUBBUB_INVALID : HUBBUB_NEEDDATA;
+}
diff --git a/src/utils/dict.h b/src/utils/dict.h
new file mode 100644
index 0000000..2cde01d
--- /dev/null
+++ b/src/utils/dict.h
@@ -0,0 +1,31 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_utils_dict_h_
+#define hubbub_utils_dict_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/hubbub.h>
+
+typedef struct hubbub_dict hubbub_dict;
+
+/* Create a dictionary */
+hubbub_dict *hubbub_dict_create(hubbub_alloc alloc, void *pw);
+/* Destroy a dictionary */
+void hubbub_dict_destroy(hubbub_dict *dict);
+
+/* Insert a key-value pair into a dictionary */
+hubbub_error hubbub_dict_insert(hubbub_dict *dict, const char *key,
+		const void *value);
+
+/* Step-wise search for a key in a dictionary */
+hubbub_error hubbub_dict_search_step(hubbub_dict *dict, uint8_t c,
+		const void **result, void **context);
+
+#endif
diff --git a/src/utils/errors.c b/src/utils/errors.c
new file mode 100644
index 0000000..e57ba6a
--- /dev/null
+++ b/src/utils/errors.c
@@ -0,0 +1,70 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include <hubbub/errors.h>
+
+/**
+ * Convert a hubbub error code to a string
+ *
+ * The descriptions are string constants, so the result must be neither
+ * modified nor freed by the caller.
+ *
+ * No description exists for codes that the switch below does not list.
+ *
+ * \param error  The error code to convert
+ * \return Pointer to string representation of error, or NULL if unknown.
+ */
+const char *hubbub_error_to_string(hubbub_error error)
+{
+	/* Each recognised code returns its description directly from
+	 * the matching case; a code matching none of them falls out of
+	 * the switch and is reported as unknown */
+	switch (error) {
+	case HUBBUB_OK:
+		return "No error";
+	case HUBBUB_NOMEM:
+		return "Insufficient memory";
+	case HUBBUB_BADPARM:
+		return "Bad parameter";
+	case HUBBUB_INVALID:
+		return "Invalid input";
+	case HUBBUB_FILENOTFOUND:
+		return "File not found";
+	case HUBBUB_NEEDDATA:
+		return "Insufficient data";
+	}
+
+	return NULL;
+}
+
+/**
+ * Convert a string representation of an error name to a hubbub error code
+ *
+ * \param str  String containing error name (need not be NUL terminated)
+ * \param len  Length of string (bytes); must match a name exactly
+ * \return Hubbub error code, or HUBBUB_OK if unknown (or str is NULL)
+ */
+hubbub_error hubbub_error_from_string(const char *str, size_t len)
+{
+#define NAME_IS(s) (len == sizeof(s) - 1 && strncmp(str, s, len) == 0)
+	if (str == NULL || NAME_IS("HUBBUB_OK"))
+		return HUBBUB_OK;
+	if (NAME_IS("HUBBUB_NOMEM"))
+		return HUBBUB_NOMEM;
+	if (NAME_IS("HUBBUB_BADPARM"))
+		return HUBBUB_BADPARM;
+	if (NAME_IS("HUBBUB_INVALID"))
+		return HUBBUB_INVALID;
+	if (NAME_IS("HUBBUB_FILENOTFOUND"))
+		return HUBBUB_FILENOTFOUND;
+	if (NAME_IS("HUBBUB_NEEDDATA"))
+		return HUBBUB_NEEDDATA;
+#undef NAME_IS
+	return HUBBUB_OK;
+}
diff --git a/src/utils/utf8.c b/src/utils/utf8.c
new file mode 100644
index 0000000..062d629
--- /dev/null
+++ b/src/utils/utf8.c
@@ -0,0 +1,368 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "utils/utf8.h"
+
+/** Number of continuation bytes for a given start byte */
+static const uint8_t numContinuations[256] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
+};
+
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * \param s     The sequence to process
+ * \param len   Length of sequence (bytes available at s)
+ * \param ucs4  Pointer to location to receive UCS4 character (host endian)
+ * \param clen  Pointer to location to receive byte length of UTF-8 sequence
+ * \return HUBBUB_OK on success, HUBBUB_NEEDDATA if truncated, else error
+ */
+inline hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen)
+{
+	if (s == NULL || ucs4 == NULL || clen == NULL)
+		return HUBBUB_BADPARM;
+
+	if (len == 0)
+		return HUBBUB_NEEDDATA;
+
+	if (*s < 0x80) {
+		*ucs4 = *s;
+		*clen = 1;
+	} else if ((*s & 0xE0) == 0xC0) {
+		if (len < 2)
+			return HUBBUB_NEEDDATA;
+		else if ((*(s+1) & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			*ucs4 = ((*s & 0x1F) << 6) | (*(s+1) & 0x3F);
+			*clen = 2;
+		}
+	} else if ((*s & 0xF0) == 0xE0) {
+		if (len < 3)
+			return HUBBUB_NEEDDATA;
+		else if ((*(s+1) & 0xC0) != 0x80 ||
+				(*(s+2) & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			*ucs4 = ((*s & 0x0F) << 12) |
+				((*(s+1) & 0x3F) << 6) |
+				(*(s+2) & 0x3F);
+			*clen = 3;
+		}
+	} else if ((*s & 0xF8) == 0xF0) {
+		if (len < 4)
+			return HUBBUB_NEEDDATA;
+		else if ((*(s+1) & 0xC0) != 0x80 ||
+				(*(s+2) & 0xC0) != 0x80 ||
+				(*(s+3) & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			*ucs4 = ((*s & 0x07) << 18) |
+				((*(s+1) & 0x3F) << 12) |
+				((*(s+2) & 0x3F) << 6) |
+				(*(s+3) & 0x3F);
+			*clen = 4;
+		}
+	} else if ((*s & 0xFC) == 0xF8) {
+		if (len < 5)
+			return HUBBUB_NEEDDATA;
+		else if ((*(s+1) & 0xC0) != 0x80 ||
+				(*(s+2) & 0xC0) != 0x80 ||
+				(*(s+3) & 0xC0) != 0x80 ||
+				(*(s+4) & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			*ucs4 = ((*s & 0x03) << 24) | /* 2 payload bits */
+				((*(s+1) & 0x3F) << 18) |
+				((*(s+2) & 0x3F) << 12) |
+				((*(s+3) & 0x3F) << 6) |
+				(*(s+4) & 0x3F);
+			*clen = 5;
+		}
+	} else if ((*s & 0xFE) == 0xFC) {
+		if (len < 6)
+			return HUBBUB_NEEDDATA;
+		else if ((*(s+1) & 0xC0) != 0x80 ||
+				(*(s+2) & 0xC0) != 0x80 ||
+				(*(s+3) & 0xC0) != 0x80 ||
+				(*(s+4) & 0xC0) != 0x80 ||
+				(*(s+5) & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			*ucs4 = ((*s & 0x01) << 30) | /* 1 payload bit */
+				((*(s+1) & 0x3F) << 24) |
+				((*(s+2) & 0x3F) << 18) |
+				((*(s+3) & 0x3F) << 12) |
+				((*(s+4) & 0x3F) << 6) |
+				(*(s+5) & 0x3F);
+			*clen = 6;
+		}
+	} else {
+		return HUBBUB_INVALID;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * The shortest sequence able to represent the character is produced.
+ *
+ * \param ucs4  The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s     Pointer to 6 byte long output buffer
+ * \param len   Pointer to location to receive length of multibyte sequence
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+inline hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len)
+{
+	uint8_t lead;	/* Marker bits of the start byte */
+	uint32_t n;	/* Total bytes in the sequence */
+	uint32_t i;
+
+	if (s == NULL || len == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Choose the shortest encoding able to hold the value */
+	if (ucs4 < 0x80) {
+		lead = 0x00;
+		n = 1;
+	} else if (ucs4 < 0x800) {
+		lead = 0xC0;
+		n = 2;
+	} else if (ucs4 < 0x10000) {
+		lead = 0xE0;
+		n = 3;
+	} else if (ucs4 < 0x200000) {
+		lead = 0xF0;
+		n = 4;
+	} else if (ucs4 < 0x4000000) {
+		lead = 0xF8;
+		n = 5;
+	} else if (ucs4 <= 0x7FFFFFFF) {
+		lead = 0xFC;
+		n = 6;
+	} else {
+		return HUBBUB_INVALID;
+	}
+
+	/* Fill in continuation bytes from the end, six bits apiece */
+	for (i = n - 1; i > 0; i--) {
+		s[i] = 0x80 | (ucs4 & 0x3F);
+		ucs4 >>= 6;
+	}
+
+	/* Whatever remains fits in the start byte's payload bits */
+	s[0] = lead | ucs4;
+
+	*len = n;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * \param s    The string
+ * \param max  Maximum length (bytes)
+ * \param len  Pointer to location to receive length of string
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+inline hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	const uint8_t *end;
+	size_t l = 0;
+
+	if (s == NULL || len == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Only form the end pointer once s is known to be non-NULL */
+	for (end = s + max; s < end; l++) {
+		if ((*s & 0x80) == 0x00)
+			s += 1;
+		else if ((*s & 0xE0) == 0xC0)
+			s += 2;
+		else if ((*s & 0xF0) == 0xE0)
+			s += 3;
+		else if ((*s & 0xF8) == 0xF0)
+			s += 4;
+		else if ((*s & 0xFC) == 0xF8)
+			s += 5;
+		else if ((*s & 0xFE) == 0xFC)
+			s += 6;
+		else
+			return HUBBUB_INVALID;
+	}
+
+	*len = l;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * \param s    Pointer to start of character
+ * \param len  Pointer to location to receive length
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+inline hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	if (s != NULL && len != NULL) {
+		/* One start byte plus however many continuations follow */
+		*len = 1 + numContinuations[*s];
+		return HUBBUB_OK;
+	}
+	return HUBBUB_BADPARM;
+}
+
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+inline hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	if (s == NULL || prevoff == NULL)
+		return HUBBUB_BADPARM;
+
+	while (off > 0) {	/* Back up to the nearest non-continuation */
+		if ((s[--off] & 0xC0) != 0x80)
+			break;
+	}
+	*prevoff = off;
+	return HUBBUB_OK;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+inline hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	if (s == NULL || nextoff == NULL || off >= len)
+		return HUBBUB_BADPARM;
+
+	/* Step over a start byte, then over any continuations after it */
+	if ((s[off] & 0xC0) != 0x80)
+		off++;
+	for (; off < len; off++) {
+		if ((s[off] & 0xC0) != 0x80)
+			break;
+	}
+
+	*nextoff = off;
+	return HUBBUB_OK;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return HUBBUB_OK on success, HUBBUB_NEEDDATA if the data ran out
+ */
+inline hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	bool valid;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Skip current start byte (if present - may be mid-sequence) */
+	if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0)
+		off++;
+
+	while (1) {
+		/* Find next possible start byte */
+		while (off < len && (s[off] & 0xC0) == 0x80)
+			off++;
+
+		/* Ran off end of data */
+		if (off == len || off + numContinuations[s[off]] >= len)
+			return HUBBUB_NEEDDATA;
+
+		/* Valid if every expected continuation byte is present (the
+		 * cases fall through deliberately); ASCII needs none */
+		valid = true;
+
+		switch (numContinuations[s[off]]) {
+		case 5:
+			valid &= ((s[off + 5] & 0xC0) == 0x80);
+		case 4:
+			valid &= ((s[off + 4] & 0xC0) == 0x80);
+		case 3:
+			valid &= ((s[off + 3] & 0xC0) == 0x80);
+		case 2:
+			valid &= ((s[off + 2] & 0xC0) == 0x80);
+		case 1:
+			valid &= ((s[off + 1] & 0xC0) == 0x80);
+		case 0:
+			break;	/* Nothing further to check */
+		}
+
+		if (valid)
+			break;
+
+		/* Otherwise, skip this (invalid) start byte and try again */
+		off++;
+	}
+
+	*nextoff = off;
+
+	return HUBBUB_OK;
+}
+
diff --git a/src/utils/utf8.h b/src/utils/utf8.h
new file mode 100644
index 0000000..8836338
--- /dev/null
+++ b/src/utils/utf8.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (interface).
+ */
+
+#ifndef hubbub_utils_utf8_h_
+#define hubbub_utils_utf8_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+
+hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen);
+hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len);
+
+hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max,
+		size_t *len);
+hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s,
+		size_t *len);
+
+hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff);
+hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff);
+
+hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff);
+
+#endif
+
diff --git a/src/utils/utils.h b/src/utils/utils.h
new file mode 100644
index 0000000..a1e0230
--- /dev/null
+++ b/src/utils/utils.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_utils_h_
+#define hubbub_utils_h_
+
+#ifndef max
+#define max(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#ifndef min
+#define min(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef SLEN
+/* Calculate length of a string constant */
+#define SLEN(s) (sizeof((s)) - 1) /* -1 for '\0' */
+#endif
+
+#ifndef UNUSED
+#define UNUSED(x) ((void) (x))	/* Evaluate x and discard the result */
+#endif
+
+#endif
-- 
cgit v1.2.3