From 2777a04ed2ba4fd36138b991d66a32a283361f7e Mon Sep 17 00:00:00 2001
From: John Mark Bell <jmb@netsurf-browser.org>
Date: Thu, 1 May 2008 16:34:46 +0000
Subject: Import parser construction utility library

svn path=/trunk/libparserutils/; revision=4111
---
 COPYING                                 |  19 +
 Makefile                                |  43 ++
 Makefile-riscos                         |  46 +++
 README                                  |  44 ++
 build/Makefile.common                   | 129 ++++++
 build/Makefile.config                   |   8 +
 include/parserutils/charset/codec.h     | 114 ++++++
 include/parserutils/charset/mibenum.h   |  24 ++
 include/parserutils/charset/utf16.h     |  38 ++
 include/parserutils/charset/utf8.h      |  38 ++
 include/parserutils/errors.h            |  29 ++
 include/parserutils/functypes.h         |  21 +
 include/parserutils/input/inputstream.h | 143 +++++++
 include/parserutils/parserutils.h       |  23 ++
 include/parserutils/types.h             |  15 +
 include/parserutils/utils/buffer.h      |  39 ++
 libparserutils.pc.in                    |  10 +
 src/Makefile                            |  49 +++
 src/charset/Makefile                    |  49 +++
 src/charset/aliases.c                   | 410 +++++++++++++++++++
 src/charset/aliases.h                   |  36 ++
 src/charset/charset.c                   |  54 +++
 src/charset/charset.h                   |  24 ++
 src/charset/codec.c                     | 185 +++++++++
 src/charset/codecs/Makefile             |  46 +++
 src/charset/codecs/codec_iconv.c        | 683 ++++++++++++++++++++++++++++++++
 src/charset/codecs/codec_impl.h         |  48 +++
 src/charset/codecs/codec_utf16.c        | 544 +++++++++++++++++++++++++
 src/charset/codecs/codec_utf8.c         | 546 +++++++++++++++++++++++++
 src/charset/encodings/Makefile          |  46 +++
 src/charset/encodings/utf16.c           | 239 +++++++++++
 src/charset/encodings/utf8.c            | 175 ++++++++
 src/charset/encodings/utf8impl.h        | 339 ++++++++++++++++
 src/input/Makefile                      |  46 +++
 src/input/filter.c                      | 384 ++++++++++++++++++
 src/input/filter.h                      |  57 +++
 src/input/inputstream.c                 | 477 ++++++++++++++++++++++
 src/parserutils.c                       |  54 +++
 src/utils/Makefile                      |  49 +++
 src/utils/buffer.c                      | 156 ++++++++
 src/utils/errors.c                      |  70 ++++
 src/utils/utils.h                       |  28 ++
 test/INDEX                              |  15 +
 test/Makefile                           |  80 ++++
 test/README                             |  84 ++++
 test/aliases.c                          |  62 +++
 test/charset.c                          |  31 ++
 test/cscodec.c                          | 232 +++++++++++
 test/data/Aliases                       | 302 ++++++++++++++
 test/data/cscodec/INDEX                 |   6 +
 test/data/cscodec/UTF-8-test.txt        | Bin 0 -> 41013 bytes
 test/data/cscodec/simple.dat            | Bin 0 -> 1109 bytes
 test/data/input/INDEX                   |   5 +
 test/data/input/UTF-8-test.txt          | Bin 0 -> 20334 bytes
 test/filter.c                           | 357 +++++++++++++++++
 test/inputstream.c                      |  97 +++++
 test/parserutils.c                      |  30 ++
 test/regression/cscodec-segv.c          |  38 ++
 test/regression/filter-segv.c           |  39 ++
 test/regression/stream-nomem.c          |  94 +++++
 test/testrunner.pl                      | 167 ++++++++
 test/testutils.h                        | 123 ++++++
 62 files changed, 7339 insertions(+)
 create mode 100644 COPYING
 create mode 100644 Makefile
 create mode 100644 Makefile-riscos
 create mode 100644 README
 create mode 100644 build/Makefile.common
 create mode 100644 build/Makefile.config
 create mode 100644 include/parserutils/charset/codec.h
 create mode 100644 include/parserutils/charset/mibenum.h
 create mode 100644 include/parserutils/charset/utf16.h
 create mode 100644 include/parserutils/charset/utf8.h
 create mode 100644 include/parserutils/errors.h
 create mode 100644 include/parserutils/functypes.h
 create mode 100644 include/parserutils/input/inputstream.h
 create mode 100644 include/parserutils/parserutils.h
 create mode 100644 include/parserutils/types.h
 create mode 100644 include/parserutils/utils/buffer.h
 create mode 100644 libparserutils.pc.in
 create mode 100644 src/Makefile
 create mode 100644 src/charset/Makefile
 create mode 100644 src/charset/aliases.c
 create mode 100644 src/charset/aliases.h
 create mode 100644 src/charset/charset.c
 create mode 100644 src/charset/charset.h
 create mode 100644 src/charset/codec.c
 create mode 100644 src/charset/codecs/Makefile
 create mode 100644 src/charset/codecs/codec_iconv.c
 create mode 100644 src/charset/codecs/codec_impl.h
 create mode 100644 src/charset/codecs/codec_utf16.c
 create mode 100644 src/charset/codecs/codec_utf8.c
 create mode 100644 src/charset/encodings/Makefile
 create mode 100644 src/charset/encodings/utf16.c
 create mode 100644 src/charset/encodings/utf8.c
 create mode 100644 src/charset/encodings/utf8impl.h
 create mode 100644 src/input/Makefile
 create mode 100644 src/input/filter.c
 create mode 100644 src/input/filter.h
 create mode 100644 src/input/inputstream.c
 create mode 100644 src/parserutils.c
 create mode 100644 src/utils/Makefile
 create mode 100644 src/utils/buffer.c
 create mode 100644 src/utils/errors.c
 create mode 100644 src/utils/utils.h
 create mode 100644 test/INDEX
 create mode 100644 test/Makefile
 create mode 100644 test/README
 create mode 100644 test/aliases.c
 create mode 100644 test/charset.c
 create mode 100644 test/cscodec.c
 create mode 100644 test/data/Aliases
 create mode 100644 test/data/cscodec/INDEX
 create mode 100644 test/data/cscodec/UTF-8-test.txt
 create mode 100644 test/data/cscodec/simple.dat
 create mode 100644 test/data/input/INDEX
 create mode 100644 test/data/input/UTF-8-test.txt
 create mode 100644 test/filter.c
 create mode 100644 test/inputstream.c
 create mode 100644 test/parserutils.c
 create mode 100644 test/regression/cscodec-segv.c
 create mode 100644 test/regression/filter-segv.c
 create mode 100644 test/regression/stream-nomem.c
 create mode 100644 test/testrunner.pl
 create mode 100644 test/testutils.h

diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..0f8d92b
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,19 @@
+Copyright (C) 2007-8 J-M Bell
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+  * The above copyright notice and this permission notice shall be included in
+    all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e4de9b9
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,43 @@
+# Toolchain definitions for building on the destination platform
+CC := gcc
+AR := ar
+LD := gcc
+
+CP := cp
+RM := rm
+MKDIR := mkdir
+MV := mv
+ECHO := echo
+MAKE := make
+PERL := perl
+PKGCONFIG := pkg-config
+INSTALL := install
+SED := sed
+LCOV := lcov
+GENHTML := genhtml
+
+# Toolchain flags
+WARNFLAGS := -Wall -Wextra -Wundef -Wpointer-arith -Wcast-align \
+	-Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes \
+	-Wmissing-declarations -Wnested-externs -Werror -pedantic
+override CFLAGS += -std=c99 -D_BSD_SOURCE -I$(TOP)/include/ $(WARNFLAGS)
+RELEASECFLAGS = $(CFLAGS) -DNDEBUG -O2
+DEBUGCFLAGS = $(CFLAGS) -O0 -g
+ARFLAGS := -cru
+override LDFLAGS += -L$(TOP)/
+
+CPFLAGS :=
+RMFLAGS := -f
+MKDIRFLAGS := -p
+MVFLAGS :=
+ECHOFLAGS := 
+MAKEFLAGS :=
+PKGCONFIGFLAGS :=
+
+EXEEXT :=
+
+# Default installation prefix
+PREFIX ?= /usr/local
+
+
+include build/Makefile.common
diff --git a/Makefile-riscos b/Makefile-riscos
new file mode 100644
index 0000000..c9fef3c
--- /dev/null
+++ b/Makefile-riscos
@@ -0,0 +1,46 @@
+# Toolchain definitions for building for RISC OS using the GCCSDK cross-compiler
+GCCSDK_INSTALL_CROSSBIN ?= /home/riscos/cross/bin
+GCCSDK_INSTALL_ENV ?= /home/riscos/env
+
+CC := $(GCCSDK_INSTALL_CROSSBIN)/gcc
+AR := $(GCCSDK_INSTALL_CROSSBIN)/ar
+LD := $(GCCSDK_INSTALL_CROSSBIN)/gcc
+
+CP := cp
+RM := rm
+MKDIR := mkdir
+MV := mv
+ECHO := echo
+MAKE := make
+PERL := perl
+PKGCONFIG := pkg-config
+INSTALL := install
+SED := sed
+LCOV := echo
+GENHTML := echo
+
+# Toolchain flags
+WARNFLAGS := -Wall -Wextra -Wundef -Wpointer-arith -Wcast-align \
+	-Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes \
+	-Wmissing-declarations -Wnested-externs -Werror -pedantic
+CFLAGS += -std=c99 -D_BSD_SOURCE -I$(TOP)/include/ $(WARNFLAGS) \
+	-mpoke-function-name
+RELEASECFLAGS = $(CFLAGS) -DNDEBUG -O2
+DEBUGCFLAGS = $(CFLAGS) -O0 -g
+ARFLAGS := -cru
+LDFLAGS = -L$(TOP)/
+
+CPFLAGS :=
+RMFLAGS := -f
+MKDIRFLAGS := -p
+MVFLAGS := 
+ECHOFLAGS := 
+MAKEFLAGS :=
+PKGCONFIGFLAGS :=
+
+EXEEXT := ,ff8
+
+# Default installation prefix
+PREFIX ?= $(GCCSDK_INSTALL_ENV)
+
+include build/Makefile.common
diff --git a/README b/README
new file mode 100644
index 0000000..72041c0
--- /dev/null
+++ b/README
@@ -0,0 +1,44 @@
+LibParserUtils -- a utility library for parser building
+=======================================================
+
+Overview
+--------
+
+  LibParserUtils provides various pieces of functionality that are useful
+  when writing parsers.
+
+Requirements
+------------
+
+  LibParserUtils requires the following tools:
+
+    + A C99 capable C compiler
+    + GNU make or compatible
+    + Perl (for the testcases)
+    + Pkg-config (for the testcases)
+
+  For enhanced charset support, LibParserUtils may also be configured to use
+  an iconv() implementation.
+
+Compilation
+-----------
+
+  If necessary, modify the toolchain settings in the Makefile.
+  Invoke make:
+  		$ make
+
+Verification
+------------
+
+  To verify that the library is working, it is necessary to specify a
+  different makefile target than that used for normal compilation, thus:
+  
+  		$ make test
+
+API documentation
+-----------------
+
+  Currently, there is none. However, the code is well commented and the 
+  public API may be found in the "include" directory. The testcase sources 
+  may also be of use in working out how to use it.
+
diff --git a/build/Makefile.common b/build/Makefile.common
new file mode 100644
index 0000000..418a5a8
--- /dev/null
+++ b/build/Makefile.common
@@ -0,0 +1,129 @@
+# Top-level Makefile fragment
+
+# Default target
+all: release
+
+# Name of component
+COMPONENT := libparserutils
+
+# Environment
+EXPORT := $(CURDIR)/dist
+TOP := $(CURDIR)
+RELEASEDIR := build/Release
+DEBUGDIR := build/Debug
+COVERAGEDIR := build/coverage
+
+# List of items to delete on clean
+ITEMS_CLEAN :=
+# List of items to delete on distclean
+ITEMS_DISTCLEAN :=
+
+# List of targets to run for testing
+TARGET_TESTS :=
+
+# Source files
+SOURCES :=
+
+# Include configuration Makefile fragment
+include build/Makefile.config
+
+# Include Makefile fragments in subdirectories
+
+define do_include
+DIR := $$(dir $(1))
+include $(1)
+
+endef
+
+MAKE_INCLUDES := $(wildcard */Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Calculate objects to build
+OBJECTS := $(subst /,_,$(subst .c,.o,$(SOURCES)))
+
+.PHONY: release debug test coverage profile \
+	clean distclean setup export install uninstall
+
+# Rules
+release: setup $(addprefix $(RELEASEDIR)/,$(OBJECTS))
+	@$(AR) $(ARFLAGS) $(COMPONENT).a $(RELEASEDIR)/*
+
+debug: setup $(addprefix $(DEBUGDIR)/,$(OBJECTS))
+	@$(AR) $(ARFLAGS) $(COMPONENT)-debug.a $(DEBUGDIR)/*
+
+test: debug $(TARGET_TESTS)
+
+coverage: clean
+	@$(LCOV) --directory . --zerocounters
+	@$(MAKE) test CFLAGS="$(CFLAGS) -fprofile-arcs -ftest-coverage" \
+		LDFLAGS="$(LDFLAGS) -lgcov"
+	@$(LCOV) --directory $(DEBUGDIR) --base-directory $(TOP) \
+		--capture --output-file $(COVERAGEDIR)/$(COMPONENT)_tmp.info
+	@$(LCOV) --extract $(COVERAGEDIR)/$(COMPONENT)_tmp.info "$(TOP)/src*" \
+		-o $(COVERAGEDIR)/$(COMPONENT).info
+	@$(RM) $(RMFLAGS) $(COVERAGEDIR)/$(COMPONENT)_tmp.info
+	@$(GENHTML) -o $(COVERAGEDIR) --num-spaces 2 \
+		$(COVERAGEDIR)/$(COMPONENT).info
+
+profile: clean
+	@$(MAKE) test CFLAGS="$(CFLAGS) -pg" LDFLAGS="-pg $(LDFLAGS)"
+
+clean:
+	-@$(RM) $(RMFLAGS) $(ITEMS_CLEAN)
+	-@$(RM) $(RMFLAGS) gmon.out
+	-@$(RM) $(RMFLAGS) -r $(COVERAGEDIR)
+	-@$(RM) $(RMFLAGS) -r $(RELEASEDIR)
+	-@$(RM) $(RMFLAGS) -r $(DEBUGDIR)
+	-@$(RM) $(RMFLAGS) $(COMPONENT).a
+	-@$(RM) $(RMFLAGS) $(COMPONENT)-debug.a
+	-@$(RM) $(RMFLAGS) $(COMPONENT).pc
+
+distclean: clean
+	-@$(RM) $(RMFLAGS) $(ITEMS_DISTCLEAN)
+	-@$(RM) $(RMFLAGS) -r $(TOP)/dist
+
+setup:
+	@$(MKDIR) $(MKDIRFLAGS) $(RELEASEDIR)
+	@$(MKDIR) $(MKDIRFLAGS) $(DEBUGDIR)
+	@$(MKDIR) $(MKDIRFLAGS) $(COVERAGEDIR)
+
+export: release
+	@$(MKDIR) $(MKDIRFLAGS) $(TOP)/dist/lib
+	@$(CP) $(CPFLAGS) -r include $(EXPORT)/
+	@${CP} ${CPFLAGS} $(COMPONENT).a ${EXPORT}/lib/
+
+install: release
+	@$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/lib/pkgconfig
+	@$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/include/parserutils
+	@$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/include/parserutils/charset
+	@$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/include/parserutils/input
+	@$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/include/parserutils/utils
+	@$(SED) -e 's#PREFIX#$(PREFIX)#' $(COMPONENT).pc.in >$(COMPONENT).pc
+	@$(INSTALL) --mode=644 -t $(PREFIX)/lib $(COMPONENT).a
+	@$(INSTALL) --mode=644 -t $(PREFIX)/lib/pkgconfig $(COMPONENT).pc
+	@$(INSTALL) --mode=644 -t $(PREFIX)/include/parserutils $(filter %.h, $(wildcard include/parserutils/*))
+	@$(INSTALL) --mode=644 -t $(PREFIX)/include/parserutils/charset $(filter %.h, $(wildcard include/parserutils/charset/*))
+	@$(INSTALL) --mode=644 -t $(PREFIX)/include/parserutils/input $(filter %.h, $(wildcard include/parserutils/input/*))
+	@$(INSTALL) --mode=644 -t $(PREFIX)/include/parserutils/utils $(filter %.h, $(wildcard include/parserutils/utils/*))
+
+
+uninstall:
+	@$(RM) $(RMFLAGS) $(PREFIX)/lib/$(COMPONENT).a
+	@$(RM) $(RMFLAGS) $(PREFIX)/lib/pkgconfig/$(COMPONENT).pc
+	@$(RM) $(RMFLAGS) -r $(PREFIX)/include/parserutils
+
+# Finally, build rules for compilation
+define do_compile
+$$(RELEASEDIR)/$(2): $(1)
+	@$$(ECHO) $$(ECHOFLAGS) "==> $(1)"
+	@$$(CC) -c $$(RELEASECFLAGS) -o $$@ $(1)
+
+$$(DEBUGDIR)/$(2): $(1)
+	@$$(ECHO) $$(ECHOFLAGS) "==> $(1)"
+	@$$(CC) -c $$(DEBUGCFLAGS) -o $$@ $(1)
+
+endef
+
+$(eval $(foreach SOURCE,$(filter %.c,$(SOURCES)), \
+	$(call do_compile,$(SOURCE),$(subst /,_,$(SOURCE:.c=.o)))))
+
diff --git a/build/Makefile.config b/build/Makefile.config
new file mode 100644
index 0000000..b6560c1
--- /dev/null
+++ b/build/Makefile.config
@@ -0,0 +1,8 @@
+# Configuration Makefile fragment
+
+# Build the iconv codec
+# override CFLAGS += -DWITH_ICONV_CODEC
+
+# Use iconv directly in the input filter
+# override CFLAGS += -DWITH_ICONV_FILTER
+
diff --git a/include/parserutils/charset/codec.h b/include/parserutils/charset/codec.h
new file mode 100644
index 0000000..ca98db5
--- /dev/null
+++ b/include/parserutils/charset/codec.h
@@ -0,0 +1,114 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_codec_h_
+#define parserutils_charset_codec_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+typedef struct parserutils_charset_codec parserutils_charset_codec;
+
+#define PARSERUTILS_CHARSET_CODEC_NULL (0xffffffffU)
+
+/**
+ * Charset codec error mode
+ *
+ * A codec's error mode determines its behaviour in the face of:
+ *
+ * + characters which are unrepresentable in the destination charset (if
+ *   encoding data) or which cannot be converted to UCS4 (if decoding data).
+ * + invalid byte sequences (both encoding and decoding)
+ *
+ * The options provide a choice between the following approaches:
+ *
+ * + draconian, "stop processing" ("strict")
+ * + "replace the unrepresentable character with something else" ("loose")
+ * + "attempt to transliterate, or replace if unable" ("translit")
+ *
+ * The default error mode is "loose".
+ *
+ *
+ * In the "loose" case, the replacement character will depend upon:
+ *
+ * + Whether the operation was encoding or decoding
+ * + If encoding, what the destination charset is.
+ *
+ * If decoding, the replacement character will be:
+ *
+ *     U+FFFD (REPLACEMENT CHARACTER)
+ *
+ * If encoding, the replacement character will be:
+ *
+ *     U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32)
+ *     U+FFFD (REPLACEMENT CHARACTER) otherwise.
+ *
+ *
+ * In the "translit" case, the codec will attempt to transliterate into
+ * the destination charset, if encoding. If decoding, or if transliteration
+ * fails, this option is identical to "loose".
+ */
+typedef enum parserutils_charset_codec_errormode {
+	/** Abort processing if unrepresentable character encountered */
+	PARSERUTILS_CHARSET_CODEC_ERROR_STRICT   = 0,
+	/** Replace unrepresentable characters with single alternate */
+	PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE    = 1,
+	/** Transliterate unrepresentable characters, if possible */
+	PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT = 2,
+} parserutils_charset_codec_errormode;
+
+/**
+ * Charset codec option types
+ */
+typedef enum parserutils_charset_codec_opttype {
+	/** Set codec error mode */
+	PARSERUTILS_CHARSET_CODEC_ERROR_MODE  = 1,
+} parserutils_charset_codec_opttype;
+
+/**
+ * Charset codec option parameters
+ */
+typedef union parserutils_charset_codec_optparams {
+	/** Parameters for error mode setting */
+	struct {
+		/** The desired error handling mode */
+		parserutils_charset_codec_errormode mode;
+	} error_mode;
+} parserutils_charset_codec_optparams;
+
+
+/* Create a charset codec */
+parserutils_charset_codec *parserutils_charset_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw);
+/* Destroy a charset codec */
+void parserutils_charset_codec_destroy(parserutils_charset_codec *codec);
+
+/* Configure a charset codec */
+parserutils_error parserutils_charset_codec_setopt(
+		parserutils_charset_codec *codec,
+		parserutils_charset_codec_opttype type, 
+		parserutils_charset_codec_optparams *params);
+
+/* Encode a chunk of UCS4 data into a codec's charset */
+parserutils_error parserutils_charset_codec_encode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+
+/* Decode a chunk of data in a codec's charset into UCS4 */
+parserutils_error parserutils_charset_codec_decode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+
+/* Reset a charset codec */
+parserutils_error parserutils_charset_codec_reset(
+		parserutils_charset_codec *codec);
+
+#endif
diff --git a/include/parserutils/charset/mibenum.h b/include/parserutils/charset/mibenum.h
new file mode 100644
index 0000000..8b3ac9d
--- /dev/null
+++ b/include/parserutils/charset/mibenum.h
@@ -0,0 +1,24 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_mibenum_h_
+#define parserutils_charset_mibenum_h_
+
+#include <inttypes.h>
+#include <stdbool.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+/* Convert an encoding alias to a MIB enum value */
+uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len);
+/* Convert a MIB enum value into an encoding alias */
+const char *parserutils_charset_mibenum_to_name(uint16_t mibenum);
+/* Determine if a MIB enum value represents a Unicode variant */
+bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum);
+
+#endif
diff --git a/include/parserutils/charset/utf16.h b/include/parserutils/charset/utf16.h
new file mode 100644
index 0000000..6569d6e
--- /dev/null
+++ b/include/parserutils/charset/utf16.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-16 manipulation functions (interface).
+ */
+
+#ifndef parserutils_charset_utf16_h_
+#define parserutils_charset_utf16_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+
+parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, 
+		size_t len, uint32_t *ucs4, size_t *clen);
+parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, 
+		uint8_t *s, size_t *len);
+
+parserutils_error parserutils_charset_utf16_length(const uint8_t *s, 
+		size_t max, size_t *len);
+parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
+		size_t *len);
+
+parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, 
+		uint32_t off, uint32_t *prevoff);
+parserutils_error parserutils_charset_utf16_next(const uint8_t *s, 
+		uint32_t len, uint32_t off, uint32_t *nextoff);
+
+parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
+		uint32_t len, uint32_t off, uint32_t *nextoff);
+
+#endif
+
diff --git a/include/parserutils/charset/utf8.h b/include/parserutils/charset/utf8.h
new file mode 100644
index 0000000..16e012e
--- /dev/null
+++ b/include/parserutils/charset/utf8.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (interface).
+ */
+
+#ifndef parserutils_charset_utf8_h_
+#define parserutils_charset_utf8_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+
+parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen);
+parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, uint8_t **s,
+		size_t *len);
+
+parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
+		size_t *len);
+parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
+		size_t *len);
+
+parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff);
+parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff);
+
+parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s, 
+		uint32_t len, uint32_t off, uint32_t *nextoff);
+
+#endif
+
diff --git a/include/parserutils/errors.h b/include/parserutils/errors.h
new file mode 100644
index 0000000..09c715c
--- /dev/null
+++ b/include/parserutils/errors.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_errors_h_
+#define parserutils_errors_h_
+
+#include <stddef.h>
+
+typedef enum parserutils_error {
+	PARSERUTILS_OK               = 0,
+
+	PARSERUTILS_NOMEM            = 1,
+	PARSERUTILS_BADPARM          = 2,
+	PARSERUTILS_INVALID          = 3,
+	PARSERUTILS_FILENOTFOUND     = 4,
+	PARSERUTILS_NEEDDATA         = 5,
+} parserutils_error;
+
+/* Convert a parserutils error value to a string */
+const char *parserutils_error_to_string(parserutils_error error);
+/* Convert a string to a parserutils error value */
+parserutils_error parserutils_error_from_string(const char *str, size_t len);
+
+#endif
+
diff --git a/include/parserutils/functypes.h b/include/parserutils/functypes.h
new file mode 100644
index 0000000..703a329
--- /dev/null
+++ b/include/parserutils/functypes.h
@@ -0,0 +1,21 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007-8 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_functypes_h_
+#define parserutils_functypes_h_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <parserutils/types.h>
+
+/* Type of allocation function for parserutils */
+typedef void *(*parserutils_alloc)(void *ptr, size_t size, void *pw);
+
+#endif
+
diff --git a/include/parserutils/input/inputstream.h b/include/parserutils/input/inputstream.h
new file mode 100644
index 0000000..2b0c407
--- /dev/null
+++ b/include/parserutils/input/inputstream.h
@@ -0,0 +1,143 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_input_inputstream_h_
+#define parserutils_input_inputstream_h_
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+#include <parserutils/types.h>
+#include <parserutils/charset/utf8.h>
+#include <parserutils/utils/buffer.h>
+
+/**
+ * Type of charset detection function
+ */
+typedef parserutils_error (*parserutils_charset_detect_func)(
+		const uint8_t *data, size_t len, 
+		uint16_t *mibenum, uint32_t *source);
+
+/**
+ * Input stream object
+ */
+typedef struct parserutils_inputstream 
+{
+	parserutils_buffer *utf8;	/**< Buffer containing utf8 data */
+
+	uint32_t cursor;		/**< Byte offset of current position */
+
+	bool had_eof;			/**< Whether EOF has been reached */
+} parserutils_inputstream;
+
+/* EOF pseudo-character */
+#define PARSERUTILS_INPUTSTREAM_EOF (0xFFFFFFFFU)
+/* Out-of-data indicator */
+#define PARSERUTILS_INPUTSTREAM_OOD (0xFFFFFFFEU)
+
+/* Create an input stream */
+parserutils_inputstream *parserutils_inputstream_create(const char *enc,
+		uint32_t encsrc, parserutils_charset_detect_func csdetect,
+		parserutils_alloc alloc, void *pw);
+/* Destroy an input stream */
+void parserutils_inputstream_destroy(parserutils_inputstream *stream);
+
+/* Append data to an input stream */
+parserutils_error parserutils_inputstream_append(
+		parserutils_inputstream *stream,
+		const uint8_t *data, size_t len);
+/* Insert data into stream at current location */
+parserutils_error parserutils_inputstream_insert(
+		parserutils_inputstream *stream,
+		const uint8_t *data, size_t len);
+
+/* Slow form of parserutils_inputstream_peek. */
+uintptr_t parserutils_inputstream_peek_slow(parserutils_inputstream *stream, 
+		size_t offset, size_t *length);
+
+/* Look at the character in the stream that starts at
+ * offset bytes from the cursor
+ *
+ * \param stream  Stream to look in
+ * \param offset  Byte offset of start of character, relative to the cursor
+ * \param length  Pointer to location to receive character length (in bytes)
+ * \return Pointer to character data, or EOF or OOD.
+ *
+ * A position at or beyond the end of the buffered UTF-8 data is never read
+ * here; it is handed to parserutils_inputstream_peek_slow instead.
+ *
+ * The returned pointer is only guaranteed valid until the stream cursor is
+ * advanced past the character; dereferencing it after that is a bug.
+ */
+static inline uintptr_t parserutils_inputstream_peek(
+		parserutils_inputstream *stream, size_t offset, size_t *length)
+{
+	parserutils_error error = PARSERUTILS_OK;
+	size_t len = 0;
+	size_t off;
+
+	if (stream == NULL)
+		return PARSERUTILS_INPUTSTREAM_OOD;
+
+	off = stream->cursor + offset;
+
+#define IS_ASCII(x) (((x) & 0x80) == 0)
+
+	/* Fast path: the character starts within the buffered data */
+	if (off < stream->utf8->length) {
+		if (IS_ASCII(stream->utf8->data[off])) {
+			len = 1;
+		} else {
+			error = parserutils_charset_utf8_char_byte_length(
+					stream->utf8->data + off, &len);
+			if (error != PARSERUTILS_OK &&
+					error != PARSERUTILS_NEEDDATA)
+				return PARSERUTILS_INPUTSTREAM_OOD;
+		}
+	}
+
+#undef IS_ASCII
+
+	/* At/past the end of the buffer, or a truncated sequence: defer to
+	 * the slow path rather than return a pointer outside the buffer */
+	if (off >= stream->utf8->length || error == PARSERUTILS_NEEDDATA)
+		return parserutils_inputstream_peek_slow(stream, offset, length);
+
+	*length = len;
+	return (uintptr_t) (stream->utf8->data + off);
+}
+
+/**
+ * Advance the stream's current position
+ *
+ * \param stream  The stream whose position to advance
+ * \param bytes   The number of bytes to advance
+ */
+static inline void parserutils_inputstream_advance(
+		parserutils_inputstream *stream, size_t bytes)
+{
+	if (stream == NULL)	/* no stream: nothing to advance */
+		return;
+
+	if (bytes > stream->utf8->length - stream->cursor)
+		abort();	/* request runs past the end of the buffered data */
+
+	if (stream->cursor == stream->utf8->length)
+		return;		/* at end of data, so bytes is 0 here */
+
+	stream->cursor += bytes;
+}
+
+/* Read the document charset */
+const char *parserutils_inputstream_read_charset(
+		parserutils_inputstream *stream, uint32_t *source);
+
+#endif
+
diff --git a/include/parserutils/parserutils.h b/include/parserutils/parserutils.h
new file mode 100644
index 0000000..460e80c
--- /dev/null
+++ b/include/parserutils/parserutils.h
@@ -0,0 +1,23 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_parserutils_h_
+#define parserutils_parserutils_h_
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+#include <parserutils/types.h>
+
+/* Initialise the ParserUtils library for use */
+parserutils_error parserutils_initialise(const char *aliases_file,
+		parserutils_alloc alloc, void *pw);
+
+/* Clean up after ParserUtils */
+parserutils_error parserutils_finalise(parserutils_alloc alloc, void *pw);
+
+#endif
+
diff --git a/include/parserutils/types.h b/include/parserutils/types.h
new file mode 100644
index 0000000..b36e4aa
--- /dev/null
+++ b/include/parserutils/types.h
@@ -0,0 +1,15 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_types_h_
+#define parserutils_types_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#endif
+
diff --git a/include/parserutils/utils/buffer.h b/include/parserutils/utils/buffer.h
new file mode 100644
index 0000000..f3a1883
--- /dev/null
+++ b/include/parserutils/utils/buffer.h
@@ -0,0 +1,39 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_utils_buffer_h_
+#define parserutils_utils_buffer_h_
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+struct parserutils_buffer
+{
+	uint8_t *data;
+	size_t length;
+	size_t allocated;
+
+	parserutils_alloc alloc;
+	void *pw;
+};
+typedef struct parserutils_buffer parserutils_buffer;
+
+parserutils_buffer *parserutils_buffer_create(parserutils_alloc alloc, 
+		void *pw);
+void parserutils_buffer_destroy(parserutils_buffer *buffer);
+
+parserutils_error parserutils_buffer_append(parserutils_buffer *buffer, 
+		const uint8_t *data, size_t len);
+parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer, 
+		size_t offset, const uint8_t *data, size_t len);
+parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer, 
+		size_t offset, size_t len);
+
+parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer);
+
+#endif
+
diff --git a/libparserutils.pc.in b/libparserutils.pc.in
new file mode 100644
index 0000000..400ce78
--- /dev/null
+++ b/libparserutils.pc.in
@@ -0,0 +1,10 @@
+prefix=PREFIX
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: libparserutils
+Description: Utility library for facilitating parser development
+Version: 0.0.1
+Libs: -L${libdir} -lparserutils
+Cflags: -I${includedir}
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..bb6c585
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,49 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT		The name of the component
+# EXPORT		The location of the export directory
+# TOP			The location of the source tree root
+# RELEASEDIR		The place to put release objects
+# DEBUGDIR		The place to put debug objects
+#
+# do_include		Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR			The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN		The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
+# TARGET_TESTS		The list of target names to run for "make test"
+#
+# SOURCES		The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack
+sp             := $(sp).x
+dirstack_$(sp) := $(d)
+d              := $(DIR)
+
+# Manipulate include paths
+override CFLAGS := $(CFLAGS) -I$(d)
+
+# Sources
+SRCS_$(d) := parserutils.c
+
+# Append to sources for component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d  := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/Makefile b/src/charset/Makefile
new file mode 100644
index 0000000..fc34d7c
--- /dev/null
+++ b/src/charset/Makefile
@@ -0,0 +1,49 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT		The name of the component
+# EXPORT		The location of the export directory
+# TOP			The location of the source tree root
+# RELEASEDIR		The place to put release objects
+# DEBUGDIR		The place to put debug objects
+#
+# do_include		Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR			The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN		The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
+# TARGET_TESTS		The list of target names to run for "make test"
+#
+# SOURCES		The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack
+sp             := $(sp).x
+dirstack_$(sp) := $(d)
+d              := $(DIR)
+
+# Manipulate include paths
+override CFLAGS := $(CFLAGS) -I$(d)
+
+# Sources
+SRCS_$(d) := aliases.c charset.c codec.c
+
+# Append to sources for component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d  := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/aliases.c b/src/charset/aliases.c
new file mode 100644
index 0000000..1e7e6ea
--- /dev/null
+++ b/src/charset/aliases.c
@@ -0,0 +1,410 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "utils/utils.h"
+
+struct alias {
+	struct alias *next;	/**< Next alias in the same hash chain */
+	parserutils_charset_aliases_canon *canon;	/**< Canonical form */
+	uint16_t name_len;	/**< Length of name, excluding the NUL */
+	char name[1];		/**< Alias name; storage extends past the struct */
+};
+
+#define HASH_SIZE (43)
+static parserutils_charset_aliases_canon *canon_tab[HASH_SIZE];
+static struct alias *alias_tab[HASH_SIZE];
+
+static parserutils_error parserutils_charset_create_alias(const char *alias,
+		parserutils_charset_aliases_canon *c, 
+		parserutils_alloc alloc, void *pw);
+static parserutils_charset_aliases_canon *parserutils_charset_create_canon(
+		const char *canon, uint16_t mibenum, 
+		parserutils_alloc alloc, void *pw);
+static uint32_t parserutils_charset_hash_val(const char *alias, size_t len);
+
+/**
+ * Create alias data from Aliases file
+ * (line format: <canonical> <mibenum> [<alias> ...]; '#' lines are comments)
+ *
+ * \param filename  The path to the Aliases file
+ * \param alloc     Memory (de)allocation function
+ * \param pw        Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success, appropriate error otherwise.
+ */
+parserutils_error parserutils_charset_aliases_create(const char *filename,
+		parserutils_alloc alloc, void *pw)
+{
+	char buf[300];
+	FILE *fp;
+
+	if (filename == NULL || alloc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	fp = fopen(filename, "r");
+	if (fp == NULL)
+		return PARSERUTILS_FILENOTFOUND;
+
+	while (fgets(buf, sizeof buf, fp)) {
+		unsigned char *p = (unsigned char *) buf, *end; /* ctype-safe */
+		char *aliases = 0, *mib;
+		parserutils_charset_aliases_canon *cf;
+
+		if (buf[0] == 0 || buf[0] == '#')
+			/* skip blank lines or comments */
+			continue;
+
+		end = p + strcspn(buf, "\n"); /* newline may be absent */
+		*end = 0; /* lose terminating newline */
+
+		/* find end of canonical form */
+		for (; *p && !isspace(*p) && !iscntrl(*p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		*p++ = '\0'; /* terminate canonical form */
+
+		/* skip whitespace */
+		for (; *p && isspace(*p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		mib = (char *) p;
+
+		/* find end of mibenum */
+		for (; *p && !isspace(*p) && !iscntrl(*p); p++)
+			; /* do nothing */
+		if (p < end)
+			*p++ = '\0'; /* terminate mibenum */
+
+		cf = parserutils_charset_create_canon(buf, atoi(mib), alloc, pw);
+		if (cf == NULL)
+			continue;
+
+		/* skip whitespace */
+		for (; p < end && *p && isspace(*p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		aliases = (char *) p;
+
+		while (p < end) {
+			/* find end of alias */
+			for (; *p && !isspace(*p) && !iscntrl(*p); p++)
+				; /* do nothing */
+			/* terminate current alias; p cannot be beyond end
+			 * here, as the scan stops at the line's NUL */
+			*p++ = '\0';
+
+			if (parserutils_charset_create_alias(aliases, cf,
+					alloc, pw) != PARSERUTILS_OK)
+				break;
+
+			/* in terminating, we may have advanced
+			 * past the end - check this here */
+			if (p >= end)
+				break;
+
+			/* skip whitespace */
+			for (; *p && isspace(*p); p++)
+				; /* do nothing */
+
+			if (p >= end)
+				/* gone past end => stop */
+				break;
+
+			/* update pointer to current alias */
+			aliases = (char *) p;
+		}
+	}
+
+	fclose(fp);
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Free all alias data, leaving both hash tables empty
+ *
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data
+ */
+void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw)
+{
+	parserutils_charset_aliases_canon *dead_canon;
+	struct alias *dead_alias;
+	int slot;
+
+	for (slot = 0; slot < HASH_SIZE; slot++) {
+		while (canon_tab[slot] != NULL) {
+			dead_canon = canon_tab[slot];
+			canon_tab[slot] = dead_canon->next;
+			alloc(dead_canon, 0, pw);
+		}
+
+		while (alias_tab[slot] != NULL) {
+			dead_alias = alias_tab[slot];
+			alias_tab[slot] = dead_alias->next;
+			alloc(dead_alias, 0, pw);
+		}
+	}
+}
+
+/**
+ * Retrieve the MIB enum value assigned to an encoding name
+ *
+ * The name may be a canonical name or an alias; letter case is ignored.
+ *
+ * \param alias  The alias to lookup
+ * \param len    The length of the alias string
+ * \return The MIB enum value, or 0 if not found
+ */
+uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
+{
+	parserutils_charset_aliases_canon *canon = NULL;
+
+	/* Only consult the tables when there is a name to look for */
+	if (alias != NULL)
+		canon = parserutils_charset_alias_canonicalise(alias, len);
+
+	/* A NULL name and an unknown name both yield 0 */
+	return (canon != NULL) ? canon->mib_enum : 0;
+}
+
+/**
+ * Find the canonical encoding name registered for a MIB enum value
+ *
+ * \param mibenum The MIB enum value
+ * \return Pointer to canonical name, or NULL if not found
+ */
+const char *parserutils_charset_mibenum_to_name(uint16_t mibenum)
+{
+	parserutils_charset_aliases_canon *entry;
+	int slot;
+
+	/* Names key the table, so every chain has to be searched */
+	for (slot = 0; slot < HASH_SIZE; slot++)
+		for (entry = canon_tab[slot]; entry; entry = entry->next)
+			if (entry->mib_enum == mibenum)
+				return entry->name;
+	return NULL;
+}
+
+/**
+ * Detect if a parserutils_charset is Unicode
+ *
+ * \param mibenum  The MIB enum to consider
+ * \return true if a Unicode variant, false otherwise
+ */
+bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum)
+{
+	/* Cached MIB enums; 0 if the name is missing from the alias data */
+	static uint16_t ucs4, ucs2, utf8;
+	static uint16_t utf16, utf16be, utf16le;
+	static uint16_t utf32, utf32be, utf32le;
+
+	/* 0 is the "not found" result of a name lookup, not a charset: it
+	 * must not match a Unicode name that failed to resolve (also 0) */
+	if (mibenum == 0)
+		return false;
+
+	if (ucs4 == 0) {
+		ucs4 = parserutils_charset_mibenum_from_name("UCS-4",
+				SLEN("UCS-4"));
+		ucs2 = parserutils_charset_mibenum_from_name("UCS-2",
+				SLEN("UCS-2"));
+		utf8 = parserutils_charset_mibenum_from_name("UTF-8",
+				SLEN("UTF-8"));
+		utf16 = parserutils_charset_mibenum_from_name("UTF-16",
+				SLEN("UTF-16"));
+		utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
+				SLEN("UTF-16BE"));
+		utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
+				SLEN("UTF-16LE"));
+		utf32 = parserutils_charset_mibenum_from_name("UTF-32",
+				SLEN("UTF-32"));
+		utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
+				SLEN("UTF-32BE"));
+		utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
+				SLEN("UTF-32LE"));
+	}
+
+	return (mibenum == ucs4 || mibenum == ucs2 || mibenum == utf8 ||
+			mibenum == utf16 || mibenum == utf16be ||
+			mibenum == utf16le || mibenum == utf32 ||
+			mibenum == utf32be || mibenum == utf32le);
+}
+
+/**
+ * Resolve an encoding name, canonical or alias, to its canonical form
+ *
+ * \param alias  The alias name
+ * \param len    The length of the alias name
+ * \return Pointer to canonical form or NULL if not found
+ */
+parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise(
+		const char *alias, size_t len)
+{
+	parserutils_charset_aliases_canon *canon;
+	struct alias *entry;
+	uint32_t slot;
+
+	if (alias == NULL)
+		return NULL;
+
+	slot = parserutils_charset_hash_val(alias, len);
+
+	for (canon = canon_tab[slot]; canon != NULL; canon = canon->next) {
+		if (canon->name_len != len)
+			continue;
+		if (strncasecmp(canon->name, alias, len) == 0)
+			return canon;
+	}
+
+	for (entry = alias_tab[slot]; entry != NULL; entry = entry->next) {
+		if (entry->name_len != len)
+			continue;
+		if (strncasecmp(entry->name, alias, len) == 0)
+			return entry->canon;
+	}
+
+	return NULL;
+}
+
+
+/**
+ * Create an alias and add it to the alias hash table
+ *
+ * \param alias  The alias name
+ * \param c      The canonical form
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_create_alias(const char *alias,
+		parserutils_charset_aliases_canon *c,
+		parserutils_alloc alloc, void *pw)
+{
+	struct alias *entry;
+	uint32_t slot;
+	size_t len;
+
+	if (alias == NULL || c == NULL || alloc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	len = strlen(alias);
+	entry = alloc(NULL, sizeof(struct alias) + len + 1, pw);
+	if (entry == NULL)
+		return PARSERUTILS_NOMEM;
+
+	entry->canon = c;
+	entry->name_len = len;
+	memcpy(entry->name, alias, len + 1);
+	entry->name[entry->name_len] = '\0';
+	slot = parserutils_charset_hash_val(alias, entry->name_len);
+	entry->next = alias_tab[slot];
+	alias_tab[slot] = entry;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Create a canonical form and add it to the canonical name hash table
+ *
+ * \param canon    The canonical name
+ * \param mibenum  The MIB enum value
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to canonical form or NULL on error
+ */
+parserutils_charset_aliases_canon *parserutils_charset_create_canon(
+		const char *canon, uint16_t mibenum,
+		parserutils_alloc alloc, void *pw)
+{
+	parserutils_charset_aliases_canon *entry, **bucket;
+	uint32_t len;
+
+	if (canon == NULL || alloc == NULL)
+		return NULL;
+
+	len = strlen(canon);
+
+	entry = alloc(NULL, sizeof(*entry) + len + 1, pw);
+	if (entry == NULL)
+		return NULL;
+
+	strcpy(entry->name, canon);
+	entry->name[len] = '\0';
+	entry->name_len = len;
+	entry->mib_enum = mibenum;
+
+	/* Prepend to the chain of the bucket this name hashes to */
+	bucket = &canon_tab[parserutils_charset_hash_val(canon, len)];
+	entry->next = *bucket;
+	*bucket = entry;
+
+	return entry;
+}
+
+/**
+ * Hash a name, ignoring ASCII letter case, to a hash table index
+ *
+ * \param alias  String to hash
+ * \param len    Number of bytes of the string to hash
+ * \return The hashed value, in the range [0, HASH_SIZE)
+ */
+uint32_t parserutils_charset_hash_val(const char *alias, size_t len)
+{
+	uint32_t h = 5381;
+	size_t i;
+
+	if (alias == NULL)
+		return 0;
+
+	for (i = 0; i < len; i++)
+		h = (h * 33) ^ (alias[i] & ~0x20); /* case insensitive */
+	return h % HASH_SIZE;
+}
+
+
+#ifndef NDEBUG
+/**
+ * Print each bucket's names, then an accumulated size figure, to stdout
+ */
+void parserutils_charset_aliases_dump(void)
+{
+	parserutils_charset_aliases_canon *canon;
+	struct alias *entry;
+	size_t total;
+	int slot;
+
+	total = sizeof(canon_tab) / sizeof(canon_tab[0]);
+	total += sizeof(alias_tab) / sizeof(alias_tab[0]);
+
+	for (slot = 0; slot < HASH_SIZE; slot++) {
+		for (canon = canon_tab[slot]; canon; canon = canon->next) {
+			printf("%d %s\n", slot, canon->name);
+			total += canon->name_len + offsetof(
+				parserutils_charset_aliases_canon, name);
+		}
+
+		for (entry = alias_tab[slot]; entry; entry = entry->next) {
+			printf("%d %s\n", slot, entry->name);
+			total += entry->name_len + offsetof(struct alias, name);
+		}
+	}
+
+	printf("%u\n", (unsigned int) total);
+}
+#endif
diff --git a/src/charset/aliases.h b/src/charset/aliases.h
new file mode 100644
index 0000000..9abd2c8
--- /dev/null
+++ b/src/charset/aliases.h
@@ -0,0 +1,36 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_aliases_h_
+#define parserutils_charset_aliases_h_
+
+#include <inttypes.h>
+
+#include <parserutils/charset/mibenum.h>
+
+typedef struct parserutils_charset_aliases_canon {
+	struct parserutils_charset_aliases_canon *next;	/**< Next in chain */
+	uint16_t mib_enum;	/**< MIB enum value of this charset */
+	uint16_t name_len;	/**< Length of name, excluding the NUL */
+	char name[1];		/**< Canonical name; extends past the struct */
+} parserutils_charset_aliases_canon;
+
+/* Load encoding aliases from file */
+parserutils_error parserutils_charset_aliases_create(const char *filename,
+		parserutils_alloc alloc, void *pw);
+/* Destroy encoding aliases */
+void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw);
+
+/* Canonicalise an alias name */
+parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise(
+		const char *alias, size_t len);
+
+#ifndef NDEBUG
+void parserutils_charset_aliases_dump(void);
+#endif
+
+#endif
diff --git a/src/charset/charset.c b/src/charset/charset.c
new file mode 100644
index 0000000..3ef1a71
--- /dev/null
+++ b/src/charset/charset.c
@@ -0,0 +1,54 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include "charset/aliases.h"
+#include "charset/charset.h"
+
+/**
+ * Initialise the Charset library for use.
+ *
+ * This _must_ be called before using any libparserutils charset functions,
+ * because it loads the encoding alias data that name lookups rely upon.
+ * That data is released again by parserutils_charset_finalise().
+ *
+ * \param aliases_file  Pointer to name of file containing encoding alias data
+ * \param alloc         Pointer to (de)allocation function
+ * \param pw            Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success, applicable error otherwise.
+ */
+parserutils_error parserutils_charset_initialise(const char *aliases_file,
+		parserutils_alloc alloc, void *pw)
+{
+	/* Loading the alias data is the only set-up step there is, so
+	 * its outcome is the outcome of initialisation as a whole */
+	if (aliases_file != NULL && alloc != NULL) {
+		return parserutils_charset_aliases_create(aliases_file,
+				alloc, pw);
+	}
+
+	return PARSERUTILS_BADPARM;
+}
+
+/**
+ * Clean up after Libparserutils, releasing the encoding alias data
+ *
+ * \param alloc  Pointer to (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success, applicable error otherwise.
+ */
+parserutils_error parserutils_charset_finalise(parserutils_alloc alloc,
+		void *pw)
+{
+	if (alloc != NULL) {
+		parserutils_charset_aliases_destroy(alloc, pw);
+		return PARSERUTILS_OK;
+	}
+
+	return PARSERUTILS_BADPARM;
+}
+
+
diff --git a/src/charset/charset.h b/src/charset/charset.h
new file mode 100644
index 0000000..4b07577
--- /dev/null
+++ b/src/charset/charset.h
@@ -0,0 +1,24 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_charset_h_
+#define parserutils_charset_charset_h_
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+#include <parserutils/types.h>
+
+/* Initialise the Charset library for use */
+parserutils_error parserutils_charset_initialise(const char *aliases_file,
+		parserutils_alloc alloc, void *pw);
+
+/* Clean up after Charset */
+parserutils_error parserutils_charset_finalise(parserutils_alloc alloc, 
+		void *pw);
+
+#endif
+
diff --git a/src/charset/codec.c b/src/charset/codec.c
new file mode 100644
index 0000000..5c3fb3a
--- /dev/null
+++ b/src/charset/codec.c
@@ -0,0 +1,185 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "charset/codecs/codec_impl.h"
+
+#ifdef WITH_ICONV_CODEC
+extern parserutils_charset_handler iconv_codec_handler;
+#endif
+
+extern parserutils_charset_handler charset_utf8_codec_handler;
+extern parserutils_charset_handler charset_utf16_codec_handler;
+
+static parserutils_charset_handler *handler_table[] = {
+	&charset_utf8_codec_handler,
+	&charset_utf16_codec_handler,
+#ifdef WITH_ICONV_CODEC
+	&iconv_codec_handler,
+#endif
+	NULL,
+};
+
+/**
+ * Create a charset codec
+ *
+ * The charset name is canonicalised using the alias data; the first handler
+ * in the handler table to claim the canonical name then creates the codec.
+ *
+ * \param charset  Target charset
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec instance, or NULL on failure
+ */
+parserutils_charset_codec *parserutils_charset_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	const parserutils_charset_aliases_canon *canon;
+	parserutils_charset_handler **candidate = handler_table;
+	parserutils_charset_codec *codec;
+
+	if (charset == NULL || alloc == NULL)
+		return NULL;
+
+	/* Work with the canonical name from here on */
+	canon = parserutils_charset_alias_canonicalise(charset,
+			strlen(charset));
+	if (canon == NULL)
+		return NULL;
+
+	/* Walk the NULL-terminated table until a handler claims the name */
+	while (*candidate != NULL &&
+			!(*candidate)->handles_charset(canon->name))
+		candidate++;
+
+	/* Ran off the end of the table: nothing handles this charset */
+	if (*candidate == NULL)
+		return NULL;
+
+	/* Have the handler build an instance ... */
+	codec = (*candidate)->create(canon->name, alloc, pw);
+	if (codec == NULL)
+		return NULL;
+
+	/* ... then fill in the parts common to all codecs */
+	codec->alloc = alloc;
+	codec->alloc_pw = pw;
+	codec->mibenum = canon->mib_enum;
+	codec->errormode = PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
+
+	return codec;
+}
+
+/**
+ * Destroy a charset codec
+ *
+ * \param codec  The codec to destroy (NULL is ignored)
+ */
+void parserutils_charset_codec_destroy(parserutils_charset_codec *codec)
+{
+	if (codec != NULL) {
+		/* Let the implementation release what it owns before
+		 * the codec structure itself is handed back */
+		codec->handler.destroy(codec);
+		codec->alloc(codec, 0, codec->alloc_pw);
+	}
+}
+
+/**
+ * Configure a charset codec
+ *
+ * \param codec   The codec to configure
+ * \param type    The codec option type to configure
+ * \param params  Option-specific parameters
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_codec_setopt(
+		parserutils_charset_codec *codec,
+		parserutils_charset_codec_opttype type,
+		parserutils_charset_codec_optparams *params)
+{
+	if (codec == NULL || params == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* The error mode is the only option this function acts upon;
+	 * any other option type leaves the codec untouched and still
+	 * reports success */
+	if (type == PARSERUTILS_CHARSET_CODEC_ERROR_MODE)
+		codec->errormode = params->error_mode.mode;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Encode a chunk of UCS4 data into a codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_BADPARM if any pointer is NULL, else the codec's result
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ */
+parserutils_error parserutils_charset_codec_encode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec != NULL && sourcelen != NULL && destlen != NULL &&
+			source != NULL && *source != NULL &&
+			dest != NULL && *dest != NULL)
+		return codec->handler.encode(codec, source, sourcelen,
+				dest, destlen);
+	return PARSERUTILS_BADPARM;
+}
+
+/**
+ * Decode a chunk of data in a codec's charset into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_BADPARM if any pointer is NULL, else the codec's result
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ *
+ * Call this with a source length of 0 to flush any buffers.
+ */
+parserutils_error parserutils_charset_codec_decode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec != NULL && sourcelen != NULL && destlen != NULL &&
+			source != NULL && *source != NULL &&
+			dest != NULL && *dest != NULL)
+		return codec->handler.decode(codec, source, sourcelen,
+				dest, destlen);
+	return PARSERUTILS_BADPARM;
+}
+
+/**
+ * Clear a charset codec's encoding state
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_codec_reset(
+		parserutils_charset_codec *codec)
+{
+	/* The actual work is left to the codec implementation; all
+	 * that happens here is the rejection of a missing codec */
+	return (codec == NULL) ? PARSERUTILS_BADPARM :
+			codec->handler.reset(codec);
+}
+
diff --git a/src/charset/codecs/Makefile b/src/charset/codecs/Makefile
new file mode 100644
index 0000000..6d3b78e
--- /dev/null
+++ b/src/charset/codecs/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT		The name of the component
+# EXPORT		The location of the export directory
+# TOP			The location of the source tree root
+# RELEASEDIR		The place to put release objects
+# DEBUGDIR		The place to put debug objects
+#
+# do_include		Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR			The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN		The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
+# TARGET_TESTS		The list of target names to run for "make test"
+#
+# SOURCES		The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack
+sp             := $(sp).x
+dirstack_$(sp) := $(d)
+d              := $(DIR)
+
+# Sources
+SRCS_$(d) := codec_iconv.c codec_utf8.c codec_utf16.c
+
+# Append to sources for component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d  := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/codecs/codec_iconv.c b/src/charset/codecs/codec_iconv.c
new file mode 100644
index 0000000..bbe8bc4
--- /dev/null
+++ b/src/charset/codecs/codec_iconv.c
@@ -0,0 +1,683 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/* This codec is hideously slow. Only use it as a last resort */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* We put this here rather than at the top as GCC complains 
+ * about the source file being empty otherwise. */
+#ifdef WITH_ICONV_CODEC
+
+#include <iconv.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <parserutils/charset/mibenum.h>
+
+#include "charset/codecs/codec_impl.h"
+#include "utils/utils.h"
+
+/**
+ * Iconv-based charset codec
+ */
+typedef struct iconv_codec {
+	parserutils_charset_codec base;	/**< Base class */
+
+	iconv_t read_cd;		/**< Iconv handle for reading */
+#define INVAL_BUFSIZE (32)
+	uint8_t inval_buf[INVAL_BUFSIZE];	/**< Buffer for fixing up
+						 * incomplete input
+						 * sequences */
+	size_t inval_len;		/**< Number of bytes in inval_buf */
+
+#define READ_BUFSIZE (8)
+	uint32_t read_buf[READ_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (decode)
+						 */
+	size_t read_len;		/**< Number of characters in
+					 * read_buf */
+
+	iconv_t write_cd;		/**< Iconv handle for writing */
+#define WRITE_BUFSIZE (8)
+	uint32_t write_buf[WRITE_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (encode)
+						 */
+	size_t write_len;		/**< Number of characters in
+					 * write_buf */
+} iconv_codec;
+
+
+static bool iconv_codec_handles_charset(const char *charset);
+static parserutils_charset_codec *iconv_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw);
+static void iconv_codec_destroy (parserutils_charset_codec *codec);
+static parserutils_error iconv_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static parserutils_error iconv_codec_decode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static parserutils_error iconv_codec_reset(parserutils_charset_codec *codec);
+static parserutils_error iconv_codec_output_decoded_char(
+		iconv_codec *c, uint32_t ucs4, uint8_t **dest,
+		size_t *destlen);
+static parserutils_error iconv_codec_read_char(iconv_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static parserutils_error iconv_codec_write_char(iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Determine whether this codec handles a specific charset
+ *
+ * \param charset  Charset to test
+ * \return true if handleable, false otherwise
+ */
+bool iconv_codec_handles_charset(const char *charset)
+{
+	/* The charset is handleable if iconv can convert from it to
+	 * UCS-4. The descriptor serves only as a probe, so it is
+	 * closed again straight away. */
+	iconv_t probe = iconv_open("UCS-4", charset);
+
+	if (probe == (iconv_t) -1)
+		return false;
+
+	iconv_close(probe);
+
+	return true;
+}
+
+/**
+ * Create an iconv-based codec
+ *
+ * \param charset  The charset to read from / write to
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ */
+parserutils_charset_codec *iconv_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	iconv_codec *codec = alloc(NULL, sizeof(iconv_codec), pw);
+
+	if (codec == NULL)
+		return NULL;
+
+	/* One descriptor per direction, with UCS-4 on the internal side */
+	codec->read_cd = iconv_open("UCS-4", charset);
+	if (codec->read_cd == (iconv_t) -1)
+		goto free_codec;
+
+	codec->write_cd = iconv_open(charset, "UCS-4");
+	if (codec->write_cd == (iconv_t) -1)
+		goto close_read;
+
+	/* All three holding buffers start out empty */
+	codec->inval_len = 0;
+	codec->inval_buf[0] = '\0';
+	codec->read_len = 0;
+	codec->read_buf[0] = 0;
+	codec->write_len = 0;
+	codec->write_buf[0] = 0;
+
+	/* Finally, populate vtable */
+	codec->base.handler.reset = iconv_codec_reset;
+	codec->base.handler.decode = iconv_codec_decode;
+	codec->base.handler.encode = iconv_codec_encode;
+	codec->base.handler.destroy = iconv_codec_destroy;
+
+	return (parserutils_charset_codec *) codec;
+
+close_read:
+	iconv_close(codec->read_cd);
+free_codec:
+	alloc(codec, 0, pw);
+	return NULL;
+}
+
+/**
+ * Destroy an iconv-based codec
+ *
+ * \param codec  The codec to destroy
+ */
+void iconv_codec_destroy (parserutils_charset_codec *codec)
+{
+	iconv_codec *self = (iconv_codec *) codec;
+
+	/* Only the iconv descriptors are released here; the codec
+	 * structure itself is not freed by this function. */
+	iconv_close(self->read_cd);
+	iconv_close(self->write_cd);
+}
+
+/**
+ * Encode a chunk of UCS4 data into an iconv-based codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if a character cannot be represented and the
+ *                             codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error iconv_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	iconv_codec *c = (iconv_codec *) codec;
+	uint32_t ucs4;
+	const uint32_t *towrite;
+	size_t towritelen;
+	parserutils_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+
+		while (c->write_len > 0) {
+			error = iconv_codec_write_char(c, pwrite[0],
+					dest, destlen);
+			if (error != PARSERUTILS_OK) {
+				/* Move outstanding chars down to the start
+				 * of the buffer, dropping the invalid one, if
+				 * present, so that it is not reprocessed */
+				size_t skip = (error == PARSERUTILS_INVALID);
+
+				c->write_len -= skip;
+				for (ucs4 = 0; ucs4 < c->write_len; ucs4++) {
+					c->write_buf[ucs4] =
+							pwrite[ucs4 + skip];
+				}
+
+				return error;
+			}
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call */
+	while (*sourcelen > 0) {
+		towrite = (const uint32_t *) (const void *) *source;
+		towritelen = 1;
+		ucs4 = *towrite;
+
+		/* Output current character(s) */
+		while (towritelen > 0) {
+			error = iconv_codec_write_char(c, towrite[0],
+					dest, destlen);
+
+			if (error != PARSERUTILS_OK) {
+				ucs4 = (error == PARSERUTILS_INVALID) ? 1 : 0;
+
+				if (towritelen - ucs4 >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen - ucs4;
+
+				/* Copy pending chars to save area, for
+				 * processing next call; skipping invalid
+				 * character, if present, so it's not
+				 * reprocessed. */
+				for (; ucs4 < towritelen; ucs4++) {
+					c->write_buf[ucs4] = towrite[ucs4];
+				}
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return error;
+			}
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Decode a chunk of data in an iconv-based codec's charset into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the 
+ * last input character read; nothing will be written or buffered for the 
+ * failed character. It is up to the client to fix the cause of the failure 
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error iconv_codec_decode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	iconv_codec *c = (iconv_codec *) codec;
+	parserutils_error error;
+
+	if (c->read_len > 0) {
+		/* Output left over from a previous decode call is still
+		 * held in read_buf; try to emit it before reading more. */
+		uint32_t *pread = c->read_buf;
+
+		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+			*((uint32_t *) (void *) *dest) = pread[0];
+
+			*dest += 4;
+			*destlen -= 4;
+
+			pread++;
+			c->read_len--;
+		}
+
+		if (*destlen < c->read_len * 4) {
+			/* Ran out of output buffer */
+			size_t i;
+
+			/* Shuffle remaining output down to the buffer start */
+			for (i = 0; i < c->read_len; i++) {
+				c->read_buf[i] = pread[i];
+			}
+
+			return PARSERUTILS_NOMEM;
+		}
+	}
+
+	if (c->inval_len > 0) {
+		/* The last decode ended in an incomplete sequence.
+		 * Top up inval_buf with data from the start of the
+		 * new chunk and decode one character from it. */
+		uint8_t *in = c->inval_buf;
+		size_t ol = c->inval_len;	/* bytes carried over */
+		size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+		size_t orig_l = l;		/* bytes taken from *source */
+
+		memcpy(c->inval_buf + ol, *source, l);
+
+		l += c->inval_len;
+
+		error = iconv_codec_read_char(c,
+				(const uint8_t **) &in, &l, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			return error;
+		}
+
+
+		/* l is now the unread remainder of inval_buf, so
+		 * (orig_l - l) new-chunk bytes were consumed (never < 0). */
+		*source += max((signed) (orig_l - l), 0);
+		*sourcelen -= max((signed) (orig_l - l), 0);
+
+		/* Failed to resolve an incomplete character and
+		 * ran out of buffer space. No recovery strategy
+		 * possible, so explode everywhere. */
+		if ((orig_l + ol) - l == 0)
+			abort();
+
+		/* Report the memory exhaustion case from above */
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	while (*sourcelen > 0) {
+		error = iconv_codec_read_char(c,
+				source, sourcelen, dest, destlen);
+		if (error != PARSERUTILS_OK) {
+			return error;
+		}
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Clear an iconv-based codec's encoding state
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error iconv_codec_reset(parserutils_charset_codec *codec)
+{
+	iconv_codec *self = (iconv_codec *) codec;
+
+	/* NULL buffers put each conversion descriptor in its initial state */
+	iconv(self->read_cd, NULL, NULL, NULL, NULL);
+	iconv(self->write_cd, NULL, NULL, NULL, NULL);
+
+	/* Forget everything that was being carried between calls */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Output a UCS4 character
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (big endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ */
+parserutils_error iconv_codec_output_decoded_char(iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	if (*destlen < 4) {
+		/* Run out of output buffer: park the character in
+		 * read_buf so the next decode call can emit it first. */
+		c->read_len = 1;
+		c->read_buf[0] = ucs4;
+		return PARSERUTILS_NOMEM;
+	}
+
+	/* Byte-wise copy: *dest carries no alignment guarantee */
+	memcpy(*dest, &ucs4, 4);
+	*dest += 4;
+	*destlen -= 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Read a character from the codec's native charset to UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the 
+ * last input character read; nothing will be written or buffered for the 
+ * failed character. It is up to the client to fix the cause of the failure 
+ * and retry the decoding process.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error iconv_codec_read_char(iconv_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	size_t iconv_ret;
+	const uint8_t *origsrc = *source;
+	size_t origsrclen = *sourcelen;
+	uint32_t ucs4;
+	uint8_t *pucs4 = (uint8_t *) &ucs4;
+	size_t sucs4 = 4;
+	parserutils_error error;
+
+	/* Use iconv to convert a single character
+	 * Side effect: Updates *source to point at next input
+	 * character and *sourcelen to reflect reduced input length
+	 */
+	iconv_ret = iconv(c->read_cd, (char **) source, sourcelen,
+			(char **) (void *) &pucs4, &sucs4);
+
+	if (iconv_ret != (size_t) -1 ||
+			(*source != origsrc && sucs4 == 0)) {
+		/* Read a character */
+		error = iconv_codec_output_decoded_char(c, ucs4, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			/* output failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (errno == E2BIG) {
+		/* Should never happen */
+		abort();
+	} else if (errno == EINVAL) {
+		/* Incomplete input: stash it, NUL-terminated, for next call */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		memmove(c->inval_buf, (const char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return PARSERUTILS_OK;
+	} else if (errno == EILSEQ) {
+		/* Illegal input sequence */
+		bool found = false;
+		const uint8_t *oldsrc = *source;
+		size_t oldsrclen = *sourcelen;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode ==
+				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+			/* restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+
+			return PARSERUTILS_INVALID;
+		}
+
+		/* Ok, this becomes problematic. The iconv API here
+		* is particularly unhelpful; *source will point at
+		* the _start_ of the illegal sequence. This means
+		* that we must find the end of the sequence */
+
+		/* Search for the start of the next valid input
+		 * sequence (or the end of the input stream) */
+		while (*sourcelen > 1) {
+			pucs4 = (uint8_t *) &ucs4;
+			sucs4 = 4;
+
+			(*source)++;
+			(*sourcelen)--;
+
+			oldsrc = *source;
+			oldsrclen = *sourcelen;
+
+			iconv_ret = iconv(c->read_cd,
+					(char **) source, sourcelen,
+					(char **) (void *) &pucs4, &sucs4);
+			if (iconv_ret != (size_t) -1 || errno != EILSEQ) {
+				found = true;
+				break;
+			}
+		}
+
+		if (found) {
+			/* Found start of next valid sequence; rewind to it
+			 * so that it is decoded by the next call */
+			*source = oldsrc;
+			*sourcelen = oldsrclen;
+		} else {
+			/* Not found - skip last byte in buffer */
+			(*source)++;
+			(*sourcelen)--;
+
+			if (*sourcelen != 0)
+				abort();
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = iconv_codec_output_decoded_char(c,
+				htonl(0xFFFD), dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			/* output failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		return error;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Write a UCS4 character in a codec's native charset
+ *
+ * \param c        The codec
+ * \param ucs4     The UCS4 character to write (big endian)
+ * \param dest     Pointer to pointer to output buffer (updated on exit)
+ * \param destlen  Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK       on success,
+ *         PARSERUTILS_NOMEM    if output buffer is too small,
+ *         PARSERUTILS_INVALID  if character cannot be represented and the
+ *                         codec's error handling mode is set to STRICT.
+ */
+parserutils_error iconv_codec_write_char(iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	size_t iconv_ret;
+	uint8_t *pucs4 = (uint8_t *) &ucs4;
+	size_t sucs4 = 4;
+	uint8_t *origdest = *dest;
+
+	iconv_ret = iconv(c->write_cd, (char **) (void *) &pucs4,
+			&sucs4, (char **) dest, destlen);
+
+	if (iconv_ret == (size_t) -1 && errno == E2BIG) {
+		/* Output buffer is too small */
+		return PARSERUTILS_NOMEM;
+	} else if (iconv_ret == (size_t) -1 && errno == EILSEQ) {
+		/* Illegal multibyte sequence */
+		/* This should never happen */
+		abort();
+	} else if (iconv_ret == (size_t) -1 && errno == EINVAL) {
+		/* Incomplete input character */
+		/* This should never happen */
+		abort();
+	} else if (*dest == origdest) {
+		/* No error reported, yet *dest did not move: nothing output */
+		switch (c->base.errormode) {
+		case PARSERUTILS_CHARSET_CODEC_ERROR_STRICT:
+			return PARSERUTILS_INVALID;
+
+		case PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT:
+			/** \todo transliteration; falls through to LOOSE */
+		case PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE:
+		{
+			pucs4 = (uint8_t *) &ucs4;
+			sucs4 = 4;
+			/* Substitute U+FFFD for Unicode charsets, else '?' */
+			ucs4 = parserutils_charset_mibenum_is_unicode(
+					c->base.mibenum)
+					? htonl(0xFFFD) : htonl(0x3F);
+
+			iconv_ret = iconv(c->write_cd,
+					(char **) (void *) &pucs4, &sucs4,
+					(char **) dest, destlen);
+
+			if (iconv_ret == (size_t) -1 && errno == E2BIG) {
+				return PARSERUTILS_NOMEM;
+			} else if (iconv_ret == (size_t) -1 &&
+					errno == EILSEQ) {
+				/* Illegal multibyte sequence */
+				/* This should never happen */
+				abort();
+			} else if (iconv_ret == (size_t) -1 &&
+					errno == EINVAL) {
+				/* Incomplete input character */
+				/* This should never happen */
+				abort();
+			}
+		}
+			break;
+		}
+	}
+
+	return PARSERUTILS_OK;
+}
+
+const parserutils_charset_handler iconv_codec_handler = {
+	.handles_charset = iconv_codec_handles_charset,
+	.create = iconv_codec_create
+};
+
+#endif
diff --git a/src/charset/codecs/codec_impl.h b/src/charset/codecs/codec_impl.h
new file mode 100644
index 0000000..9183594
--- /dev/null
+++ b/src/charset/codecs/codec_impl.h
@@ -0,0 +1,48 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_codecs_codecimpl_h_
+#define parserutils_charset_codecs_codecimpl_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include <parserutils/charset/codec.h>
+
+/**
+ * Core charset codec; implementations embed it as their first member
+ */
+struct parserutils_charset_codec {
+	uint16_t mibenum;			/**< MIB enum for charset */
+
+	parserutils_charset_codec_errormode errormode;	/**< error mode */
+
+	parserutils_alloc alloc;		/**< allocation function */
+	void *alloc_pw;				/**< private word for alloc */
+
+	struct {
+		void (*destroy)(parserutils_charset_codec *codec);
+		parserutils_error (*encode)(parserutils_charset_codec *codec,
+				const uint8_t **source, size_t *sourcelen,
+				uint8_t **dest, size_t *destlen); /**< from UCS4 */
+		parserutils_error (*decode)(parserutils_charset_codec *codec,
+				const uint8_t **source, size_t *sourcelen,
+				uint8_t **dest, size_t *destlen); /**< to UCS4 */
+		parserutils_error (*reset)(parserutils_charset_codec *codec);
+	} handler; /**< Vtable filled in by the codec implementation */
+};
+
+/**
+ * Codec factory component definition (one instance per codec)
+ */
+typedef struct parserutils_charset_handler {
+	bool (*handles_charset)(const char *charset); /**< charset supported? */
+	parserutils_charset_codec *(*create)(const char *charset,
+			parserutils_alloc alloc, void *pw); /**< NULL on failure */
+} parserutils_charset_handler;
+
+#endif
diff --git a/src/charset/codecs/codec_utf16.c b/src/charset/codecs/codec_utf16.c
new file mode 100644
index 0000000..0dd7a07
--- /dev/null
+++ b/src/charset/codecs/codec_utf16.c
@@ -0,0 +1,544 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/utf16.h>
+
+#include "charset/codecs/codec_impl.h"
+#include "utils/utils.h"
+
+/**
+ * UTF-16 charset codec
+ */
+typedef struct charset_utf16_codec {
+	parserutils_charset_codec base;	/**< Base class (kept first) */
+
+#define INVAL_BUFSIZE (32)
+	uint8_t inval_buf[INVAL_BUFSIZE];	/**< Buffer for fixing up
+						 * incomplete input
+						 * sequences */
+	size_t inval_len;		/**< Byte length of inval_buf */
+
+#define READ_BUFSIZE (8)
+	uint32_t read_buf[READ_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (decode)
+						 * (host-endian) */
+	size_t read_len;		/**< Character length of read_buf */
+
+#define WRITE_BUFSIZE (8)
+	uint32_t write_buf[WRITE_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (encode)
+						 * (host-endian) */
+	size_t write_len;		/**< Character length of write_buf */
+
+} charset_utf16_codec;
+
+static bool charset_utf16_codec_handles_charset(const char *charset);
+static parserutils_charset_codec *charset_utf16_codec_create(
+		const char *charset, parserutils_alloc alloc, void *pw);
+static void charset_utf16_codec_destroy (parserutils_charset_codec *codec);
+static parserutils_error charset_utf16_codec_encode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static parserutils_error charset_utf16_codec_decode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static parserutils_error charset_utf16_codec_reset(
+		parserutils_charset_codec *codec);
+static inline parserutils_error charset_utf16_codec_read_char(
+		charset_utf16_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static inline parserutils_error charset_utf16_codec_output_decoded_char(
+		charset_utf16_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Determine whether this codec handles a specific charset
+ *
+ * \param charset  Charset to test
+ * \return true if handleable, false otherwise
+ */
+bool charset_utf16_codec_handles_charset(const char *charset)
+{
+	const size_t len = strlen(charset);
+	return parserutils_charset_mibenum_from_name("UTF-16", SLEN("UTF-16"))
+		== parserutils_charset_mibenum_from_name(charset, len);
+}
+
+/**
+ * Create a utf16 codec
+ *
+ * \param charset  The charset to read from / write to
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ */
+parserutils_charset_codec *charset_utf16_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	charset_utf16_codec *self = alloc(NULL, sizeof *self, pw);
+
+	UNUSED(charset);
+
+	if (self == NULL)
+		return NULL;
+
+	/* Hook up the UTF-16 implementations of the codec vtable */
+	self->base.handler.reset = charset_utf16_codec_reset;
+	self->base.handler.decode = charset_utf16_codec_decode;
+	self->base.handler.encode = charset_utf16_codec_encode;
+	self->base.handler.destroy = charset_utf16_codec_destroy;
+
+	/* Start with all three carry-over buffers empty */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	return &self->base;
+}
+
+/**
+ * Destroy a utf16 codec
+ *
+ * \param codec  The codec to destroy
+ */
+void charset_utf16_codec_destroy (parserutils_charset_codec *codec)
+{
+	UNUSED(codec);	/* deliberate no-op: nothing is released here */
+}
+
+/**
+ * Encode a chunk of UCS4 data into utf16
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call. 
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf16_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf16_codec *c = (charset_utf16_codec *) codec;
+	uint32_t ucs4;
+	uint32_t *towrite;
+	size_t towritelen;
+	parserutils_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+		uint8_t buf[4];
+		size_t len;
+
+		while (c->write_len > 0) {
+			error = parserutils_charset_utf16_from_ucs4(
+					pwrite[0], buf, &len);
+			if (error != PARSERUTILS_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* Insufficient output buffer space; move
+				 * what is still pending to the buffer start */
+				for (len = 0; len < c->write_len; len++)
+					c->write_buf[len] = pwrite[len];
+				return PARSERUTILS_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call */
+	while (*sourcelen > 0) {
+		/* Byte-wise read: *source is const and may be unaligned */
+		memcpy(&ucs4, *source, sizeof(ucs4));
+		ucs4 = ntohl(ucs4);
+		towrite = &ucs4;
+		towritelen = 1;
+
+		/* Output current characters */
+		while (towritelen > 0) {
+			uint8_t buf[4];
+			size_t len;
+
+			error = parserutils_charset_utf16_from_ucs4(
+					towrite[0], buf, &len);
+			if (error != PARSERUTILS_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* Insufficient output space */
+				if (towritelen >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen;
+
+				/* Save pending chars for the next call */
+				for (len = 0; len < towritelen; len++)
+					c->write_buf[len] = towrite[len];
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return PARSERUTILS_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Decode a chunk of utf16 data into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the 
+ * last input character read; nothing will be written or buffered for the 
+ * failed character. It is up to the client to fix the cause of the failure 
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error charset_utf16_codec_decode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf16_codec *c = (charset_utf16_codec *) codec;
+	parserutils_error error;
+
+	if (c->read_len > 0) {
+		/* Output left over from last decode; emit it first */
+		uint32_t *pread = c->read_buf;
+
+		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+			*((uint32_t *) (void *) *dest) = htonl(pread[0]);
+
+			*dest += 4;
+			*destlen -= 4;
+
+			pread++;
+			c->read_len--;
+		}
+
+		if (*destlen < c->read_len * 4) {
+			/* Ran out of output buffer */
+			size_t i;
+
+			/* Shuffle remaining output down to the buffer start */
+			for (i = 0; i < c->read_len; i++)
+				c->read_buf[i] = pread[i];
+
+			return PARSERUTILS_NOMEM;
+		}
+	}
+
+	if (c->inval_len > 0) {
+		/* The last decode ended in an incomplete sequence.
+		 * Top up inval_buf with data from the start of the
+		 * new chunk and decode one character from it. */
+		uint8_t *in = c->inval_buf;
+		size_t ol = c->inval_len;	/* bytes carried over */
+		size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+		size_t orig_l = l;		/* bytes taken from *source */
+
+		memcpy(c->inval_buf + ol, *source, l);
+
+		l += c->inval_len;
+
+		error = charset_utf16_codec_read_char(c,
+				(const uint8_t **) &in, &l, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			return error;
+		}
+
+		/* l is now the unread remainder; advance past what was used */
+		*source += max((signed) (orig_l - l), 0);
+		*sourcelen -= max((signed) (orig_l - l), 0);
+
+		/* Failed to resolve an incomplete character and
+		 * ran out of buffer space. No recovery strategy
+		 * possible, so explode everywhere. */
+		if ((orig_l + ol) - l == 0)
+			abort();
+
+		/* Report memory exhaustion case from above */
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	/* Finally, the "normal" case; process all outstanding characters */
+	while (*sourcelen > 0) {
+		error = charset_utf16_codec_read_char(c,
+				source, sourcelen, dest, destlen);
+		if (error != PARSERUTILS_OK) {
+			return error;
+		}
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Clear a utf16 codec's encoding state
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error charset_utf16_codec_reset(parserutils_charset_codec *codec)
+{
+	charset_utf16_codec *self = (charset_utf16_codec *) codec;
+
+	/* Forget any partial output awaiting the next encode/decode call */
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+	/* Forget any incomplete input sequence carried between calls */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	return PARSERUTILS_OK;
+}
+
+
+/**
+ * Read a single UTF-16 character and output it as UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the 
+ * last input character read; nothing will be written or buffered for the 
+ * failed character. It is up to the client to fix the cause of the failure 
+ * and retry the decoding process.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf16_codec_read_char(charset_utf16_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	uint32_t ucs4;
+	size_t sucs4;
+	parserutils_error error;
+
+	/* Convert a single character */
+	error = parserutils_charset_utf16_to_ucs4(*source, *sourcelen,
+			&ucs4, &sucs4);
+	if (error == PARSERUTILS_OK) {
+		/* Read a character */
+		error = charset_utf16_codec_output_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += sucs4;
+			*sourcelen -= sucs4;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (error == PARSERUTILS_NEEDDATA) {
+		/* Incomplete input: stash it, NUL-terminated, for next call */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		memmove(c->inval_buf, (char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return PARSERUTILS_OK;
+	} else if (error == PARSERUTILS_INVALID) {
+		/* Illegal input sequence */
+		uint32_t nextchar;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode ==
+				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+			return PARSERUTILS_INVALID;
+		}
+
+		/* Find next valid UTF-16 sequence.
+		 * We're processing client-provided data, so let's
+		 * be paranoid about its validity. */
+		error = parserutils_charset_utf16_next_paranoid(
+				*source, *sourcelen, 0, &nextchar);
+		if (error != PARSERUTILS_OK) {
+			if (error == PARSERUTILS_NEEDDATA) {
+				/* Need more data to be sure; the stash
+				 * needs room for the data plus a NUL */
+				if (*sourcelen >= INVAL_BUFSIZE)
+					abort();
+
+				memmove(c->inval_buf, (char *) *source,
+						*sourcelen);
+				c->inval_buf[*sourcelen] = '\0';
+				c->inval_len = *sourcelen;
+
+				*source += *sourcelen;
+				*sourcelen = 0;
+				nextchar = 0;
+			} else {
+				return error;
+			}
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = charset_utf16_codec_output_decoded_char(c,
+				0xFFFD, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += nextchar;
+			*sourcelen -= nextchar;
+		}
+
+		return error;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Output a UCS4 character
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ */
+parserutils_error charset_utf16_codec_output_decoded_char(charset_utf16_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	if (*destlen < 4) {
+		/* Run out of output buffer: park it (host endian) */
+		c->read_len = 1;
+		c->read_buf[0] = ucs4;
+		return PARSERUTILS_NOMEM;
+	}
+
+	/* Emit big endian; byte-wise since *dest may be unaligned */
+	ucs4 = htonl(ucs4);
+	memcpy(*dest, &ucs4, 4);
+	*dest += 4;
+	*destlen -= 4;
+	return PARSERUTILS_OK;
+}
+
+
+const parserutils_charset_handler charset_utf16_codec_handler = {
+	.handles_charset = charset_utf16_codec_handles_charset,
+	.create = charset_utf16_codec_create
+};
diff --git a/src/charset/codecs/codec_utf8.c b/src/charset/codecs/codec_utf8.c
new file mode 100644
index 0000000..838d051
--- /dev/null
+++ b/src/charset/codecs/codec_utf8.c
@@ -0,0 +1,546 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <parserutils/charset/mibenum.h>
+
+#include "charset/codecs/codec_impl.h"
+#include "charset/encodings/utf8impl.h"
+#include "utils/utils.h"
+
+/**
+ * UTF-8 charset codec: base class plus three small carry-over buffers
+ */
+typedef struct charset_utf8_codec {
+	parserutils_charset_codec base;	/**< Base class */
+
+#define INVAL_BUFSIZE (32)
+	uint8_t inval_buf[INVAL_BUFSIZE];	/**< Buffer for fixing up
+						 * incomplete input
+						 * sequences */
+	size_t inval_len;		/**< Byte length of inval_buf */
+
+#define READ_BUFSIZE (8)
+	uint32_t read_buf[READ_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (decode)
+						 * (host-endian) */
+	size_t read_len;		/**< Character length of read_buf */
+
+#define WRITE_BUFSIZE (8)
+	uint32_t write_buf[WRITE_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (encode)
+						 * (host-endian) */
+	size_t write_len;		/**< Character length of write_buf */
+
+} charset_utf8_codec;
+/* Forward declarations; every function below is defined later in this file */
+static bool charset_utf8_codec_handles_charset(const char *charset);
+static parserutils_charset_codec *charset_utf8_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw);
+static void charset_utf8_codec_destroy (parserutils_charset_codec *codec);
+static parserutils_error charset_utf8_codec_encode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static parserutils_error charset_utf8_codec_decode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static parserutils_error charset_utf8_codec_reset(
+		parserutils_charset_codec *codec);
+static inline parserutils_error charset_utf8_codec_read_char(
+		charset_utf8_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static inline parserutils_error charset_utf8_codec_output_decoded_char(
+		charset_utf8_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Report whether a charset name designates UTF-8
+ *
+ * \param charset  Name of the charset to test
+ * \return true if the name maps to the same MIB enum as "UTF-8"
+ */
+bool charset_utf8_codec_handles_charset(const char *charset)
+{
+	const size_t namelen = strlen(charset);
+
+	return parserutils_charset_mibenum_from_name("UTF-8", SLEN("UTF-8")) ==
+			parserutils_charset_mibenum_from_name(charset, namelen);
+}
+
+/**
+ * Construct a UTF-8 codec instance
+ *
+ * \param charset  Charset name (not used by this constructor)
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Client-specific private data for alloc (may be NULL)
+ * \return The new codec, viewed as its base class, or NULL if alloc fails
+ */
+parserutils_charset_codec *charset_utf8_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	charset_utf8_codec *self = alloc(NULL, sizeof(*self), pw);
+
+	UNUSED(charset);
+
+	if (self == NULL)
+		return NULL;
+
+	/* Wire up the vtable */
+	self->base.handler.reset = charset_utf8_codec_reset;
+	self->base.handler.decode = charset_utf8_codec_decode;
+	self->base.handler.encode = charset_utf8_codec_encode;
+	self->base.handler.destroy = charset_utf8_codec_destroy;
+
+	/* Start with all three carry-over buffers empty */
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	return &self->base;
+}
+
+/**
+ * Destroy a utf8 codec (no-op: the body only marks codec as unused)
+ *
+ * \param codec  The codec to destroy
+ */
+void charset_utf8_codec_destroy (parserutils_charset_codec *codec)
+{
+	UNUSED(codec);
+}
+
+/**
+ * Encode a chunk of UCS4 data into utf8
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small.
+ *
+ * The source data is a sequence of 4 byte UCS4 characters in network byte
+ * order (each one is passed through ntohl() before being encoded).
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. If that character did not fit in the output buffer, it is held in
+ * the codec's write buffer and written at the start of the next call.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * Input is consumed in whole 4 byte units only. A trailing partial unit
+ * (::sourcelen not a multiple of 4) is left unread, so ::sourcelen is
+ * non-zero on return in that case.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Any error from the UTF-8 writer other than _NOMEM is treated as an
+ * internal fault and aborts the process.
+ */
+parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf8_codec *c = (charset_utf8_codec *) codec;
+	parserutils_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+
+		while (c->write_len > 0) {
+			UTF8_FROM_UCS4(pwrite[0], dest, destlen, error);
+			if (error != PARSERUTILS_OK) {
+				/* Anything but NOMEM here is a bug */
+				if (error != PARSERUTILS_NOMEM)
+					abort();
+
+				/* Insufficient output buffer space:
+				 * move the unwritten tail to the front */
+				for (uint32_t len = 0;
+						len < c->write_len; len++) {
+					c->write_buf[len] = pwrite[len];
+				}
+
+				return PARSERUTILS_NOMEM;
+			}
+
+			/* Written in full: move on to the next one */
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call, one whole UCS4 unit
+	 * at a time; a truncated trailing unit is never read */
+	while (*sourcelen >= 4) {
+		uint32_t ucs4;
+
+		/* source is a byte pointer: copy out instead of casting */
+		memcpy(&ucs4, *source, 4);
+		ucs4 = ntohl(ucs4);
+
+		UTF8_FROM_UCS4(ucs4, dest, destlen, error);
+		if (error != PARSERUTILS_OK) {
+			if (error != PARSERUTILS_NOMEM)
+				abort();
+
+			/* Insufficient output space: save the character
+			 * for processing next call */
+			c->write_buf[0] = ucs4;
+			c->write_len = 1;
+
+			/* Claim character we've just buffered,
+			 * so it's not reprocessed */
+			*source += 4;
+			*sourcelen -= 4;
+
+			return PARSERUTILS_NOMEM;
+		}
+
+		/* Character written: consume it */
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Decode a chunk of utf8 data into UCS4 (output in network byte order)
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if the input is not valid UTF-8 and the
+ *                            codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf8_codec *c = (charset_utf8_codec *) codec;
+	parserutils_error error;
+
+	if (c->read_len > 0) {
+		/* Output left over from last decode; written only while
+		 * everything still pending fits in the output buffer */
+		uint32_t *pread = c->read_buf;
+
+		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+			*((uint32_t *) (void *) *dest) = htonl(pread[0]);
+
+			*dest += 4;
+			*destlen -= 4;
+
+			pread++;
+			c->read_len--;
+		}
+
+		if (*destlen < c->read_len * 4) {
+			/* Ran out of output buffer */
+			size_t i;
+
+			/* Shuffle remaining output down */
+			for (i = 0; i < c->read_len; i++)
+				c->read_buf[i] = pread[i];
+
+			return PARSERUTILS_NOMEM;
+		}
+	}
+
+	if (c->inval_len > 0) {
+		/* The last decode ended in an incomplete sequence.
+		 * Fill up inval_buf with data from the start of the
+		 * new chunk and process it. */
+		uint8_t *in = c->inval_buf;
+		size_t ol = c->inval_len;
+		size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+		size_t orig_l = l;
+
+		memcpy(c->inval_buf + ol, *source, l);
+
+		l += c->inval_len;
+
+		error = charset_utf8_codec_read_char(c,
+				(const uint8_t **) &in, &l, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			return error;
+		}
+
+		/* Advance source past the newly consumed (unbuffered) bytes */
+		*source += max((signed) (orig_l - l), 0);
+		*sourcelen -= max((signed) (orig_l - l), 0);
+
+		/* Failed to resolve an incomplete character and
+		 * ran out of buffer space. No recovery strategy
+		 * possible, so explode everywhere. */
+		if ((orig_l + ol) - l == 0)
+			abort();
+
+		/* Report memory exhaustion case from above */
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	/* Finally, the "normal" case; process all outstanding characters */
+	while (*sourcelen > 0) {
+		error = charset_utf8_codec_read_char(c,
+				source, sourcelen, dest, destlen);
+		if (error != PARSERUTILS_OK) {
+			return error;
+		}
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Return a utf8 codec to its initial, empty state
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK (no other value is returned here)
+ */
+parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec)
+{
+	charset_utf8_codec *self = (charset_utf8_codec *) codec;
+
+	/* Drop pending encode output, pending decode output and any
+	 * buffered incomplete input sequence */
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	return PARSERUTILS_OK;
+}
+
+
+/**
+ * Read one UTF-8 character and output it as UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if the input is not valid UTF-8 and the
+ *                            codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	uint32_t ucs4;
+	size_t sucs4;
+	parserutils_error error;
+
+	/* Convert a single character */
+	{
+		const uint8_t *src = *source;
+		size_t srclen = *sourcelen;
+		uint32_t *uptr = &ucs4;
+		size_t *usptr = &sucs4;
+		UTF8_TO_UCS4(src, srclen, uptr, usptr, error);
+	}
+	if (error == PARSERUTILS_OK) {
+		/* Read a character */
+		error = charset_utf8_codec_output_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += sucs4;
+			*sourcelen -= sucs4;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (error == PARSERUTILS_NEEDDATA) {
+		/* Incomplete input sequence; leave room for the terminator */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		memmove(c->inval_buf, (char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return PARSERUTILS_OK;
+	} else if (error == PARSERUTILS_INVALID) {
+		/* Illegal input sequence */
+		uint32_t nextchar;
+	
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode == 
+				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+			/* Clear inval buffer */
+			c->inval_buf[0] = '\0';
+			c->inval_len = 0;
+
+			return PARSERUTILS_INVALID;
+		}
+
+		/* Find next valid UTF-8 sequence.
+		 * We're processing client-provided data, so let's
+		 * be paranoid about its validity. */
+		{
+			const uint8_t *src = *source;
+			size_t srclen = *sourcelen;
+			uint32_t off = 0;
+			uint32_t *ncptr = &nextchar;
+
+			UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error);
+		}
+		if (error != PARSERUTILS_OK) {
+			if (error == PARSERUTILS_NEEDDATA) {
+				/* Need more data; keep room for NUL */
+				if (*sourcelen >= INVAL_BUFSIZE)
+					abort();
+
+				memmove(c->inval_buf, (char *) *source,
+						*sourcelen);
+				c->inval_buf[*sourcelen] = '\0';
+				c->inval_len = *sourcelen;
+
+				*source += *sourcelen;
+				*sourcelen = 0;
+
+				nextchar = 0;
+			} else {
+				return error;
+			}
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* output U+FFFD and continue processing. */
+		error = charset_utf8_codec_output_decoded_char(c,
+				0xFFFD, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += nextchar;
+			*sourcelen -= nextchar;
+		}
+
+		return error;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Output a UCS4 character in network byte order (big endian)
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer (advanced on success)
+ * \param destlen  Pointer to output buffer length (reduced on success)
+ * \return PARSERUTILS_OK on success, PARSERUTILS_NOMEM if *destlen < 4
+ */
+parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	const uint32_t be = htonl(ucs4);
+
+	if (*destlen < 4) {
+		/* Out of output space: park the character in read_buf */
+		c->read_len = 1;
+		c->read_buf[0] = ucs4;
+		return PARSERUTILS_NOMEM;
+	}
+
+	/* dest is a plain byte buffer, so copy rather than cast-and-store */
+	memcpy(*dest, &be, 4);
+	*dest += 4;
+	*destlen -= 4;
+	return PARSERUTILS_OK;
+}
+
+/** Handler exported for the UTF-8 codec: charset test plus constructor */
+const parserutils_charset_handler charset_utf8_codec_handler = {
+	charset_utf8_codec_handles_charset,
+	charset_utf8_codec_create
+};
+
diff --git a/src/charset/encodings/Makefile b/src/charset/encodings/Makefile
new file mode 100644
index 0000000..47d9210
--- /dev/null
+++ b/src/charset/encodings/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT		The name of the component
+# EXPORT		The location of the export directory
+# TOP			The location of the source tree root
+# RELEASEDIR		The place to put release objects
+# DEBUGDIR		The place to put debug objects
+#
+# do_include		Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR			The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN		The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
+# TARGET_TESTS		The list of target names to run for "make test"
+#
+# SOURCES		The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack (sp gains one ".x" suffix)
+sp             := $(sp).x
+dirstack_$(sp) := $(d)
+d              := $(DIR)
+
+# Sources in this directory
+SRCS_$(d) := utf8.c utf16.c
+
+# Append to sources for component, each prefixed with this directory
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack (restore d, strip one suffix from sp)
+d  := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/encodings/utf16.c b/src/charset/encodings/utf16.c
new file mode 100644
index 0000000..95dc64f
--- /dev/null
+++ b/src/charset/encodings/utf16.c
@@ -0,0 +1,239 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-16 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/utf16.h>
+
+/**
+ * Convert a UTF-16 sequence (host-endian code units) into one UCS4 character
+ *
+ * \param s     The sequence to process
+ * \param len   Length of sequence, in bytes
+ * \param ucs4  Pointer to location to receive UCS4 character (host endian)
+ * \param clen  Pointer to location to receive byte length of UTF-16 sequence
+ * \return PARSERUTILS_OK on success, PARSERUTILS_NEEDDATA if truncated,
+ *         PARSERUTILS_INVALID on an unpaired surrogate, _BADPARM on NULL
+ */
+parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, 
+		size_t len, uint32_t *ucs4, size_t *clen)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || ucs4 == NULL || clen == NULL)
+		return PARSERUTILS_BADPARM;
+	if (len < 2)
+		return PARSERUTILS_NEEDDATA;
+
+	if (ss[0] < 0xD800 || ss[0] > 0xDFFF) {
+		*ucs4 = ss[0];
+		*clen = 2;
+	} else if (ss[0] > 0xDBFF) {
+		/* Low surrogate with no preceding high surrogate */
+		return PARSERUTILS_INVALID;
+	} else if (len < 4) {
+		return PARSERUTILS_NEEDDATA;
+	} else if (ss[1] < 0xDC00 || ss[1] > 0xDFFF) {
+		return PARSERUTILS_INVALID;
+	} else {
+		/* Pair: 10 payload bits per half, biased by 0x10000 */
+		*ucs4 = 0x10000 + ((ss[0] & 0x3ffu) << 10) + (ss[1] & 0x3ff);
+		*clen = 4;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Convert a single UCS4 character into a UTF-16 sequence
+ *
+ * \param ucs4  The character to process (0 <= c <= 0x10FFFF) (host endian)
+ * \param s     Pointer to 4 byte long output buffer
+ * \param len   Pointer to location to receive length of multibyte sequence
+ * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM on NULL arguments,
+ *         PARSERUTILS_INVALID if ucs4 is beyond 0x10FFFF
+ */
+parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len)
+{
+	uint16_t *ss = (uint16_t *) (void *) s;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (ucs4 < 0x10000) {
+		ss[0] = (uint16_t) ucs4;
+		*len = 2;
+	} else if (ucs4 < 0x110000) {
+		/* Surrogate pair: split (ucs4 - 0x10000) into 10 bit halves */
+		ss[0] = (uint16_t) (0xD800 | ((ucs4 - 0x10000) >> 10));
+		ss[1] = (uint16_t) (0xDC00 | (ucs4 & 0x3ff));
+		*len = 4;
+	} else {
+		return PARSERUTILS_INVALID;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-16 string
+ *
+ * \param s    The string
+ * \param max  Maximum length, in bytes (a trailing odd byte is ignored)
+ * \param len  Pointer to location to receive length of string
+ * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM if s or len is NULL
+ */
+parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+	const size_t units = max / 2;	/* whole code units available */
+	size_t i = 0;
+	size_t count = 0;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	while (i < units) {
+		/* Any unit in the surrogate range is taken to start a pair */
+		if (ss[i] < 0xD800 || 0xDFFF < ss[i])
+			i++;
+		else
+			i += 2;
+		count++;
+	}
+
+	*len = count;
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Calculate the length (in bytes) of a UTF-16 character
+ *
+ * \param s    Pointer to start of character (only the first unit is read)
+ * \param len  Pointer to location to receive length (2, or 4 for a
+ *             character that begins with a surrogate code unit)
+ * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM if s or len is NULL
+ */
+parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	uint16_t unit;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* A leading unit in the surrogate range means a two unit character */
+	unit = *((const uint16_t *) (const void *) s);
+	*len = (0xD800 <= unit && unit <= 0xDFFF) ? 4 : 2;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find previous legal UTF-16 char in string
+ *
+ * \param s        The string
+ * \param off      Byte offset in the string to start at
+ * \param prevoff  Receives byte offset of the previous legal character
+ * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM on NULL arguments
+ */
+parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || prevoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Inspect the code unit that ends at off, not the one before s */
+	if (off < 2)
+		*prevoff = 0;
+	else if (ss[off / 2 - 1] < 0xDC00 || ss[off / 2 - 1] > 0xDFFF)
+		*prevoff = off - 2;
+	else
+		*prevoff = (off < 4) ? 0 : off - 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Receives offset of first byte of next legal character
+ * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM on bad arguments
+ */
+parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Step over the unit at off, plus a low surrogate after it if any */
+	if (len - off < 4)
+		*nextoff = len;
+	else if (ss[off / 2 + 1] < 0xDC00 || ss[off / 2 + 1] > 0xDFFF)
+		*nextoff = off + 2;
+	else
+		*nextoff = off + 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Receives offset of first byte of next legal character
+ * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM on bad arguments,
+ *         PARSERUTILS_NEEDDATA if the data ends before a legal character
+ */
+parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
+		uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	for (;; off += 2) {
+		/* Need one whole code unit after the one at off */
+		if (len - off < 4)
+			return PARSERUTILS_NEEDDATA;
+
+		if (ss[off / 2 + 1] < 0xD800 || ss[off / 2 + 1] > 0xDFFF)
+			break;	/* non-surrogate: a legal start */
+
+		if (ss[off / 2 + 1] <= 0xDBFF) {
+			/* High surrogate: legal if a low surrogate follows */
+			if (len - off < 6)
+				return PARSERUTILS_NEEDDATA;
+			if ((ss[off / 2 + 2] & 0xFC00) == 0xDC00)
+				break;
+		}
+		/* Otherwise an unpaired surrogate: step over it */
+	}
+
+	*nextoff = off + 2;
+
+	return PARSERUTILS_OK;
+}
+
diff --git a/src/charset/encodings/utf8.c b/src/charset/encodings/utf8.c
new file mode 100644
index 0000000..5b4ba95
--- /dev/null
+++ b/src/charset/encodings/utf8.c
@@ -0,0 +1,175 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/utf8.h>
+#include "charset/encodings/utf8impl.h"
+
+/** Number of continuation bytes for a given start byte (indexed by byte) */
+const uint8_t numContinuations[256] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
+};
+
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * \param s     The sequence to process
+ * \param len   Length of sequence
+ * \param ucs4  Pointer to location to receive UCS4 character (host endian)
+ * \param clen  Pointer to location to receive byte length of UTF-8 sequence
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen)
+{
+	parserutils_error error;
+
+	/* Function-call wrapper: UTF8_TO_UCS4 reports its status via error */
+	UTF8_TO_UCS4(s, len, ucs4, clen, error);
+	return error;
+}
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * \param ucs4  The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s     Pointer to pointer to output buffer, updated on exit
+ * \param len   Pointer to length, in bytes, of output buffer, updated on exit
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, 
+		uint8_t **s, size_t *len)
+{
+	parserutils_error error;
+
+	/* Function-call wrapper: the macro is expected to set error */
+	UTF8_FROM_UCS4(ucs4, s, len, error);
+	return error;
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * \param s    The string
+ * \param max  Maximum length
+ * \param len  Pointer to location to receive length of string
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	parserutils_error error;
+
+	/* Function-call wrapper: the macro is expected to set error */
+	UTF8_LENGTH(s, max, len, error);
+	return error;
+}
+
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * \param s    Pointer to start of character
+ * \param len  Pointer to location to receive length
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	parserutils_error error;
+
+	/* Function-call wrapper: the macro is expected to set error */
+	UTF8_CHAR_BYTE_LENGTH(s, len, error);
+	return error;
+}
+
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	parserutils_error error;
+
+	/* Function-call wrapper: the macro is expected to set error */
+	UTF8_PREV(s, off, prevoff, error);
+	return error;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	parserutils_error error;
+
+	/* Function-call wrapper: the macro is expected to set error */
+	UTF8_NEXT(s, len, off, nextoff, error);
+	return error;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s, 
+		uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+	parserutils_error error;
+
+	/* Function-call wrapper: the macro is expected to set error */
+	UTF8_NEXT_PARANOID(s, len, off, nextoff, error);
+	return error;
+}
+
diff --git a/src/charset/encodings/utf8impl.h b/src/charset/encodings/utf8impl.h
new file mode 100644
index 0000000..1ca9de7
--- /dev/null
+++ b/src/charset/encodings/utf8impl.h
@@ -0,0 +1,339 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_encodings_utf8impl_h_
+#define parserutils_charset_encodings_utf8impl_h_
+
+/** \file
+ * UTF-8 manipulation macros (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+/** Number of continuation bytes for a given start byte */
+extern const uint8_t numContinuations[256];
+
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however (up to 6 byte sequences).
+ * On failure \a ucs4 and \a clen are not written; only \a error is set.
+ * \param s      The sequence to process
+ * \param len    Length of sequence, in bytes
+ * \param ucs4   Pointer to location to receive UCS4 character (host endian)
+ * \param clen   Pointer to location to receive byte length of UTF-8 sequence
+ * \param error  Location to receive error code
+ */
+#define UTF8_TO_UCS4(s, len, ucs4, clen, error)				\
+do {									\
+	uint32_t c, min;						\
+	uint8_t n;							\
+									\
+	error = PARSERUTILS_OK;						\
+									\
+	if (s == NULL || ucs4 == NULL || clen == NULL) {		\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	if (len == 0) {							\
+		error = PARSERUTILS_NEEDDATA;				\
+		break;							\
+	}								\
+	/* Lead byte determines length (n) and minimum value (min) */	\
+	c = s[0];							\
+									\
+	if (c < 0x80) {							\
+		n = 1;							\
+		min = 0;						\
+	} else if ((c & 0xE0) == 0xC0) {				\
+		c &= 0x1F;						\
+		n = 2;							\
+		min = 0x80;						\
+	} else if ((c & 0xF0) == 0xE0) {				\
+		c &= 0x0F;						\
+		n = 3;							\
+		min = 0x800;						\
+	} else if ((c & 0xF8) == 0xF0) {				\
+		c &= 0x07;						\
+		n = 4;							\
+		min = 0x10000;						\
+	} else if ((c & 0xFC) == 0xF8) {				\
+		c &= 0x03;						\
+		n = 5;							\
+		min = 0x200000;						\
+	} else if ((c & 0xFE) == 0xFC) {				\
+		c &= 0x01;						\
+		n = 6;							\
+		min = 0x4000000;					\
+	} else {							\
+		error = PARSERUTILS_INVALID;				\
+		break;							\
+	}								\
+	/* Whole sequence must be present before decoding it */		\
+	if (len < n) {							\
+		error = PARSERUTILS_NEEDDATA;				\
+		break;							\
+	}								\
+	/* Fold in the low 6 bits of each continuation byte */		\
+	for (uint8_t i = 1; i < n; i++) {				\
+		uint32_t t = s[i];					\
+									\
+		if ((t & 0xC0) != 0x80) {				\
+			error = PARSERUTILS_INVALID;			\
+			break;						\
+		}							\
+									\
+		c <<= 6;						\
+		c |= t & 0x3F;						\
+	}								\
+	/* (break above only leaves the for loop, hence this test) */	\
+	if (error == PARSERUTILS_OK) {					\
+		/* Detect overlong sequences, surrogates and fffe/ffff */ \
+		if (c < min || (c >= 0xD800 && c <= 0xDFFF) ||		\
+				c == 0xFFFE || c == 0xFFFF) {		\
+			error = PARSERUTILS_INVALID;			\
+			break;						\
+		}							\
+									\
+		*ucs4 = c;						\
+		*clen = n;						\
+	}								\
+} while(0)
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however.
+ * Note: \a ucs4 must be a modifiable lvalue; it is shifted during encoding.
+ * \param ucs4   The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s      Pointer to pointer to output buffer, updated on exit
+ * \param len    Pointer to length, in bytes, of output buffer, updated on exit
+ * \param error  Location to receive error code
+ */
+#define UTF8_FROM_UCS4(ucs4, s, len, error)				\
+do {									\
+	uint8_t *buf;							\
+	uint8_t l = 0;							\
+									\
+	error = PARSERUTILS_OK;						\
+									\
+	if (s == NULL || *s == NULL || len == NULL) {			\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+	/* Work out how many bytes the encoded form occupies */		\
+	if (ucs4 < 0x80) {						\
+		l = 1;							\
+	} else if (ucs4 < 0x800) {					\
+		l = 2;							\
+	} else if (ucs4 < 0x10000) {					\
+		l = 3;							\
+	} else if (ucs4 < 0x200000) {					\
+		l = 4;							\
+	} else if (ucs4 < 0x4000000) {					\
+		l = 5;							\
+	} else if (ucs4 <= 0x7FFFFFFF) {				\
+		l = 6;							\
+	} else {							\
+		error = PARSERUTILS_INVALID;				\
+		break;							\
+	}								\
+	/* Refuse to write beyond the end of the output buffer */	\
+	if (l > *len) {							\
+		error = PARSERUTILS_NOMEM;				\
+		break;							\
+	}								\
+									\
+	buf = *s;							\
+	/* Trailing bytes are written back to front, lead byte last */	\
+	if (l == 1) {							\
+		buf[0] = (uint8_t) ucs4;				\
+	} else {							\
+		for (uint8_t i = l; i > 1; i--) {			\
+			buf[i - 1] = 0x80 | (ucs4 & 0x3F);		\
+			ucs4 >>= 6;					\
+		}							\
+		buf[0] = ~((1 << (8 - l)) - 1) | ucs4;			\
+	}								\
+	/* Consume the space just used from the caller's buffer */	\
+	*s += l;							\
+	*len -= l;							\
+} while(0)
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * \param s      The string (modifiable pointer; advanced while counting)
+ * \param max    Maximum length of the string, in bytes
+ * \param len    Pointer to location to receive length of string
+ * \param error  Location to receive error code
+ */
+#define UTF8_LENGTH(s, max, len, error)					\
+do {									\
+	const uint8_t *end;						\
+	int l = 0;							\
+									\
+	error = PARSERUTILS_OK;						\
+									\
+	if (s == NULL || len == NULL) {					\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+	end = s + max; /* deferred until s is known to be non-NULL */	\
+	while (s < end) {						\
+		uint32_t c = s[0];					\
+		/* Step by the length implied by the start byte */	\
+		if ((c & 0x80) == 0x00)					\
+			s += 1;						\
+		else if ((c & 0xE0) == 0xC0)				\
+			s += 2;						\
+		else if ((c & 0xF0) == 0xE0)				\
+			s += 3;						\
+		else if ((c & 0xF8) == 0xF0)				\
+			s += 4;						\
+		else if ((c & 0xFC) == 0xF8)				\
+			s += 5;						\
+		else if ((c & 0xFE) == 0xFC)				\
+			s += 6;						\
+		else {							\
+			error = PARSERUTILS_INVALID;			\
+			break;						\
+		}							\
+									\
+		l++;							\
+	}								\
+									\
+	if (error == PARSERUTILS_OK)					\
+		*len = l;						\
+} while(0)
+
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * \param s      Pointer to start of character (only the first byte is read)
+ * \param len    Pointer to location to receive length
+ * \param error  Location to receive error code
+ */
+#define UTF8_CHAR_BYTE_LENGTH(s, len, error)				\
+do {									\
+	if (s == NULL || len == NULL) {					\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+	/* The start byte alone determines the sequence length */	\
+	*len = numContinuations[s[0]] + 1 /* Start byte */;		\
+									\
+	error = PARSERUTILS_OK;						\
+} while(0)
+
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at (modified by the macro)
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \param error    Location to receive error code
+ */
+#define UTF8_PREV(s, off, prevoff, error)				\
+do {									\
+	if (s == NULL || prevoff == NULL) {				\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+	/* Step back until a byte that isn't a continuation byte */	\
+	while (off != 0 && (s[--off] & 0xC0) == 0x80)			\
+		/* do nothing */;					\
+									\
+	*prevoff = off;							\
+									\
+	error = PARSERUTILS_OK;						\
+} while(0)
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at (modified by the macro)
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character (equal to \a len if none follows)
+ * \param error    Location to receive error code
+ */
+#define UTF8_NEXT(s, len, off, nextoff, error)				\
+do {									\
+	if (s == NULL || off >= len || nextoff == NULL) {		\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	/* Skip current start byte (if present - may be mid-sequence) */\
+	if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0)			\
+		off++;							\
+	/* Then skip any continuation bytes (10xxxxxx) that follow */	\
+	while (off < len && (s[off] & 0xC0) == 0x80)			\
+		off++;							\
+									\
+	*nextoff = off;							\
+									\
+	error = PARSERUTILS_OK;						\
+} while(0)
+
+/**
+ * Skip to start of next sequence in UTF-8 input
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at (modified by the macro)
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character (not written on error)
+ * \param error    Location to receive error code
+ */
+#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error)			\
+do {									\
+	uint8_t c;							\
+									\
+	error = PARSERUTILS_OK;						\
+									\
+	if (s == NULL || off >= len || nextoff == NULL) {		\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	c = s[off];							\
+									\
+	/* If we're mid-sequence, simply advance to next byte */	\
+	if (!(c < 0x80 || (c & 0xC0) == 0xC0)) {			\
+		off++;							\
+	} else {							\
+		uint32_t nCont = numContinuations[c];			\
+		uint32_t nToSkip;					\
+		/* NOTE(review): true even if sequence ends at len - ok? */ \
+		if (off + nCont + 1 >= len) {				\
+			error = PARSERUTILS_NEEDDATA;			\
+			break;						\
+		}							\
+									\
+		/* Verify continuation bytes */				\
+		for (nToSkip = 1; nToSkip <= nCont; nToSkip++) {	\
+			if ((s[off + nToSkip] & 0xC0) != 0x80)		\
+				break;					\
+		}							\
+									\
+		/* Skip the start byte and its valid continuations */	\
+		off += nToSkip;						\
+	}								\
+									\
+	*nextoff = off;							\
+} while(0)
+
+#endif
diff --git a/src/input/Makefile b/src/input/Makefile
new file mode 100644
index 0000000..d62740e
--- /dev/null
+++ b/src/input/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT		The name of the component
+# EXPORT		The location of the export directory
+# TOP			The location of the source tree root
+# RELEASEDIR		The place to put release objects
+# DEBUGDIR		The place to put debug objects
+#
+# do_include		Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR			The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN		The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
+# TARGET_TESTS		The list of target names to run for "make test"
+#
+# SOURCES		The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack (sp gains one ".x" suffix)
+sp             := $(sp).x
+dirstack_$(sp) := $(d)
+d              := $(DIR)
+
+# Sources built from this directory (names are relative to $(d))
+SRCS_$(d) := filter.c inputstream.c
+
+# Append to sources for component, prefixing each name with this directory
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have (each */Makefile found below $(d))
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack ($(basename) drops one ".x" suffix)
+d  := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/input/filter.c b/src/input/filter.c
new file mode 100644
index 0000000..f40c98f
--- /dev/null
+++ b/src/input/filter.c
@@ -0,0 +1,384 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef WITH_ICONV_FILTER
+#include <iconv.h>
+#endif
+
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/codec.h>
+
+#include "input/filter.h"
+#include "utils/utils.h"
+
+/** Input filter: converts data in the input encoding to an internal one */
+struct parserutils_filter {
+#ifdef WITH_ICONV_FILTER
+	iconv_t cd;			/**< Iconv descriptor, or (iconv_t) -1 */
+	uint16_t int_enc;		/**< MIB enum of internal encoding */
+#else
+	parserutils_charset_codec *read_codec;	/**< Decodes input to pivot */
+	parserutils_charset_codec *write_codec;	/**< Encodes pivot to output */
+
+	uint32_t pivot_buf[64];		/**< Conversion pivot buffer */
+
+	bool leftover;			/**< Data remains from last call */
+	uint8_t *pivot_left;		/**< Remaining pivot to write */
+	size_t pivot_len;		/**< Bytes of pivot remaining */
+#endif
+
+	struct {
+		uint16_t encoding;	/**< Input encoding (MIB enum) */
+	} settings;			/**< Filter settings */
+
+	parserutils_alloc alloc;	/**< Memory (de)allocation function */
+	void *pw;			/**< Client private data */
+};
+
+static parserutils_error filter_set_defaults(parserutils_filter *input);
+static parserutils_error filter_set_encoding(parserutils_filter *input,
+		const char *enc);
+
+/**
+ * Create an input filter
+ *
+ * \param int_enc  Name of the encoding that input is to be converted into
+ * \param alloc    Function used to (de)allocate data
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to filter instance, or NULL on failure
+ */
+parserutils_filter *parserutils_filter_create(const char *int_enc,
+		parserutils_alloc alloc, void *pw)
+{
+	parserutils_filter *f;
+
+	if (alloc == NULL || int_enc == NULL)
+		return NULL;
+
+	f = alloc(NULL, sizeof(*f), pw);
+	if (f == NULL)
+		return NULL;
+
+	f->alloc = alloc;
+	f->pw = pw;
+
+#ifdef WITH_ICONV_FILTER
+	f->cd = (iconv_t) -1;
+	f->int_enc = parserutils_charset_mibenum_from_name(
+			int_enc, strlen(int_enc));
+	if (f->int_enc == 0)
+		goto fail;
+#else
+	f->leftover = false;
+	f->pivot_left = NULL;
+	f->pivot_len = 0;
+#endif
+
+	/* This also selects the default input encoding */
+	if (filter_set_defaults(f) != PARSERUTILS_OK)
+		goto fail;
+
+#ifndef WITH_ICONV_FILTER
+	f->write_codec = parserutils_charset_codec_create(int_enc, alloc, pw);
+	if (f->write_codec == NULL) {
+		if (f->read_codec != NULL)
+			parserutils_charset_codec_destroy(f->read_codec);
+		goto fail;
+	}
+#endif
+
+	return f;
+
+fail:
+	/* Nothing but the filter structure itself remains to be released */
+	alloc(f, 0, pw);
+	return NULL;
+}
+
+/**
+ * Destroy an input filter, releasing everything it owns
+ *
+ * \param input  Pointer to filter instance (NULL is ignored)
+ */
+void parserutils_filter_destroy(parserutils_filter *input)
+{
+	parserutils_alloc release;
+
+	if (input == NULL)
+		return;
+
+#ifdef WITH_ICONV_FILTER
+	if (input->cd != (iconv_t) -1)
+		iconv_close(input->cd);
+#else
+	if (input->read_codec != NULL)
+		parserutils_charset_codec_destroy(input->read_codec);
+	if (input->write_codec != NULL)
+		parserutils_charset_codec_destroy(input->write_codec);
+#endif
+
+	release = input->alloc;
+	release(input, 0, input->pw);
+}
+
+/**
+ * Configure an input filter
+ *
+ * \param input   Pointer to filter instance
+ * \param type    Input option type to configure
+ * \param params  Option-specific parameters (the member read depends on
+ *                \a type)
+ * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM on NULL arguments
+ *         or an unrecognised option type, appropriate error otherwise
+ */
+parserutils_error parserutils_filter_setopt(parserutils_filter *input,
+		parserutils_filter_opttype type,
+		parserutils_filter_optparams *params)
+{
+	if (input == NULL || params == NULL)
+		return PARSERUTILS_BADPARM;
+
+	switch (type) {
+	case PARSERUTILS_FILTER_SET_ENCODING:
+		return filter_set_encoding(input, params->encoding.name);
+	default:
+		/* Don't claim success for an option that wasn't applied */
+		return PARSERUTILS_BADPARM;
+	}
+}
+
+/**
+ * Process a chunk of data
+ *
+ * \param input   Pointer to filter instance
+ * \param data    Pointer to pointer to input buffer, updated on exit
+ * \param len     Pointer to length of input buffer, updated on exit
+ * \param output  Pointer to pointer to output buffer, updated on exit
+ * \param outlen  Pointer to length of output buffer, updated on exit
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ *
+ * Call this with an input buffer length of 0 to flush any buffers.
+ */
+parserutils_error parserutils_filter_process_chunk(parserutils_filter *input,
+		const uint8_t **data, size_t *len,
+		uint8_t **output, size_t *outlen)
+{
+	if (input == NULL || data == NULL || *data == NULL || len == NULL ||
+			output == NULL || *output == NULL || outlen == NULL)
+		return PARSERUTILS_BADPARM;
+
+#ifdef WITH_ICONV_FILTER
+	size_t ret;
+
+	ret = iconv(input->cd, (char **) data, len, (char **) output, outlen);
+	if (ret != (size_t) -1)
+		return PARSERUTILS_OK;
+
+	if (errno == E2BIG)
+		return PARSERUTILS_NOMEM;
+	/* Failures other than illegal input (EILSEQ) are not reported */
+	if (errno != EILSEQ)
+		return PARSERUTILS_OK;
+
+	if (*outlen < 3)
+		return PARSERUTILS_NOMEM;
+
+	/* Emit the UTF-8 encoding of U+FFFD in place of the bad input */
+	(*output)[0] = 0xef;
+	(*output)[1] = 0xbf;
+	(*output)[2] = 0xbd;
+	*output += 3;
+	*outlen -= 3;
+
+	/* Skip the offending byte, and any bad bytes directly after it */
+	(*data)++;
+	(*len)--;
+
+	while (*len > 0) {
+		ret = iconv(input->cd, (char **) data, len,
+				(char **) output, outlen);
+		if (ret != (size_t) -1)
+			return PARSERUTILS_OK;
+		/* errno is only examined once a call is known to have failed */
+		if (errno != EILSEQ)
+			return errno == E2BIG ? PARSERUTILS_NOMEM
+					      : PARSERUTILS_OK;
+
+		(*data)++;
+		(*len)--;
+	}
+
+	return PARSERUTILS_OK;
+#else
+	parserutils_error read_error, write_error;
+
+	if (input->leftover) {
+		/* Pivot data is left over from last call: try to flush it */
+		write_error = parserutils_charset_codec_encode(
+				input->write_codec,
+				(const uint8_t **) &input->pivot_left,
+				&input->pivot_len,
+				output, outlen);
+
+		if (write_error != PARSERUTILS_OK)
+			return write_error;
+
+		/* And clear leftover */
+		input->pivot_left = NULL;
+		input->pivot_len = 0;
+		input->leftover = false;
+	}
+
+	while (*len > 0) {
+		size_t pivot_len = sizeof(input->pivot_buf);
+		uint8_t *pivot = (uint8_t *) input->pivot_buf;
+
+		read_error = parserutils_charset_codec_decode(input->read_codec,
+				data, len,
+				(uint8_t **) &pivot, &pivot_len);
+
+		pivot = (uint8_t *) input->pivot_buf;
+		pivot_len = sizeof(input->pivot_buf) - pivot_len;
+
+		if (pivot_len > 0) {
+			write_error = parserutils_charset_codec_encode(
+					input->write_codec,
+					(const uint8_t **) &pivot,
+					&pivot_len,
+					output, outlen);
+
+			if (write_error != PARSERUTILS_OK) {
+				input->leftover = true;
+				input->pivot_left = pivot;
+				input->pivot_len = pivot_len;
+				return write_error;
+			}
+		}
+
+		if (read_error != PARSERUTILS_OK &&
+				read_error != PARSERUTILS_NOMEM)
+			return read_error;
+	}
+
+	return PARSERUTILS_OK;
+#endif
+}
+
+/**
+ * Reset an input filter's state
+ *
+ * \param input  The input filter to reset
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_filter_reset(parserutils_filter *input)
+{
+	if (input == NULL)
+		return PARSERUTILS_BADPARM;
+
+#ifdef WITH_ICONV_FILTER
+	if (input->cd != (iconv_t) -1)	/* nothing to reset otherwise */
+		iconv(input->cd, NULL, NULL, NULL, NULL);
+#else
+	parserutils_error error;
+	/* Clear pivot buffer leftovers */
+	input->pivot_left = NULL;
+	input->pivot_len = 0;
+	input->leftover = false;
+
+	/* Reset read codec */
+	error = parserutils_charset_codec_reset(input->read_codec);
+	if (error != PARSERUTILS_OK)
+		return error;
+
+	/* Reset write codec */
+	error = parserutils_charset_codec_reset(input->write_codec);
+	if (error != PARSERUTILS_OK)
+		return error;
+#endif
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Put an input filter into its default configuration
+ *
+ * \param input  Input filter to configure
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ *
+ * Any codec pointers are overwritten (not destroyed) and the input encoding
+ * becomes UTF-8, so callers must not be holding live codecs in the filter.
+ */
+parserutils_error filter_set_defaults(parserutils_filter *input)
+{
+	if (input == NULL)
+		return PARSERUTILS_BADPARM;
+
+#ifndef WITH_ICONV_FILTER
+	/* No codecs exist yet */
+	input->write_codec = NULL;
+	input->read_codec = NULL;
+#endif
+
+	/* Zero is not a valid MIB enum, so the call below never exits early */
+	input->settings.encoding = 0;
+
+	return filter_set_encoding(input, "UTF-8");
+}
+
+/**
+ * Set an input filter's encoding
+ *
+ * \param input  Input filter to configure
+ * \param enc    Encoding name
+ * \return PARSERUTILS_OK on success, appropriate error otherwise (in which
+ *         case the filter's existing converter is left untouched)
+ */
+parserutils_error filter_set_encoding(parserutils_filter *input,
+		const char *enc)
+{
+	uint16_t mibenum;
+
+	if (input == NULL || enc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	mibenum = parserutils_charset_mibenum_from_name(enc, strlen(enc));
+	if (mibenum == 0)
+		return PARSERUTILS_INVALID;
+
+	/* Exit early if we're already using this encoding */
+	if (input->settings.encoding == mibenum)
+		return PARSERUTILS_OK;
+
+	/* Build the replacement first so a failure leaves the filter usable */
+#ifdef WITH_ICONV_FILTER
+	iconv_t cd = iconv_open(
+		parserutils_charset_mibenum_to_name(input->int_enc), enc);
+	if (cd == (iconv_t) -1)
+		return errno == EINVAL ? PARSERUTILS_INVALID
+				       : PARSERUTILS_NOMEM;
+	if (input->cd != (iconv_t) -1)
+		iconv_close(input->cd);
+	input->cd = cd;
+#else
+	parserutils_charset_codec *codec = parserutils_charset_codec_create(
+			enc, input->alloc, input->pw);
+	if (codec == NULL)
+		return PARSERUTILS_NOMEM;
+	if (input->read_codec != NULL)
+		parserutils_charset_codec_destroy(input->read_codec);
+	input->read_codec = codec;
+#endif
+
+	input->settings.encoding = mibenum;
+
+	return PARSERUTILS_OK;
+}
diff --git a/src/input/filter.h b/src/input/filter.h
new file mode 100644
index 0000000..96941a6
--- /dev/null
+++ b/src/input/filter.h
@@ -0,0 +1,57 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_input_filter_h_
+#define parserutils_input_filter_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+typedef struct parserutils_filter parserutils_filter;
+
+/**
+ * Input filter option types, as accepted by parserutils_filter_setopt()
+ */
+typedef enum parserutils_filter_opttype {
+	PARSERUTILS_FILTER_SET_ENCODING       = 0, /**< Set input encoding */
+} parserutils_filter_opttype;
+
+/**
+ * Input filter option parameters; the member used depends on the option type
+ */
+typedef union parserutils_filter_optparams {
+	/** Parameters for PARSERUTILS_FILTER_SET_ENCODING */
+	struct {
+		/** Name of the encoding that the input is in */
+		const char *name;
+	} encoding;
+} parserutils_filter_optparams;
+
+
+/* Create an input filter */
+parserutils_filter *parserutils_filter_create(const char *int_enc,
+		parserutils_alloc alloc, void *pw);
+/* Destroy an input filter */
+void parserutils_filter_destroy(parserutils_filter *input);
+
+/* Configure an input filter */
+parserutils_error parserutils_filter_setopt(parserutils_filter *input,
+		parserutils_filter_opttype type,
+		parserutils_filter_optparams *params);
+
+/* Process a chunk of data */
+parserutils_error parserutils_filter_process_chunk(parserutils_filter *input,
+		const uint8_t **data, size_t *len,
+		uint8_t **output, size_t *outlen);
+
+/* Reset an input filter's state */
+parserutils_error parserutils_filter_reset(parserutils_filter *input);
+
+#endif
+
diff --git a/src/input/inputstream.c b/src/input/inputstream.c
new file mode 100644
index 0000000..fd44995
--- /dev/null
+++ b/src/input/inputstream.c
@@ -0,0 +1,477 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/utf8.h>
+#include <parserutils/input/inputstream.h>
+
+#include "input/filter.h"
+#include "utils/utils.h"
+
+/**
+ * Private input stream definition (public stream pointers are cast to this)
+ */
+typedef struct parserutils_inputstream_private {
+	parserutils_inputstream public;	/**< Public part. Must be first */
+
+	parserutils_buffer *raw;	/**< Buffer containing raw data */
+
+	bool done_first_chunk;		/**< Whether the first chunk has
+					 * been processed */
+
+	uint16_t mibenum;		/**< MIB enum for charset, or 0 */
+	uint32_t encsrc;		/**< Charset source (0 is lowest) */
+
+	parserutils_filter *input;	/**< Charset conversion filter */
+
+	parserutils_charset_detect_func csdetect; /**< Detection fn, or NULL */
+
+	parserutils_alloc alloc;	/**< Memory (de)allocation function */
+	void *pw;			/**< Client private data */
+} parserutils_inputstream_private;
+
+static inline parserutils_error parserutils_inputstream_refill_buffer(
+		parserutils_inputstream_private *stream);
+static inline parserutils_error parserutils_inputstream_strip_bom(
+		uint16_t mibenum, parserutils_buffer *buffer);
+
+/**
+ * Create an input stream
+ *
+ * \param enc       Charset name, or NULL (or unknown name) to autodetect
+ * \param encsrc    Value for encoding source, if specified, or 0
+ * \param csdetect  Charset detection function, or NULL
+ * \param alloc     Memory (de)allocation function
+ * \param pw        Pointer to client-specific private data (may be NULL)
+ * \return Pointer to stream instance, or NULL on failure
+ *
+ * The value 0 is defined as being the lowest priority encoding source
+ * (i.e. the default fallback encoding). Beyond this, no further
+ * interpretation is made upon the encoding source.
+ */
+parserutils_inputstream *parserutils_inputstream_create(const char *enc,
+		uint32_t encsrc, parserutils_charset_detect_func csdetect,
+		parserutils_alloc alloc, void *pw)
+{
+	parserutils_inputstream_private *stream;
+
+	if (alloc == NULL)
+		return NULL;
+
+	stream = alloc(NULL, sizeof(parserutils_inputstream_private), pw);
+	if (stream == NULL)
+		return NULL;
+
+	stream->raw = parserutils_buffer_create(alloc, pw);
+	if (stream->raw == NULL) {
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	stream->public.utf8 = parserutils_buffer_create(alloc, pw);
+	if (stream->public.utf8 == NULL) {
+		parserutils_buffer_destroy(stream->raw);
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	stream->public.cursor = 0;
+	stream->public.had_eof = false;
+	stream->done_first_chunk = false;
+
+	stream->input = parserutils_filter_create("UTF-8", alloc, pw);
+	if (stream->input == NULL) {
+		parserutils_buffer_destroy(stream->public.utf8);
+		parserutils_buffer_destroy(stream->raw);
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	/* Until a usable charset is supplied, none is known */
+	stream->mibenum = 0;
+	stream->encsrc = 0;
+
+	if (enc != NULL) {
+		parserutils_error error;
+		parserutils_filter_optparams params;
+
+		stream->mibenum =
+			parserutils_charset_mibenum_from_name(enc, strlen(enc));
+
+		if (stream->mibenum != 0) {
+			params.encoding.name = enc;
+			error = parserutils_filter_setopt(stream->input,
+					PARSERUTILS_FILTER_SET_ENCODING,
+					&params);
+			if (error != PARSERUTILS_OK &&
+					error != PARSERUTILS_INVALID) {
+				parserutils_filter_destroy(stream->input);
+				parserutils_buffer_destroy(stream->public.utf8);
+				parserutils_buffer_destroy(stream->raw);
+				alloc(stream, 0, pw);
+				return NULL;
+			}
+
+			stream->encsrc = encsrc;
+		}
+	}
+
+	stream->csdetect = csdetect;
+
+	stream->alloc = alloc;
+	stream->pw = pw;
+
+	return (parserutils_inputstream *) stream;
+}
+
+/**
+ * Destroy an input stream, along with its filter and both of its buffers
+ *
+ * \param stream  Input stream to destroy (NULL is ignored)
+ */
+void parserutils_inputstream_destroy(parserutils_inputstream *stream)
+{
+	parserutils_inputstream_private *priv;
+
+	if (stream == NULL)
+		return;
+
+	priv = (parserutils_inputstream_private *) stream;
+	parserutils_filter_destroy(priv->input);
+	parserutils_buffer_destroy(priv->public.utf8);
+	parserutils_buffer_destroy(priv->raw);
+	priv->alloc(priv, 0, priv->pw);
+}
+
+/**
+ * Append data to an input stream
+ *
+ * \param stream  Input stream to append data to
+ * \param data    Data to append (in document charset), or NULL to flag EOF
+ * \param len     Length, in bytes, of data (unused when flagging EOF)
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_inputstream_append(
+		parserutils_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	parserutils_inputstream_private *priv;
+
+	if (stream == NULL)
+		return PARSERUTILS_BADPARM;
+
+	priv = (parserutils_inputstream_private *) stream;
+
+	if (data != NULL)
+		return parserutils_buffer_append(priv->raw, data, len);
+	/* A NULL data pointer is the end-of-file marker */
+	priv->public.had_eof = true;
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Insert data into stream at current location
+ *
+ * \param stream  Input stream to insert into
+ * \param data    Data to insert (UTF-8 encoded; it is not charset-converted)
+ * \param len     Length, in bytes, of data
+ * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM on NULL argument,
+ *         appropriate error otherwise
+ */
+parserutils_error parserutils_inputstream_insert(
+		parserutils_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	/* Only the public part of the stream is needed here */
+	if (data == NULL || stream == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* The data goes straight into the UTF-8 buffer, at the cursor */
+	return parserutils_buffer_insert(stream->utf8, stream->cursor,
+			data, len);
+}
+
+#define IS_ASCII(x) (((x) & 0x80) == 0)
+
+/**
+ * Look at the character starting offset bytes from the cursor (slow version)
+ *
+ * \param stream  Stream to look in
+ * \param offset  Byte offset of start of character
+ * \param length  Pointer to location to receive character length (in bytes)
+ * \return Pointer to character data, or EOF or OOD.
+ *
+ * Once parserutils_inputstream_advance has moved the stream cursor past the
+ * character returned by this call, no guarantee is made as to the validity
+ * of the data pointed to: dereferencing the pointer after that is a bug.
+ */
+uintptr_t parserutils_inputstream_peek_slow(parserutils_inputstream *stream,
+		size_t offset, size_t *length)
+{
+	parserutils_inputstream_private *s =
+			(parserutils_inputstream_private *) stream;
+	parserutils_error error = PARSERUTILS_OK;
+	size_t start, len;
+
+	if (stream == NULL || length == NULL)
+		return PARSERUTILS_INPUTSTREAM_OOD;
+
+	/* There's insufficient data in the buffer, so read some more */
+	if (s->raw->length == 0)
+		return s->public.had_eof ? PARSERUTILS_INPUTSTREAM_EOF
+					 : PARSERUTILS_INPUTSTREAM_OOD;
+
+	/* Refill utf8 buffer from raw buffer */
+	error = parserutils_inputstream_refill_buffer(s);
+	if (error != PARSERUTILS_OK)
+		return PARSERUTILS_INPUTSTREAM_OOD;
+
+	/* The refill need not have produced data as far as the offset */
+	start = s->public.cursor + offset;
+	if (start >= s->public.utf8->length)
+		return (s->public.had_eof && s->raw->length == 0)
+				? PARSERUTILS_INPUTSTREAM_EOF
+				: PARSERUTILS_INPUTSTREAM_OOD;
+
+	/* Now try the read */
+	if (IS_ASCII(s->public.utf8->data[start])) {
+		len = 1;
+	} else {
+		error = parserutils_charset_utf8_char_byte_length(
+			s->public.utf8->data + start, &len);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NEEDDATA)
+			return PARSERUTILS_INPUTSTREAM_OOD;
+		/* Never report a character that runs off the buffer's end */
+		if (error == PARSERUTILS_NEEDDATA ||
+				len > s->public.utf8->length - start)
+			return s->public.had_eof ? PARSERUTILS_INPUTSTREAM_EOF
+						 : PARSERUTILS_INPUTSTREAM_OOD;
+	}
+
+	*length = len;
+	return (uintptr_t) (s->public.utf8->data + start);
+}
+
+#undef IS_ASCII
+
+/**
+ * Read the source charset of the input stream
+ *
+ * \param stream  Input stream to query
+ * \param source  Pointer to location to receive charset source identifier
+ * \return Pointer to charset name (constant; do not free)
+ */
+const char *parserutils_inputstream_read_charset(
+		parserutils_inputstream *stream, uint32_t *source)
+{
+	const parserutils_inputstream_private *priv;
+
+	if (source == NULL || stream == NULL)
+		return NULL;
+
+	priv = (const parserutils_inputstream_private *) stream;
+	*source = priv->encsrc;
+
+	/* A source of 0 is always reported as UTF-8 */
+	return (priv->encsrc != 0)
+			? parserutils_charset_mibenum_to_name(priv->mibenum)
+			: "UTF-8";
+}
+
+/******************************************************************************
+ ******************************************************************************/
+
+/**
+ * Refill the UTF-8 buffer from the raw buffer
+ *
+ * \param stream  The inputstream to operate on
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_inputstream_refill_buffer(
+		parserutils_inputstream_private *stream)
+{
+	parserutils_buffer *out = stream->public.utf8;
+	const uint8_t *raw;
+	uint8_t *utf8;
+	size_t raw_length, utf8_space;
+	parserutils_error error;
+
+	/* If this is the first chunk of data, we must detect the charset and
+	 * strip the BOM, if one exists */
+	if (!stream->done_first_chunk) {
+		parserutils_filter_optparams params;
+
+		if (stream->csdetect != NULL) {
+			error = stream->csdetect(stream->raw->data,
+				stream->raw->length,
+				&stream->mibenum, &stream->encsrc);
+			if (error != PARSERUTILS_OK)
+				return error;
+		}
+
+		/* Default to UTF-8 only if there is still no charset, so as
+		 * not to clobber one supplied when the stream was created */
+		if (stream->mibenum == 0) {
+			stream->mibenum = parserutils_charset_mibenum_from_name(
+					"UTF-8", SLEN("UTF-8"));
+			stream->encsrc = 0;
+		}
+
+		if (stream->mibenum == 0)
+			abort();
+
+		/* Ensure the filter decodes using the charset settled upon */
+		params.encoding.name =
+			parserutils_charset_mibenum_to_name(stream->mibenum);
+		error = parserutils_filter_setopt(stream->input,
+				PARSERUTILS_FILTER_SET_ENCODING, &params);
+		if (error != PARSERUTILS_OK)
+			return error;
+
+		error = parserutils_inputstream_strip_bom(stream->mibenum,
+				stream->raw);
+		if (error != PARSERUTILS_OK)
+			return error;
+
+		stream->done_first_chunk = true;
+	}
+
+	/* Shift any unread data to the bottom of the buffer. The cursor is
+	 * reset at once, so the stream is consistent if a later step fails */
+	out->length -= stream->public.cursor;
+	if (stream->public.cursor > 0 && out->length > 0)
+		memmove(out->data, out->data + stream->public.cursor,
+				out->length);
+	stream->public.cursor = 0;
+
+	/* If the buffer's still over half full, extend it */
+	if (out->length > out->allocated / 2) {
+		error = parserutils_buffer_grow(out);
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	utf8 = out->data + out->length;
+	utf8_space = out->allocated - out->length;
+
+	raw = stream->raw->data;
+	raw_length = stream->raw->length;
+
+	/* Try to fill utf8 buffer from the raw data */
+	error = parserutils_filter_process_chunk(stream->input,
+			&raw, &raw_length, &utf8, &utf8_space);
+	/* _NOMEM implies that there's more input to read than available space
+	 * in the utf8 buffer. That's fine, so we'll ignore that error. */
+	if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM)
+		return error;
+
+	/* Remove the raw data we've processed from the raw buffer */
+	error = parserutils_buffer_discard(stream->raw, 0,
+			stream->raw->length - raw_length);
+	if (error != PARSERUTILS_OK)
+		return error;
+
+	/* Fix up the utf8 buffer information */
+	out->length = out->allocated - utf8_space;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Strip a BOM from a buffer in the given encoding
+ *
+ * A BOM is only removed when the charset is one of those listed below and
+ * the buffer starts with exactly the corresponding byte sequence:
+ *
+ *   UTF-8     EF BB BF
+ *   UTF-16BE  FE FF
+ *   UTF-16LE  FF FE
+ *   UTF-32BE  00 00 FE FF
+ *   UTF-32LE  FF FE 00 00
+ *
+ * In every other case the buffer is left untouched.
+ *
+ * \param mibenum  The character set of the buffer
+ * \param buffer   The buffer to process
+ * \return PARSERUTILS_OK if there was no BOM to remove, otherwise the
+ *         result of discarding the BOM bytes from the buffer
+ */
+parserutils_error parserutils_inputstream_strip_bom(uint16_t mibenum, 
+		parserutils_buffer *buffer)
+{
+	/* The byte order mark of each encoding handled here */
+	static const uint8_t bom_utf8[] = { 0xEF, 0xBB, 0xBF };
+	static const uint8_t bom_utf16be[] = { 0xFE, 0xFF };
+	static const uint8_t bom_utf16le[] = { 0xFF, 0xFE };
+	static const uint8_t bom_utf32be[] = { 0x00, 0x00, 0xFE, 0xFF };
+	static const uint8_t bom_utf32le[] = { 0xFF, 0xFE, 0x00, 0x00 };
+
+	/* MIB enum values, resolved by the lookup block below */
+	static uint16_t mib_utf8;
+	static uint16_t mib_utf16;
+	static uint16_t mib_utf16be;
+	static uint16_t mib_utf16le;
+	static uint16_t mib_utf32;
+	static uint16_t mib_utf32be;
+	static uint16_t mib_utf32le;
+
+	const uint8_t *bom;
+	size_t bom_len;
+
+	/* Perform the name lookups for as long as UTF-8's entry is unset */
+	if (mib_utf8 == 0) {
+		mib_utf8 = parserutils_charset_mibenum_from_name(
+				"UTF-8", SLEN("UTF-8"));
+		mib_utf16 = parserutils_charset_mibenum_from_name(
+				"UTF-16", SLEN("UTF-16"));
+		mib_utf16be = parserutils_charset_mibenum_from_name(
+				"UTF-16BE", SLEN("UTF-16BE"));
+		mib_utf16le = parserutils_charset_mibenum_from_name(
+				"UTF-16LE", SLEN("UTF-16LE"));
+		mib_utf32 = parserutils_charset_mibenum_from_name(
+				"UTF-32", SLEN("UTF-32"));
+		mib_utf32be = parserutils_charset_mibenum_from_name(
+				"UTF-32BE", SLEN("UTF-32BE"));
+		mib_utf32le = parserutils_charset_mibenum_from_name(
+				"UTF-32LE", SLEN("UTF-32LE"));
+	}
+
+	/** \todo Handle unmarked UTF-16 and UTF-32. Endianness is specified 
+	 * by the BOM, if present, or is assumed to be big endian. */
+
+	if (mibenum == mib_utf8) {
+		bom = bom_utf8;
+		bom_len = sizeof(bom_utf8);
+	} else if (mibenum == mib_utf16be) {
+		bom = bom_utf16be;
+		bom_len = sizeof(bom_utf16be);
+	} else if (mibenum == mib_utf16le) {
+		bom = bom_utf16le;
+		bom_len = sizeof(bom_utf16le);
+	} else if (mibenum == mib_utf32be) {
+		bom = bom_utf32be;
+		bom_len = sizeof(bom_utf32be);
+	} else if (mibenum == mib_utf32le) {
+		bom = bom_utf32le;
+		bom_len = sizeof(bom_utf32le);
+	} else {
+		/* Not an encoding whose BOM is stripped here */
+		return PARSERUTILS_OK;
+	}
+
+	/* Leave the buffer alone unless it really does begin with the BOM */
+	if (buffer->length < bom_len || 
+			memcmp(buffer->data, bom, bom_len) != 0)
+		return PARSERUTILS_OK;
+
+	return parserutils_buffer_discard(buffer, 0, bom_len);
+}
+
diff --git a/src/parserutils.c b/src/parserutils.c
new file mode 100644
index 0000000..ed9b21f
--- /dev/null
+++ b/src/parserutils.c
@@ -0,0 +1,54 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <parserutils/parserutils.h>
+
+#include "charset/charset.h"
+
+/**
+ * Prepare the ParserUtils library for use.
+ *
+ * No other libparserutils function may be used until this has been called.
+ * All it currently does is initialise the charset component.
+ *
+ * \param aliases_file  Pointer to name of file containing encoding alias data
+ * \param alloc         Pointer to (de)allocation function
+ * \param pw            Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM if aliases_file or
+ *         alloc is NULL, otherwise the error from charset initialisation.
+ */
+parserutils_error parserutils_initialise(const char *aliases_file,
+		parserutils_alloc alloc, void *pw)
+{
+	/* Both the alias file name and the allocator are mandatory */
+	if (alloc == NULL || aliases_file == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Only the charset component needs setting up; whatever it
+	 * reports, success or failure, is the result of the whole
+	 * initialisation. */
+	return parserutils_charset_initialise(aliases_file, alloc, pw);
+}
+
+/**
+ * Clean up after Libparserutils
+ *
+ * \param alloc  Pointer to (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM if alloc is NULL,
+ *         otherwise the error reported by charset finalisation.
+ */
+parserutils_error parserutils_finalise(parserutils_alloc alloc, void *pw)
+{
+	if (alloc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Propagate, rather than swallow, any charset finalisation failure */
+	return parserutils_charset_finalise(alloc, pw);
+}
+
+
diff --git a/src/utils/Makefile b/src/utils/Makefile
new file mode 100644
index 0000000..e053673
--- /dev/null
+++ b/src/utils/Makefile
@@ -0,0 +1,49 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT		The name of the component
+# EXPORT		The location of the export directory
+# TOP			The location of the source tree root
+# RELEASEDIR		The place to put release objects
+# DEBUGDIR		The place to put debug objects
+#
+# do_include		Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR			The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN		The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
+# TARGET_TESTS		The list of target names to run for "make test"
+#
+# SOURCES		The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack ($(sp) grows by one ".x")
+sp             := $(sp).x
+dirstack_$(sp) := $(d)
+d              := $(DIR)
+
+# Add this directory to the compiler's include search path
+override CFLAGS := $(CFLAGS) -I$(d)
+
+# Source files living in this directory
+SRCS_$(d) := buffer.c errors.c
+
+# Append to sources for component, qualifying each name with $(d)
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have (Makefiles one directory below $(d))
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack, restoring the parent's $(d)
+d  := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/utils/buffer.c b/src/utils/buffer.c
new file mode 100644
index 0000000..21c47fc
--- /dev/null
+++ b/src/utils/buffer.c
@@ -0,0 +1,156 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include <parserutils/utils/buffer.h>
+
+#define DEFAULT_SIZE (4096)
+
+/**
+ * Create a memory buffer
+ *
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data
+ * \return Pointer to new, empty memory buffer, or NULL on memory exhaustion
+ */
+parserutils_buffer *parserutils_buffer_create(parserutils_alloc alloc, void *pw)
+{
+	parserutils_buffer *created = alloc(NULL, sizeof(*created), pw);
+	uint8_t *storage;
+
+	if (created == NULL)
+		return NULL;
+
+	storage = alloc(NULL, DEFAULT_SIZE, pw);
+	if (storage == NULL) {
+		/* Hand the header back before reporting the failure */
+		alloc(created, 0, pw);
+		return NULL;
+	}
+
+	created->data = storage;
+	created->length = 0;
+	created->allocated = DEFAULT_SIZE;
+	created->alloc = alloc;
+	created->pw = pw;
+	return created;
+}
+
+/**
+ * Destroy a memory buffer
+ *
+ * \param buffer  The buffer to destroy (NULL is accepted and ignored)
+ */
+void parserutils_buffer_destroy(parserutils_buffer *buffer)
+{
+	if (buffer != NULL) {
+		/* Storage goes first: the header holds the allocator */
+		buffer->alloc(buffer->data, 0, buffer->pw);
+		buffer->alloc(buffer, 0, buffer->pw);
+	}
+}
+
+/**
+ * Append data to the end of a memory buffer, extending it as required
+ *
+ * \param buffer  The buffer to append to
+ * \param data    The data to append
+ * \param len     The length, in bytes, of the data to append
+ * \return PARSERUTILS_OK on success, appropriate error otherwise.
+ */
+parserutils_error parserutils_buffer_append(parserutils_buffer *buffer, 
+		const uint8_t *data, size_t len)
+{
+	/* Keep growing until strictly more than len bytes are free */
+	while (buffer->allocated - buffer->length <= len) {
+		parserutils_error status = parserutils_buffer_grow(buffer);
+		if (status != PARSERUTILS_OK)
+			return status;
+	}
+
+	memcpy(&buffer->data[buffer->length], data, len);
+	buffer->length += len;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Insert data into a memory buffer
+ *
+ * \param buffer  The buffer to insert into
+ * \param offset  The offset into the buffer to insert at
+ * \param data    The data to insert
+ * \param len     The length, in bytes, of the data to insert
+ * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM if offset lies
+ *         beyond the end of the buffer, appropriate error otherwise
+ */
+parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer, 
+		size_t offset, const uint8_t *data, size_t len)
+{
+	if (offset > buffer->length)
+		return PARSERUTILS_BADPARM;
+
+	if (offset == buffer->length)
+		return parserutils_buffer_append(buffer, data, len);
+
+	while (len >= buffer->allocated - buffer->length) {
+		parserutils_error error = parserutils_buffer_grow(buffer);
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	/* Shift the tail up to offset + len to open a gap for the new data */
+	memmove(buffer->data + offset + len, 
+			buffer->data + offset, buffer->length - offset);
+	memcpy(buffer->data + offset, data, len);
+	buffer->length += len;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Discard a section of a memory buffer
+ *
+ * \param buffer  The buffer to discard data from
+ * \param offset  The offset into the buffer of the start of the section
+ * \param len     The number of bytes to discard
+ * \return PARSERUTILS_OK on success, appropriate error otherwise.
+ */
+parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer, 
+		size_t offset, size_t len)
+{
+	if (offset >= buffer->length || len > buffer->length - offset)
+		return PARSERUTILS_BADPARM;
+
+	/* Only the bytes after the discarded section need to move down */
+	memmove(buffer->data + offset, buffer->data + offset + len, 
+			buffer->length - (len + offset));
+	buffer->length -= len;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Double the amount of space allocated for a memory buffer
+ *
+ * \param buffer  The buffer to extend
+ * \return PARSERUTILS_OK on success, PARSERUTILS_NOMEM if allocation fails
+ */
+parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer)
+{
+	const size_t doubled = buffer->allocated * 2;
+	uint8_t *resized = buffer->alloc(buffer->data, doubled, buffer->pw);
+
+	if (resized == NULL)
+		return PARSERUTILS_NOMEM;
+
+	buffer->allocated = doubled;
+	buffer->data = resized;
+	return PARSERUTILS_OK;
+}
+
diff --git a/src/utils/errors.c b/src/utils/errors.c
new file mode 100644
index 0000000..353cda1
--- /dev/null
+++ b/src/utils/errors.c
@@ -0,0 +1,70 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include <parserutils/errors.h>
+
+/**
+ * Convert a parserutils error code to a string
+ *
+ * \param error  The error code to convert
+ * \return Pointer to string representation of error, or NULL if unknown.
+ */
+const char *parserutils_error_to_string(parserutils_error error)
+{
+	/* Each recognised code returns its description directly; anything
+	 * else drops out of the switch. */
+	switch (error) {
+	case PARSERUTILS_OK:
+		return "No error";
+
+	case PARSERUTILS_NOMEM:
+		return "Insufficient memory";
+
+	case PARSERUTILS_BADPARM:
+		return "Bad parameter";
+
+	case PARSERUTILS_INVALID:
+		return "Invalid input";
+
+	case PARSERUTILS_FILENOTFOUND:
+		return "File not found";
+
+	case PARSERUTILS_NEEDDATA:
+		return "Insufficient data";
+	}
+
+	/* Not a code we know about */
+	return NULL;
+}
+
+/**
+ * Convert a string representation of an error name to a parserutils error code
+ *
+ * \param str  String containing error name
+ * \param len  Length of string (bytes)
+ * \return Error code, or PARSERUTILS_OK if unknown
+ */
+parserutils_error parserutils_error_from_string(const char *str, size_t len)
+{
+	/* Names must match in full; strncmp() over len bytes alone would also
+	 * accept any prefix of a name. Anything unmatched yields PARSERUTILS_OK */
+#define MATCHES(name) (len == sizeof(name) - 1 && strncmp(str, name, len) == 0)
+	if (MATCHES("PARSERUTILS_NOMEM"))
+		return PARSERUTILS_NOMEM;
+	if (MATCHES("PARSERUTILS_BADPARM"))
+		return PARSERUTILS_BADPARM;
+	if (MATCHES("PARSERUTILS_INVALID"))
+		return PARSERUTILS_INVALID;
+	if (MATCHES("PARSERUTILS_FILENOTFOUND"))
+		return PARSERUTILS_FILENOTFOUND;
+	if (MATCHES("PARSERUTILS_NEEDDATA"))
+		return PARSERUTILS_NEEDDATA;
+#undef MATCHES
+	return PARSERUTILS_OK;
+}
diff --git a/src/utils/utils.h b/src/utils/utils.h
new file mode 100644
index 0000000..5162945
--- /dev/null
+++ b/src/utils/utils.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_utils_h_
+#define parserutils_utils_h_
+
+#ifndef max
+#define max(a,b) ((a)>(b)?(a):(b)) /* NB: evaluates one argument twice */
+#endif
+
+#ifndef min
+#define min(a,b) ((a)<(b)?(a):(b)) /* NB: evaluates one argument twice */
+#endif
+
+#ifndef SLEN
+/* Calculate length of a string constant */
+#define SLEN(s) (sizeof((s)) - 1) /* -1 for '\0' */
+#endif
+
+#ifndef UNUSED
+#define UNUSED(x) ((void) (x)) /* no self-assignment: fine for const too */
+#endif
+
+#endif
diff --git a/test/INDEX b/test/INDEX
new file mode 100644
index 0000000..772c82f
--- /dev/null
+++ b/test/INDEX
@@ -0,0 +1,15 @@
+# Index for testcases
+#
+# Test		Description				DataDir
+
+charset		Charset initialisation/finalisation
+parserutils	Library initialisation/finalisation
+aliases		Encoding alias handling
+cscodec		Charset codec implementation		cscodec
+filter		Input stream filtering
+inputstream	Inputstream handling			input
+
+# Regression tests
+regression/cscodec-segv	Segfault in charset codecs
+regression/filter-segv	Segfault in input filtering
+regression/stream-nomem	Inputstream buffer expansion
diff --git a/test/Makefile b/test/Makefile
new file mode 100644
index 0000000..2ed0b44
--- /dev/null
+++ b/test/Makefile
@@ -0,0 +1,80 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT		The name of the component
+# EXPORT		The location of the export directory
+# TOP			The location of the source tree root
+# RELEASEDIR		The place to put release objects
+# DEBUGDIR		The place to put debug objects
+#
+# do_include		Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR			The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN		The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
+# TARGET_TESTS		The list of target names to run for "make test"
+#
+# SOURCES		The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack ($(sp) grows by one ".x")
+sp             := $(sp).x
+dirstack_$(sp) := $(d)
+d              := $(DIR)
+
+# Extend toolchain settings: search the library sources and this directory
+override CFLAGS := $(CFLAGS) -I$(TOP)/src/ -I$(d)
+
+# Tests (names are relative to this directory; each has a matching .c file)
+TESTS_$(d) := aliases cscodec charset filter inputstream parserutils
+TESTS_$(d) := $(TESTS_$(d)) regression/cscodec-segv regression/filter-segv \
+	regression/stream-nomem
+
+# Items for top-level makefile to use (test binaries, coverage data, log)
+ITEMS_CLEAN := $(ITEMS_CLEAN) \
+	$(addprefix $(d), $(addsuffix $(EXEEXT), $(TESTS_$(d)))) \
+	$(addprefix $(d), $(addsuffix .gcda, $(TESTS_$(d)))) \
+	$(addprefix $(d), $(addsuffix .gcno, $(TESTS_$(d))))
+ITEMS_DISTCLEAN := $(ITEMS_DISTCLEAN) $(d)log
+
+# Targets for top-level makefile to run
+TARGET_TESTS := $(TARGET_TESTS) test_$(d)
+
+# Now we get to hack around so that we know what directory we're in.
+# $(d) no longer exists when running the commands for a target, so we can't
+# simply use it verbatim. Assigning to a variable doesn't really help, as
+# there's no guarantee that someone else hasn't overridden that variable.
+# So, what we do is make the target depend on $(d), then pick it out of the
+# dependency list (as $<, the first prerequisite) when running commands.
+test_$(d): $(d) $(addprefix $(d), $(TESTS_$(d)))
+	@$(PERL) $(TOP)/$<testrunner.pl $(TOP)/$< $(EXEEXT)
+
+# Build rule template: $(1) is the test source, $(2) the test binary to make
+define compile_test
+$(2): $$(TOP)/$$(COMPONENT)-debug.a $(1)
+	@$$(ECHO) $$(ECHOFLAGS) "==> $(1)"
+	@$$(CC) -c -g $$(DEBUGCFLAGS) -o $$@.o $(1)
+	@$$(LD) -g -o $$@ $$@.o $$(LDFLAGS) -lparserutils-debug
+	@$$(RM) $$(RMFLAGS) $$@.o
+
+endef
+
+$(eval $(foreach TEST,$(addprefix $(d), $(TESTS_$(d))), \
+	$(call compile_test,$(addsuffix .c, $(TEST)),$(TEST))))
+
+# Now include any children we may have (Makefiles one directory below $(d))
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack, restoring the parent's $(d)
+d  := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/test/README b/test/README
new file mode 100644
index 0000000..7e41abf
--- /dev/null
+++ b/test/README
@@ -0,0 +1,84 @@
+LibParserUtils testcases
+========================
+
+Testcases for LibParserUtils are self-contained binaries which test various
+parts of the library. These may make use of external data files to drive
+the testing.
+
+Testcase command lines
+----------------------
+
+Testcase command lines are in a unified format, thus:
+
+ 	<aliases_file> [ <data_file> ]
+
+The aliases file parameter will always be specified (as it is required for
+the library to work at all).
+
+The data file parameter is optional and may be provided on a test-by-test
+basis.
+
+Testcase output
+---------------
+
+Testcases may output anything at all to stdout. The final line of the 
+output must begin with either PASS or FAIL (case sensitive), indicating 
+the success status of the test.
+
+Test Index
+----------
+
+The test sources directory contains a file, named INDEX, which provides an
+index of all available test binaries. Any new test applications should be
+added to this index as they are created.
+
+The test index file format is as follows:
+
+	file         = *line
+
+	line         = ( entry / comment / blank ) LF
+
+	entry        = testname 1*HTAB description [ 1*HTAB datadir ]
+	comment      = "#" *non-newline
+	blank        = 0<OCTET>
+
+	testname     = 1*non-reserved
+	description  = 1*non-reserved
+	datadir      = 1*non-reserved
+
+	non-newline  = VCHAR / WSP
+	non-reserved = VCHAR / SP
+
+Each entry contains a mandatory binary name and description followed by 
+an optional data directory specifier. The data directory specifier is 
+used to state the name of the directory containing data files for the 
+test name. This directory will be searched for within the "data" 
+directory in the source tree. 
+
+If a data directory is specified, the test binary will be invoked for
+each data file listed within the data directory INDEX, passing the 
+filename as the second parameter (<data_file>, above).
+
+Data Index
+----------
+
+Each test data directory contains a file, named INDEX, which provides an 
+index of all available test data files.
+
+The data index file format is as follows:
+
+	file         = *line
+
+	line         = ( entry / comment / blank ) LF
+
+	entry        = dataname 1*HTAB description
+	comment      = "#" *non-newline
+	blank        = 0<OCTET>
+
+	dataname     = 1*non-reserved
+	description  = 1*non-reserved
+
+	non-newline  = VCHAR / WSP
+	non-reserved = VCHAR / SP
+
+Each entry contains a mandatory data file name and description.
diff --git a/test/aliases.c b/test/aliases.c
new file mode 100644
index 0000000..dff31c6
--- /dev/null
+++ b/test/aliases.c
@@ -0,0 +1,62 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+
+#include "testutils.h"
+
+extern void charset_aliases_dump(void);
+
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	void *block = realloc(ptr, len);	/* pw carries nothing we need */
+	UNUSED(pw);
+	return block;
+}
+
+int main (int argc, char **argv)
+{
+	parserutils_charset_aliases_canon *canon;
+
+	if (argc != 2) {
+		printf("Usage: %s <filename>\n", argv[0]);
+		return 1;
+	}
+
+	/* Set the aliases up from the named file, then dump them */
+	parserutils_charset_aliases_create(argv[1], myrealloc, NULL);
+	parserutils_charset_aliases_dump();
+
+	/* A made-up name must not canonicalise to anything */
+	canon = parserutils_charset_alias_canonicalise("moose", 5);
+	if (canon != NULL) {
+		printf("FAIL - found invalid encoding 'moose'\n");
+		return 1;
+	}
+
+	/* These two names, on the other hand, must both be found */
+	canon = parserutils_charset_alias_canonicalise("csinvariant", 11);
+	if (canon == NULL) {
+		printf("FAIL - failed finding encoding 'csinvariant'\n");
+		return 1;
+	}
+	printf("%s %d\n", canon->name, canon->mib_enum);
+
+	canon = parserutils_charset_alias_canonicalise("nats-sefi-add", 13);
+	if (canon == NULL) {
+		printf("FAIL - failed finding encoding 'nats-sefi-add'\n");
+		return 1;
+	}
+	printf("%s %d\n", canon->name, canon->mib_enum);
+
+	/* Convert the last result both ways: name to MIB enum and back */
+	printf("%d\n", parserutils_charset_mibenum_from_name(canon->name, 
+			strlen(canon->name)));
+	printf("%s\n", parserutils_charset_mibenum_to_name(canon->mib_enum));
+
+	parserutils_charset_aliases_destroy(myrealloc, NULL);
+
+	printf("PASS\n");
+
+	return 0;
+}
diff --git a/test/charset.c b/test/charset.c
new file mode 100644
index 0000000..a793e7e
--- /dev/null
+++ b/test/charset.c
@@ -0,0 +1,31 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "charset/charset.h"
+
+#include "testutils.h"
+
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	void *block = realloc(ptr, len);	/* no client data is required */
+	UNUSED(pw);
+	return block;
+}
+
+int main(int argc, char **argv)
+{
+	if (argc != 2) {
+		printf("Usage: %s <filename>\n", argv[0]);
+		return 1;
+	}
+	/* Initialise the charset component using the file named in argv[1] */
+	assert(parserutils_charset_initialise(argv[1], myrealloc, NULL) == 
+			PARSERUTILS_OK);
+	/* ...then finalise it straight away; both calls must report success */
+	assert (parserutils_charset_finalise(myrealloc, NULL) == 
+			PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
diff --git a/test/cscodec.c b/test/cscodec.c
new file mode 100644
index 0000000..d3b1b76
--- /dev/null
+++ b/test/cscodec.c
@@ -0,0 +1,232 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "charset/charset.h"
+#include <parserutils/charset/codec.h>
+
+#include "utils/utils.h"
+
+#include "testutils.h"
+
+typedef struct line_ctx {
+	parserutils_charset_codec *codec;	/* codec being exercised */
+
+	size_t buflen;		/* size of the buf region, in bytes */
+	size_t bufused;		/* bytes of input data collected so far */
+	uint8_t *buf;		/* input data for the current testcase */
+	size_t explen;		/* size of the exp region, in bytes */
+	size_t expused;		/* bytes of expected output collected so far */
+	uint8_t *exp;		/* expected output for the current testcase */
+
+	bool indata;		/* content lines are being added to buf */
+	bool inexp;		/* content lines are being added to exp */
+
+	parserutils_error exp_ret;	/* result the codec calls must return */
+
+	enum { ENCODE, DECODE, BOTH } dir;	/* which codec call(s) to make */
+} line_ctx;
+
+static bool handle_line(const char *data, size_t datalen, void *pw);
+static void run_test(line_ctx *ctx);
+
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	void *block = realloc(ptr, len);	/* private data goes unused */
+	UNUSED(pw);
+	return block;
+}
+
+int main(int argc, char **argv)
+{
+	line_ctx ctx;
+
+	if (argc != 3) {
+		printf("Usage: %s <aliases_file> <filename>\n", argv[0]);
+		return 1;
+	}
+
+	assert(parserutils_charset_initialise(argv[1], myrealloc, NULL) == 
+			PARSERUTILS_OK);
+
+	assert(parserutils_charset_codec_create("NATS-SEFI-ADD",
+			myrealloc, NULL) == NULL);
+
+	ctx.codec = parserutils_charset_codec_create("UTF-8", myrealloc, NULL);
+	assert(ctx.codec != NULL);
+
+	ctx.buflen = parse_filesize(argv[2]);
+	if (ctx.buflen == 0)
+		return 1;
+
+	/* One allocation, split in half: input data, then expected output */
+	ctx.buf = malloc(2 * ctx.buflen);
+	if (ctx.buf == NULL) {
+		printf("Failed allocating %u bytes\n",
+				(unsigned int) (2 * ctx.buflen));
+		return 1;
+	}
+	ctx.exp = ctx.buf + ctx.buflen;
+	ctx.explen = ctx.buflen;
+
+	ctx.buf[0] = '\0';
+	ctx.exp[0] = '\0';
+	ctx.bufused = 0;
+	ctx.expused = 0;
+	ctx.indata = false;
+	ctx.inexp = false;
+	ctx.exp_ret = PARSERUTILS_OK;
+	ctx.dir = BOTH;	/* never leave the direction indeterminate */
+
+	assert(parse_testfile(argv[2], handle_line, &ctx) == true);
+
+	/* and run final test */
+	if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')
+		ctx.bufused -= 1;
+
+	if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')
+		ctx.expused -= 1;
+
+	run_test(&ctx);
+
+	free(ctx.buf);
+	parserutils_charset_codec_destroy(ctx.codec);
+
+	assert(parserutils_charset_finalise(myrealloc, NULL) == 
+			PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
+
+/* Testcase file line handler: '#' lines are directives (#data, #expected,
+ * #reset); all other lines are content for the section currently open. */
+bool handle_line(const char *data, size_t datalen, void *pw)
+{
+	line_ctx *ctx = (line_ctx *) pw;
+
+	if (data[0] == '#') {
+		if (ctx->inexp) {
+			/* This marks end of testcase, so run it */
+
+			if (ctx->bufused > 0 && 
+					ctx->buf[ctx->bufused - 1] == '\n')
+				ctx->bufused -= 1;
+
+			if (ctx->expused > 0 && 
+					ctx->exp[ctx->expused - 1] == '\n')
+				ctx->expused -= 1;
+
+			run_test(ctx);
+
+			ctx->buf[0] = '\0';
+			ctx->exp[0] = '\0';
+			ctx->bufused = 0;
+			ctx->expused = 0;
+			ctx->exp_ret = PARSERUTILS_OK;
+		}
+
+		if (strncasecmp(data+1, "data", 4) == 0) {
+			parserutils_charset_codec_optparams params;
+			const char *ptr = data + 6;
+
+			ctx->indata = true;
+			ctx->inexp = false;
+
+			if (strncasecmp(ptr, "decode", 6) == 0)
+				ctx->dir = DECODE;
+			else if (strncasecmp(ptr, "encode", 6) == 0)
+				ctx->dir = ENCODE;
+			else
+				ctx->dir = BOTH;
+
+			ptr += 7;
+
+			if (strncasecmp(ptr, "LOOSE", 5) == 0) {
+				params.error_mode.mode =
+					PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
+			} else if (strncasecmp(ptr, "STRICT", 6) == 0) {
+				params.error_mode.mode =
+					PARSERUTILS_CHARSET_CODEC_ERROR_STRICT;
+			} else {
+				params.error_mode.mode =
+					PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT;
+			}
+
+			assert(parserutils_charset_codec_setopt(ctx->codec,
+				PARSERUTILS_CHARSET_CODEC_ERROR_MODE,
+				&params) == PARSERUTILS_OK);
+		} else if (strncasecmp(data+1, "expected", 8) == 0) {
+			ctx->indata = false;
+			ctx->inexp = true;
+
+			ctx->exp_ret = parserutils_error_from_string(data + 10,
+					datalen - 10 - 1 /* \n */);
+		} else if (strncasecmp(data+1, "reset", 5) == 0) {
+			ctx->indata = false;
+			ctx->inexp = false;
+
+			parserutils_charset_codec_reset(ctx->codec);
+		}
+	} else {
+		if (ctx->indata) {
+			memcpy(ctx->buf + ctx->bufused, data, datalen);
+			ctx->bufused += datalen;
+		}
+		if (ctx->inexp) {
+			memcpy(ctx->exp + ctx->expused, data, datalen);
+			ctx->expused += datalen;
+		}
+	}
+
+	return true;
+}
+
+/* Run the codec over the collected input; check its result and its output */
+void run_test(line_ctx *ctx)
+{
+	static int testnum;
+	size_t destlen = ctx->bufused * 4;
+	uint8_t dest[destlen + 1];	/* +1: a VLA must not have zero length */
+	uint8_t *pdest = dest;
+	const uint8_t *psrc = ctx->buf;
+	size_t srclen = ctx->bufused, i;
+	/* dest is dumped and compared over expused bytes: make that safe */
+	assert(ctx->expused <= sizeof(dest));
+	memset(dest, 0, sizeof(dest));
+
+	if (ctx->dir == DECODE) {
+		assert(parserutils_charset_codec_decode(ctx->codec,
+			&psrc, &srclen, &pdest, &destlen) == ctx->exp_ret);
+	} else if (ctx->dir == ENCODE) {
+		assert(parserutils_charset_codec_encode(ctx->codec,
+			&psrc, &srclen, &pdest, &destlen) == ctx->exp_ret);
+	} else {
+		size_t templen = ctx->bufused * 4;
+		uint8_t temp[templen + 1];	/* +1: as for dest above */
+		uint8_t *ptemp = temp;
+
+		assert(parserutils_charset_codec_decode(ctx->codec,
+			&psrc, &srclen, &ptemp, &templen) == ctx->exp_ret);
+		ptemp = temp;
+		templen = ctx->bufused * 4 - templen;
+		assert(parserutils_charset_codec_encode(ctx->codec,
+				(const uint8_t **) &ptemp, &templen,
+				&pdest, &destlen) == ctx->exp_ret);
+	}
+
+	printf("%d: Read '", ++testnum);
+	for (i = 0; i < ctx->expused; i++) {
+		printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf],
+				"0123456789abcdef"[dest[i] & 0xf]);
+	}
+	printf("' Expected '");
+	for (i = 0; i < ctx->expused; i++) {
+		printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf],
+				"0123456789abcdef"[ctx->exp[i] & 0xf]);
+	}
+	printf("'\n");
+
+	assert(memcmp(dest, ctx->exp, ctx->expused) == 0);
+}
+
diff --git a/test/data/Aliases b/test/data/Aliases
new file mode 100644
index 0000000..db61ff1
--- /dev/null
+++ b/test/data/Aliases
@@ -0,0 +1,302 @@
+# > Unicode:Files.Aliases
+# Mapping of character set encoding names to their canonical form
+#
+# Lines starting with a '#' are comments, blank lines are ignored.
+#
+# Based on http://www.iana.org/assignments/character-sets and
+# http://www.iana.org/assignments/ianacharset-mib
+#
+# Canonical Form	MIBenum		Aliases...
+#
+US-ASCII		3		iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ASCII ISO646-US ANSI_X3.4-1968 us IBM367 cp367 csASCII
+ISO-10646-UTF-1		27		csISO10646UTF1
+ISO_646.basic:1983	28		ref csISO646basic1983
+INVARIANT		29		csINVARIANT
+ISO_646.irv:1983	30		iso-ir-2 irv csISO2IntlRefVersion
+BS_4730			20		iso-ir-4 ISO646-GB gb uk csISO4UnitedKingdom
+NATS-SEFI		31		iso-ir-8-1 csNATSSEFI
+NATS-SEFI-ADD		32		iso-ir-8-2 csNATSSEFIADD
+NATS-DANO		33		iso-ir-9-1 csNATSDANO
+NATS-DANO-ADD		34		iso-ir-9-2 csNATSDANOADD
+SEN_850200_B		35		iso-ir-10 FI ISO646-FI ISO646-SE se csISO10Swedish
+SEN_850200_C		21		iso-ir-11 ISO646-SE2 se2 csISO11SwedishForNames
+KS_C_5601-1987		36		iso-ir-149 KS_C_5601-1989 KSC_5601 korean csKSC56011987
+ISO-2022-KR		37		csISO2022KR
+EUC-KR			38		csEUCKR EUCKR
+ISO-2022-JP		39		csISO2022JP
+ISO-2022-JP-2		40		csISO2022JP2
+ISO-2022-CN		104
+ISO-2022-CN-EXT		105
+JIS_C6220-1969-jp	41		JIS_C6220-1969 iso-ir-13 katakana x0201-7 csISO13JISC6220jp
+JIS_C6220-1969-ro	42		iso-ir-14 jp ISO646-JP csISO14JISC6220ro
+IT			22		iso-ir-15 ISO646-IT csISO15Italian
+PT			43		iso-ir-16 ISO646-PT csISO16Portuguese
+ES			23		iso-ir-17 ISO646-ES csISO17Spanish
+greek7-old		44		iso-ir-18 csISO18Greek7Old
+latin-greek		45		iso-ir-19 csISO19LatinGreek
+DIN_66003		24		iso-ir-21 de ISO646-DE csISO21German
+NF_Z_62-010_(1973)	46		iso-ir-25 ISO646-FR1 csISO25French
+Latin-greek-1		47		iso-ir-27 csISO27LatinGreek1
+ISO_5427		48		iso-ir-37 csISO5427Cyrillic
+JIS_C6226-1978		49		iso-ir-42 csISO42JISC62261978
+BS_viewdata		50		iso-ir-47 csISO47BSViewdata
+INIS			51		iso-ir-49 csISO49INIS
+INIS-8			52		iso-ir-50 csISO50INIS8
+INIS-cyrillic		53		iso-ir-51 csISO51INISCyrillic
+ISO_5427:1981		54		iso-ir-54 ISO5427Cyrillic1981
+ISO_5428:1980		55		iso-ir-55 csISO5428Greek
+GB_1988-80		56		iso-ir-57 cn ISO646-CN csISO57GB1988
+GB_2312-80		57		iso-ir-58 chinese csISO58GB231280
+NS_4551-1		25		iso-ir-60 ISO646-NO no csISO60DanishNorwegian csISO60Norwegian1
+NS_4551-2		58		ISO646-NO2 iso-ir-61 no2 csISO61Norwegian2
+NF_Z_62-010		26		iso-ir-69 ISO646-FR fr csISO69French
+videotex-suppl		59		iso-ir-70 csISO70VideotexSupp1
+PT2			60		iso-ir-84 ISO646-PT2 csISO84Portuguese2
+ES2			61		iso-ir-85 ISO646-ES2 csISO85Spanish2
+MSZ_7795.3		62		iso-ir-86 ISO646-HU hu csISO86Hungarian
+JIS_C6226-1983		63		iso-ir-87 x0208 JIS_X0208-1983 csISO87JISX0208
+greek7			64		iso-ir-88 csISO88Greek7
+ASMO_449		65		ISO_9036 arabic7 iso-ir-89 csISO89ASMO449
+iso-ir-90		66		csISO90
+JIS_C6229-1984-a	67		iso-ir-91 jp-ocr-a csISO91JISC62291984a
+JIS_C6229-1984-b	68		iso-ir-92 ISO646-JP-OCR-B jp-ocr-b csISO92JISC62991984b
+JIS_C6229-1984-b-add	69		iso-ir-93 jp-ocr-b-add csISO93JIS62291984badd
+JIS_C6229-1984-hand	70		iso-ir-94 jp-ocr-hand csISO94JIS62291984hand
+JIS_C6229-1984-hand-add	71		iso-ir-95 jp-ocr-hand-add csISO95JIS62291984handadd
+JIS_C6229-1984-kana	72		iso-ir-96 csISO96JISC62291984kana
+ISO_2033-1983		73		iso-ir-98 e13b csISO2033
+ANSI_X3.110-1983	74		iso-ir-99 CSA_T500-1983 NAPLPS csISO99NAPLPS
+ISO-8859-1		4		iso-ir-100 ISO_8859-1 ISO_8859-1:1987 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 ISO8859-1
+ISO-8859-2		5		iso-ir-101 ISO_8859-2 ISO_8859-2:1987 latin2 l2 csISOLatin2 8859_2 ISO8859-2
+T.61-7bit		75		iso-ir-102 csISO102T617bit
+T.61-8bit		76		T.61 iso-ir-103 csISO103T618bit
+ISO-8859-3		6		iso-ir-109 ISO_8859-3 ISO_8859-3:1988 latin3 l3 csISOLatin3 8859_3 ISO8859-3
+ISO-8859-4		7		iso-ir-110 ISO_8859-4 ISO_8859-4:1988 latin4 l4 csISOLatin4 8859_4 ISO8859-4
+ECMA-cyrillic		77		iso-ir-111 KOI8-E csISO111ECMACyrillic
+CSA_Z243.4-1985-1	78		iso-ir-121 ISO646-CA csa7-1 ca csISO121Canadian1
+CSA_Z243.4-1985-2	79		iso-ir-122 ISO646-CA2 csa7-2 csISO122Canadian2
+CSA_Z243.4-1985-gr	80		iso-ir-123 csISO123CSAZ24341985gr
+ISO-8859-6		9		iso-ir-127 ISO_8859-6 ISO_8859-6:1987 ECMA-114 ASMO-708 arabic csISOLatinArabic
+ISO-8859-6-E		81		csISO88596E ISO_8859-6-E
+ISO-8859-6-I		82		csISO88596I ISO_8859-6-I
+ISO-8859-7		10		iso-ir-126 ISO_8859-7 ISO_8859-7:1987 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 ISO8859-7
+T.101-G2		83		iso-ir-128 csISO128T101G2
+ISO-8859-8		11		iso-ir-138 ISO_8859-8 ISO_8859-8:1988 hebrew csISOLatinHebrew 8859_8 ISO8859-8
+ISO-8859-8-E		84		csISO88598E ISO_8859-8-E
+ISO-8859-8-I		85		csISO88598I ISO_8859-8-I
+CSN_369103		86		iso-ir-139 csISO139CSN369103
+JUS_I.B1.002		87		iso-ir-141 ISO646-YU js yu csISO141JUSIB1002
+ISO_6937-2-add		14		iso-ir-142 csISOTextComm
+IEC_P27-1		88		iso-ir-143 csISO143IECP271
+ISO-8859-5		8		iso-ir-144 ISO_8859-5 ISO_8859-5:1988 cyrillic csISOLatinCyrillic 8859_5 ISO8859-5
+JUS_I.B1.003-serb	89		iso-ir-146 serbian csISO146Serbian
+JUS_I.B1.003-mac	90		macedonian iso-ir-147 csISO147Macedonian
+ISO-8859-9		12		iso-ir-148 ISO_8859-9 ISO_8859-9:1989 latin5 l5 csISOLatin5 8859_9 ISO8859-9
+greek-ccitt		91		iso-ir-150 csISO150 csISO150GreekCCITT
+NC_NC00-10:81		92		cuba iso-ir-151 ISO646-CU csISO151Cuba
+ISO_6937-2-25		93		iso-ir-152 csISO6937Add
+GOST_19768-74		94		ST_SEV_358-88 iso-ir-153 csISO153GOST1976874
+ISO_8859-supp		95		iso-ir-154 latin1-2-5 csISO8859Supp
+ISO_10367-box		96		iso-ir-155 csISO10367Box
+ISO-8859-10		13		iso-ir-157 l6 ISO_8859-10:1992 csISOLatin6 latin6 8859_10 ISO8859-10
+latin-lap		97		lap iso-ir-158 csISO158Lap
+JIS_X0212-1990		98		x0212 iso-ir-159 csISO159JISX02121990
+DS_2089			99		DS2089 ISO646-DK dk csISO646Danish
+us-dk			100		csUSDK
+dk-us			101		csDKUS
+JIS_X0201		15		X0201 csHalfWidthKatakana
+KSC5636			102		ISO646-KR csKSC5636
+ISO-10646-UCS-2		1000		csUnicode UCS-2 UCS2
+ISO-10646-UCS-4		1001		csUCS4 UCS-4 UCS4
+DEC-MCS			2008		dec csDECMCS
+hp-roman8		2004		roman8 r8 csHPRoman8
+macintosh		2027		mac csMacintosh MACROMAN MAC-ROMAN X-MAC-ROMAN
+IBM037			2028		cp037 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037
+IBM038			2029		EBCDIC-INT cp038 csIBM038
+IBM273			2030		CP273 csIBM273
+IBM274			2031		EBCDIC-BE CP274 csIBM274
+IBM275			2032		EBCDIC-BR cp275 csIBM275
+IBM277			2033		EBCDIC-CP-DK EBCDIC-CP-NO csIBM277
+IBM278			2034		CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278
+IBM280			2035		CP280 ebcdic-cp-it csIBM280
+IBM281			2036		EBCDIC-JP-E cp281 csIBM281
+IBM284			2037		CP284 ebcdic-cp-es csIBM284
+IBM285			2038		CP285 ebcdic-cp-gb csIBM285
+IBM290			2039		cp290 EBCDIC-JP-kana csIBM290
+IBM297			2040		cp297 ebcdic-cp-fr csIBM297
+IBM420			2041		cp420 ebcdic-cp-ar1 csIBM420
+IBM423			2042		cp423 ebcdic-cp-gr csIBM423
+IBM424			2043		cp424 ebcdic-cp-he csIBM424
+IBM437			2011		cp437 437 csPC8CodePage437
+IBM500			2044		CP500 ebcdic-cp-be ebcdic-cp-ch csIBM500
+IBM775			2087		cp775 csPC775Baltic
+IBM850			2009		cp850 850 csPC850Multilingual
+IBM851			2045		cp851 851 csIBM851
+IBM852			2010		cp852 852 csPCp852
+IBM855			2046		cp855 855 csIBM855
+IBM857			2047		cp857 857 csIBM857
+IBM860			2048		cp860 860 csIBM860
+IBM861			2049		cp861 861 cp-is csIBM861
+IBM862			2013		cp862 862 csPC862LatinHebrew
+IBM863			2050		cp863 863 csIBM863
+IBM864			2051		cp864 csIBM864
+IBM865			2052		cp865 865 csIBM865
+IBM866			2086		cp866 866 csIBM866
+IBM868			2053		CP868 cp-ar csIBM868
+IBM869			2054		cp869 869 cp-gr csIBM869
+IBM870			2055		CP870 ebcdic-cp-roece ebcdic-cp-yu csIBM870
+IBM871			2056		CP871 ebcdic-cp-is csIBM871
+IBM880			2057		cp880 EBCDIC-Cyrillic csIBM880
+IBM891			2058		cp891 csIBM891
+IBM903			2059		cp903 csIBM903
+IBM904			2060		cp904 904 csIBBM904
+IBM905			2061		CP905 ebcdic-cp-tr csIBM905
+IBM918			2062		CP918 ebcdic-cp-ar2 csIBM918
+IBM1026			2063		CP1026 csIBM1026
+EBCDIC-AT-DE		2064		csIBMEBCDICATDE
+EBCDIC-AT-DE-A		2065		csEBCDICATDEA
+EBCDIC-CA-FR		2066		csEBCDICCAFR
+EBCDIC-DK-NO		2067		csEBCDICDKNO
+EBCDIC-DK-NO-A		2068		csEBCDICDKNOA
+EBCDIC-FI-SE		2069		csEBCDICFISE
+EBCDIC-FI-SE-A		2070		csEBCDICFISEA
+EBCDIC-FR		2071		csEBCDICFR
+EBCDIC-IT		2072		csEBCDICIT
+EBCDIC-PT		2073		csEBCDICPT
+EBCDIC-ES		2074		csEBCDICES
+EBCDIC-ES-A		2075		csEBCDICESA
+EBCDIC-ES-S		2076		csEBCDICESS
+EBCDIC-UK		2077		csEBCDICUK
+EBCDIC-US		2078		csEBCDICUS
+UNKNOWN-8BIT		2079		csUnknown8BiT
+MNEMONIC		2080		csMnemonic
+MNEM			2081		csMnem
+VISCII			2082		csVISCII
+VIQR			2083		csVIQR
+KOI8-R			2084		csKOI8R
+KOI8-U			2088
+IBM00858		2089		CCSID00858 CP00858 PC-Multilingual-850+euro
+IBM00924		2090		CCSID00924 CP00924 ebcdic-Latin9--euro
+IBM01140		2091		CCSID01140 CP01140 ebcdic-us-37+euro
+IBM01141		2092		CCSID01141 CP01141 ebcdic-de-273+euro
+IBM01142		2093		CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro
+IBM01143		2094		CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro
+IBM01144		2095		CCSID01144 CP01144 ebcdic-it-280+euro
+IBM01145		2096		CCSID01145 CP01145 ebcdic-es-284+euro
+IBM01146		2097		CCSID01146 CP01146 ebcdic-gb-285+euro
+IBM01147		2098		CCSID01147 CP01147 ebcdic-fr-297+euro
+IBM01148		2099		CCSID01148 CP01148 ebcdic-international-500+euro
+IBM01149		2100		CCSID01149 CP01149 ebcdic-is-871+euro
+Big5-HKSCS		2101
+IBM1047			2102		IBM-1047
+PTCP154			2103		csPTCP154 PT154 CP154 Cyrillic-Asian
+Amiga-1251		2104		Ami1251 Amiga1251 Ami-1251
+KOI7-switched		2105
+UNICODE-1-1		1010		csUnicode11
+SCSU			1011
+UTF-7			1012
+UTF-16BE		1013
+UTF-16LE		1014
+UTF-16			1015
+CESU-8			1016		csCESU-8
+UTF-32			1017
+UTF-32BE		1018
+UTF-32LE		1019
+BOCU-1			1020		csBOCU-1
+UNICODE-1-1-UTF-7	103		csUnicode11UTF7
+UTF-8			106		UNICODE-1-1-UTF-8 UNICODE-2-0-UTF-8 utf8
+ISO-8859-13		109		8859_13 ISO8859-13
+ISO-8859-14		110		iso-ir-199 ISO_8859-14:1998 ISO_8859-14 latin8 iso-celtic l8 8859_14 ISO8859-14
+ISO-8859-15		111		ISO_8859-15 Latin-9 8859_15 ISO8859-15
+ISO-8859-16		112		iso-ir-226 ISO_8859-16:2001 ISO_8859-16 latin10 l10
+GBK			113		CP936 MS936 windows-936
+GB18030			114
+OSD_EBCDIC_DF04_15	115
+OSD_EBCDIC_DF03_IRV	116
+OSD_EBCDIC_DF04_1	117
+JIS_Encoding		16		csJISEncoding
+Shift_JIS		17		MS_Kanji csShiftJIS X-SJIS Shift-JIS
+EUC-JP			18		csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese EUCJP
+Extended_UNIX_Code_Fixed_Width_for_Japanese	19		csEUCFixWidJapanese
+ISO-10646-UCS-Basic	1002		csUnicodeASCII
+ISO-10646-Unicode-Latin1	1003		csUnicodeLatin1 ISO-10646
+ISO-Unicode-IBM-1261	1005		csUnicodeIBM1261
+ISO-Unicode-IBM-1268	1006		csUnicodeIBM1268
+ISO-Unicode-IBM-1276	1007		csUnicodeIBM1276
+ISO-Unicode-IBM-1264	1008		csUnicodeIBM1264
+ISO-Unicode-IBM-1265	1009		csUnicodeIBM1265
+ISO-8859-1-Windows-3.0-Latin-1	2000		csWindows30Latin1
+ISO-8859-1-Windows-3.1-Latin-1	2001		csWindows31Latin1
+ISO-8859-2-Windows-Latin-2	2002		csWindows31Latin2
+ISO-8859-9-Windows-Latin-5	2003		csWindows31Latin5
+Adobe-Standard-Encoding	2005		csAdobeStandardEncoding
+Ventura-US		2006		csVenturaUS
+Ventura-International	2007		csVenturaInternational
+PC8-Danish-Norwegian	2012		csPC8DanishNorwegian
+PC8-Turkish		2014		csPC8Turkish
+IBM-Symbols		2015		csIBMSymbols
+IBM-Thai		2016		csIBMThai
+HP-Legal		2017		csHPLegal
+HP-Pi-font		2018		csHPPiFont
+HP-Math8		2019		csHPMath8
+Adobe-Symbol-Encoding	2020		csHPPSMath
+HP-DeskTop		2021		csHPDesktop
+Ventura-Math		2022		csVenturaMath
+Microsoft-Publishing	2023		csMicrosoftPublishing
+Windows-31J		2024		csWindows31J
+GB2312			2025		csGB2312 EUC-CN EUCCN CN-GB
+Big5			2026		csBig5 BIG-FIVE BIG-5 CN-BIG5 BIG_FIVE
+windows-1250		2250		CP1250 MS-EE
+windows-1251		2251		CP1251 MS-CYRL
+windows-1252		2252		CP1252 MS-ANSI
+windows-1253		2253		CP1253 MS-GREEK
+windows-1254		2254		CP1254 MS-TURK
+windows-1255		2255
+windows-1256		2256		CP1256 MS-ARAB
+windows-1257		2257		CP1257 WINBALTRIM
+windows-1258		2258
+TIS-620			2259
+HZ-GB-2312		2085
+
+# Additional encodings not defined by IANA
+
+# Arbitrary allocations
+#CP737			3001
+#CP853			3002
+#CP856			3003
+CP874			3004		WINDOWS-874
+#CP922			3005
+#CP1046			3006
+#CP1124			3007
+#CP1125			3008		WINDOWS-1125
+#CP1129			3009
+#CP1133			3010		IBM-CP1133
+#CP1161			3011		IBM-1161 IBM1161 CSIBM1161
+#CP1162			3012		IBM-1162 IBM1162 CSIBM1162
+#CP1163			3013		IBM-1163 IBM1163 CSIBM1163
+#GEORGIAN-ACADEMY	3014
+#GEORGIAN-PS		3015
+#KOI8-RU		3016
+#KOI8-T			3017
+#MACARABIC		3018		X-MAC-ARABIC MAC-ARABIC
+#MACCROATIAN		3019		X-MAC-CROATIAN MAC-CROATIAN
+#MACGREEK		3020		X-MAC-GREEK MAC-GREEK
+#MACHEBREW		3021		X-MAC-HEBREW MAC-HEBREW
+#MACICELAND		3022		X-MAC-ICELAND MAC-ICELAND
+#MACROMANIA		3023		X-MAC-ROMANIA MAC-ROMANIA
+#MACTHAI		3024		X-MAC-THAI MAC-THAI
+#MACTURKISH		3025		X-MAC-TURKISH MAC-TURKISH
+#MULELAO-1		3026
+
+# From Unicode Lib
+ISO-IR-182		4000
+ISO-IR-197		4002
+ISO-2022-JP-1		4008
+MACCYRILLIC		4009		X-MAC-CYRILLIC MAC-CYRILLIC
+MACUKRAINE		4010		X-MAC-UKRAINIAN MAC-UKRAINIAN
+MACCENTRALEUROPE	4011		X-MAC-CENTRALEURROMAN MAC-CENTRALEURROMAN
+JOHAB			4012
+ISO-8859-11		4014		iso-ir-166 ISO_8859-11 ISO8859-11 8859_11
+X-CURRENT		4999		X-SYSTEM
+X-ACORN-LATIN1		5001
+X-ACORN-FUZZY		5002
diff --git a/test/data/cscodec/INDEX b/test/data/cscodec/INDEX
new file mode 100644
index 0000000..d6d338a
--- /dev/null
+++ b/test/data/cscodec/INDEX
@@ -0,0 +1,6 @@
+# Index file for charset codec tests
+#
+# Test			Description
+
+simple.dat		Simple tests, designed to validate testdriver
+UTF-8-test.txt		Markus Kuhn's UTF-8 decoding test file
diff --git a/test/data/cscodec/UTF-8-test.txt b/test/data/cscodec/UTF-8-test.txt
new file mode 100644
index 0000000..920e54e
Binary files /dev/null and b/test/data/cscodec/UTF-8-test.txt differ
diff --git a/test/data/cscodec/simple.dat b/test/data/cscodec/simple.dat
new file mode 100644
index 0000000..3e2c7ae
Binary files /dev/null and b/test/data/cscodec/simple.dat differ
diff --git a/test/data/input/INDEX b/test/data/input/INDEX
new file mode 100644
index 0000000..c2c97ea
--- /dev/null
+++ b/test/data/input/INDEX
@@ -0,0 +1,5 @@
+# Index file for inputstream tests
+#
+# Test			Description
+
+UTF-8-test.txt		Markus Kuhn's UTF-8 decoding test file
diff --git a/test/data/input/UTF-8-test.txt b/test/data/input/UTF-8-test.txt
new file mode 100644
index 0000000..abd16f7
Binary files /dev/null and b/test/data/input/UTF-8-test.txt differ
diff --git a/test/filter.c b/test/filter.c
new file mode 100644
index 0000000..ff4d1e7
--- /dev/null
+++ b/test/filter.c
@@ -0,0 +1,357 @@
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/parserutils.h>
+
+#include "utils/utils.h"
+
+#include "input/filter.h"
+
+#include "testutils.h"
+
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	UNUSED(pw); /* ignored; main() passes NULL alongside myrealloc */
+
+	return realloc(ptr, len); /* defer to the C library allocator */
+}
+
+int main(int argc, char **argv)
+{
+	parserutils_filter_optparams params;
+	parserutils_filter *input;
+	uint8_t inbuf[64], outbuf[64];
+	size_t inlen, outlen;
+	const uint8_t *in = inbuf;
+	uint8_t *out = outbuf;
+	/* Runs one UTF-8 filter through a series of chunked-input cases */
+	if (argc != 2) {
+		printf("Usage: %s <filename>\n", argv[0]);
+		return 1;
+	}
+
+	/* Initialise library */
+	assert(parserutils_initialise(argv[1], myrealloc, NULL) == 
+			PARSERUTILS_OK);
+
+	/* Create input filter */
+	input = parserutils_filter_create("UTF-8", myrealloc, NULL);
+	assert(input);
+
+	/* Convert filter to UTF-8 encoding */
+	params.encoding.name = "UTF-8";
+	assert(parserutils_filter_setopt(input, PARSERUTILS_FILTER_SET_ENCODING,
+			(parserutils_filter_optparams *) &params) == 
+			PARSERUTILS_OK);
+
+
+	/* Simple case - valid input & output buffer large enough */
+	in = inbuf;
+	out = outbuf;
+	strcpy((char *) inbuf, "hell\xc2\xa0o!");
+	inlen = strlen((const char *) inbuf);
+	outbuf[0] = '\0';
+	outlen = 64;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+	assert(memcmp(outbuf, "hell\xc2\xa0o!",
+			SLEN("hell\xc2\xa0o!")) == 0);
+
+
+	/* Too small an output buffer; no encoding edge cases */
+	in = inbuf;
+	out = outbuf;
+	strcpy((char *) inbuf, "hello!");
+	inlen = strlen((const char *) inbuf);
+	outbuf[0] = '\0';
+	outlen = 5;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_NOMEM);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+	/* Rest of outbuf, if outlen now holds what was left of the 5 bytes */
+	outlen = 64 - 5 + outlen;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+	assert(memcmp(outbuf, "hello!",
+			SLEN("hello!")) == 0);
+
+
+	/* Illegal input sequence; output buffer large enough */
+	in = inbuf;
+	out = outbuf;
+	strcpy((char *) inbuf, "hell\x96o!");
+	inlen = strlen((const char *) inbuf);
+	outbuf[0] = '\0';
+	outlen = 64;
+
+	/* Input does loose decoding, converting to U+FFFD if illegal
+	 * input is encountered */
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+	assert(memcmp(outbuf, "hell\xef\xbf\xbdo!",
+			SLEN("hell\xef\xbf\xbdo!")) == 0);
+
+
+	/* Input ends mid-sequence */
+	in = inbuf;
+	out = outbuf;
+	strcpy((char *) inbuf, "hell\xc2\xa0o!");
+	inlen = strlen((const char *) inbuf) - 3;
+	outbuf[0] = '\0';
+	outlen = 64;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+	/* Now expose the 3 bytes held back from the first chunk */
+	inlen += 3;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+	assert(memcmp(outbuf, "hell\xc2\xa0o!",
+			SLEN("hell\xc2\xa0o!")) == 0);
+
+
+	/* Input ends mid-sequence, but second attempt has too small a
+	 * buffer, but large enough to write out the incomplete character. */
+	in = inbuf;
+	out = outbuf;
+	strcpy((char *) inbuf, "hell\xc2\xa0o!");
+	inlen = strlen((const char *) inbuf) - 3;
+	outbuf[0] = '\0';
+	outlen = 64;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+	/* Release the held-back bytes, but offer only 3 bytes of output */
+	inlen += 3;
+	outlen = 3;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_NOMEM);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+	/* NOTE(review): 64 - 7 presumably assumes 7 bytes are in outbuf */
+	outlen = 64 - 7;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+	assert(memcmp(outbuf, "hell\xc2\xa0o!",
+			SLEN("hell\xc2\xa0o!")) == 0);
+
+
+	/* Input ends mid-sequence, but second attempt has too small a
+	 * buffer, not large enough to write out the incomplete character. */
+	in = inbuf;
+	out = outbuf;
+	strcpy((char *) inbuf, "hell\xc2\xa0o!");
+	inlen = strlen((const char *) inbuf) - 3;
+	outbuf[0] = '\0';
+	outlen = 64;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+	/* Release the held-back bytes, but offer only 1 byte of output */
+	inlen += 3;
+	outlen = 1;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_NOMEM);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+	/* NOTE(review): 60 presumably assumes only 4 bytes are in outbuf */
+	outlen = 60;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+	assert(memcmp(outbuf, "hell\xc2\xa0o!",
+			SLEN("hell\xc2\xa0o!")) == 0);
+
+
+	/* Input ends mid-sequence, but second attempt contains
+	 * invalid character */
+	in = inbuf;
+	out = outbuf;
+	strcpy((char *) inbuf, "hell\xc2\xc2o!");
+	inlen = strlen((const char *) inbuf) - 3;
+	outbuf[0] = '\0';
+	outlen = 64;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	inlen += 3;
+
+	/* Input does loose decoding, converting to U+FFFD if illegal
+	 * input is encountered */
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+	assert(memcmp(outbuf, "hell\xef\xbf\xbd\xef\xbf\xbdo!",
+			SLEN("hell\xef\xbf\xbd\xef\xbf\xbdo!")) == 0);
+
+
+	/* Input ends mid-sequence, but second attempt contains another
+	 * incomplete character */
+	in = inbuf;
+	out = outbuf;
+	strcpy((char *) inbuf, "hell\xc2\xa0\xc2\xa1o!");
+	inlen = strlen((const char *) inbuf) - 5;
+	outbuf[0] = '\0';
+	outlen = 64;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	inlen += 2;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	inlen += 3;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+	assert(memcmp(outbuf, "hell\xc2\xa0\xc2\xa1o!",
+			SLEN("hell\xc2\xa0\xc2\xa1o!")) == 0);
+
+
+	/* Input ends mid-sequence, but second attempt contains insufficient
+	 * data to complete the incomplete character */
+	in = inbuf;
+	out = outbuf;
+	strcpy((char *) inbuf, "hell\xe2\x80\xa2o!");
+	inlen = strlen((const char *) inbuf) - 4;
+	outbuf[0] = '\0';
+	outlen = 64;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	inlen += 1;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	inlen += 3;
+
+	assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+	assert(memcmp(outbuf, "hell\xe2\x80\xa2o!",
+			SLEN("hell\xe2\x80\xa2o!")) == 0);
+
+
+	/* Clean up */
+	parserutils_filter_destroy(input);
+
+	assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
diff --git a/test/inputstream.c b/test/inputstream.c
new file mode 100644
index 0000000..bad3127
--- /dev/null
+++ b/test/inputstream.c
@@ -0,0 +1,97 @@
+#include <inttypes.h>
+#include <stdio.h>
+
+#include <parserutils/parserutils.h>
+#include <parserutils/charset/utf8.h>
+#include <parserutils/input/inputstream.h>
+
+#include "utils/utils.h"
+
+#include "testutils.h"
+
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	UNUSED(pw); /* ignored; main() passes NULL alongside myrealloc */
+
+	return realloc(ptr, len); /* defer to the C library allocator */
+}
+
+/* Stream argv[2] through a UTF-8 inputstream in CHUNK_SIZE pieces. */
+int main(int argc, char **argv)
+{
+	parserutils_inputstream *stream;
+	FILE *fp;
+	size_t len, clen;
+#define CHUNK_SIZE (4096)
+	uint8_t buf[CHUNK_SIZE];
+	uintptr_t c;
+
+	if (argc != 3) {
+		printf("Usage: %s <aliases_file> <filename>\n", argv[0]);
+		return 1;
+	}
+
+	/* Initialise library */
+	assert(parserutils_initialise(argv[1], myrealloc, NULL) == 
+			PARSERUTILS_OK);
+
+	stream = parserutils_inputstream_create("UTF-8", 1, NULL, 
+			myrealloc, NULL);
+	assert(stream != NULL);
+
+	fp = fopen(argv[2], "rb");
+	if (fp == NULL) {
+		printf("Failed opening %s\n", argv[2]);
+		return 1;
+	}
+	/* Size the file; a failed ftell() is caught by the fread() checks */
+	assert(fseek(fp, 0, SEEK_END) == 0);
+	len = ftell(fp);
+	assert(fseek(fp, 0, SEEK_SET) == 0);
+
+	while (len >= CHUNK_SIZE) {
+		assert(fread(buf, 1, CHUNK_SIZE, fp) == CHUNK_SIZE);
+
+		assert(parserutils_inputstream_append(stream,
+				buf, CHUNK_SIZE) == PARSERUTILS_OK);
+
+		len -= CHUNK_SIZE;
+
+		while ((c = parserutils_inputstream_peek(stream, 0, &clen)) !=
+				PARSERUTILS_INPUTSTREAM_OOD) {
+			parserutils_inputstream_advance(stream, clen);
+		}
+	}
+
+	if (len > 0) {
+		assert(fread(buf, 1, len, fp) == len);
+
+		assert(parserutils_inputstream_append(stream,
+				buf, len) == PARSERUTILS_OK);
+
+		len = 0;
+	}
+
+	fclose(fp);
+
+	assert(parserutils_inputstream_insert(stream,
+			(const uint8_t *) "hello!!!",
+			SLEN("hello!!!")) == PARSERUTILS_OK);
+
+	assert(parserutils_inputstream_append(stream, NULL, 0) == 
+			PARSERUTILS_OK);
+
+	while ((c = parserutils_inputstream_peek(stream, 0, &clen)) !=
+			PARSERUTILS_INPUTSTREAM_EOF) {
+		parserutils_inputstream_advance(stream, clen);
+	}
+
+	parserutils_inputstream_destroy(stream);
+
+	assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
+
diff --git a/test/parserutils.c b/test/parserutils.c
new file mode 100644
index 0000000..c6d671a
--- /dev/null
+++ b/test/parserutils.c
@@ -0,0 +1,30 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <parserutils/parserutils.h>
+
+#include "testutils.h"
+
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	UNUSED(pw); /* ignored; main() passes NULL alongside myrealloc */
+
+	return realloc(ptr, len); /* defer to the C library allocator */
+}
+
+int main(int argc, char **argv)
+{
+	if (argc != 2) {
+		printf("Usage: %s <filename>\n", argv[0]);
+		return 1;
+	}
+	/* Smoke test: initialise with the file in argv[1], then finalise */
+	assert(parserutils_initialise(argv[1], myrealloc, NULL) == 
+			PARSERUTILS_OK);
+
+	assert (parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
diff --git a/test/regression/cscodec-segv.c b/test/regression/cscodec-segv.c
new file mode 100644
index 0000000..5802fdf
--- /dev/null
+++ b/test/regression/cscodec-segv.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+
+#include "charset/charset.h"
+#include <parserutils/charset/codec.h>
+
+#include "testutils.h"
+
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	UNUSED(pw); /* ignored; main() passes NULL alongside myrealloc */
+
+	return realloc(ptr, len); /* defer to the C library allocator */
+}
+
+int main(int argc, char **argv)
+{
+	parserutils_charset_codec *codec;
+	/* Regression check: create a UTF-8 codec, then destroy it at once */
+	if (argc != 2) {
+		printf("Usage: %s <aliases_file>\n", argv[0]);
+		return 1;
+	}
+
+	assert(parserutils_charset_initialise(argv[1], myrealloc, NULL) == 
+			PARSERUTILS_OK);
+
+	codec = parserutils_charset_codec_create("UTF-8", myrealloc, NULL);
+	assert(codec != NULL);
+
+	parserutils_charset_codec_destroy(codec);
+
+	assert(parserutils_charset_finalise(myrealloc, NULL) == 
+			PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
diff --git a/test/regression/filter-segv.c b/test/regression/filter-segv.c
new file mode 100644
index 0000000..761caab
--- /dev/null
+++ b/test/regression/filter-segv.c
@@ -0,0 +1,39 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <parserutils/parserutils.h>
+
+#include "input/filter.h"
+
+#include "testutils.h"
+
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	UNUSED(pw); /* ignored; main() passes NULL alongside myrealloc */
+
+	return realloc(ptr, len); /* defer to the C library allocator */
+}
+
+int main(int argc, char **argv)
+{
+	parserutils_filter *input;
+	/* Regression check: create a UTF-8 filter, then destroy it at once */
+	if (argc != 2) {
+		printf("Usage: %s <filename>\n", argv[0]);
+		return 1;
+	}
+
+	assert(parserutils_initialise(argv[1], myrealloc, NULL) == 
+			PARSERUTILS_OK);
+
+	input = parserutils_filter_create("UTF-8", myrealloc, NULL);
+	assert(input);
+
+	parserutils_filter_destroy(input);
+
+	assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
diff --git a/test/regression/stream-nomem.c b/test/regression/stream-nomem.c
new file mode 100644
index 0000000..f62b392
--- /dev/null
+++ b/test/regression/stream-nomem.c
@@ -0,0 +1,94 @@
+#include <stdio.h>
+#include <string.h>
+
+#include <parserutils/parserutils.h>
+#include <parserutils/input/inputstream.h>
+
+#include "utils/utils.h"
+
+#include "testutils.h"
+
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	UNUSED(pw); /* ignored; main() passes NULL alongside myrealloc */
+
+	return realloc(ptr, len); /* defer to the C library allocator */
+}
+
+int main(int argc, char **argv)
+{
+	parserutils_inputstream *stream;
+
+	/* This is specially calculated so that the inputstream is forced to 
+	 * reallocate (it assumes that the inputstream's buffer chunk size 
+	 * is 4k) */
+#define BUFFER_SIZE (4096 + 4)
+	uint8_t input_buffer[BUFFER_SIZE];
+//	uint8_t *buffer;
+//	size_t buflen;
+	uintptr_t c;
+	size_t clen;
+	/* Plan: append the buffer, append NULL/0, then read until EOF */
+	if (argc != 2) {
+		printf("Usage: %s <aliases_file>\n", argv[0]);
+		return 1;
+	}
+
+	/* Populate the buffer with something sane */
+	memset(input_buffer, 'a', BUFFER_SIZE);
+	/* Now, set up our test data */
+	input_buffer[BUFFER_SIZE - 1] = '5';
+	input_buffer[BUFFER_SIZE - 2] = '4';
+	input_buffer[BUFFER_SIZE - 3] = '\xbd';
+	input_buffer[BUFFER_SIZE - 4] = '\xbf';
+	/* This byte sits at offset 4095 (last byte of an assumed 4k chunk),
+	 * which is meant to keep the whole of U+FFFD buffered until after
+	 * the buffer has been enlarged */
+	input_buffer[BUFFER_SIZE - 5] = '\xef';
+	input_buffer[BUFFER_SIZE - 6] = '3';
+	input_buffer[BUFFER_SIZE - 7] = '2';
+	input_buffer[BUFFER_SIZE - 8] = '1';
+
+	assert(parserutils_initialise(argv[1], myrealloc, NULL) == 
+			PARSERUTILS_OK);
+
+	stream = parserutils_inputstream_create("UTF-8", 0, 
+			NULL, myrealloc, NULL);
+	assert(stream != NULL);
+
+	assert(parserutils_inputstream_append(stream, 
+			input_buffer, BUFFER_SIZE) == PARSERUTILS_OK);
+
+	assert(parserutils_inputstream_append(stream, NULL, 0) == 
+			PARSERUTILS_OK);
+
+	while ((c = parserutils_inputstream_peek(stream, 0, &clen)) != 
+			PARSERUTILS_INPUTSTREAM_EOF)
+		parserutils_inputstream_advance(stream, clen);
+
+/*
+	assert(css_inputstream_claim_buffer(stream, &buffer, &buflen) == 
+			CSS_OK);
+
+	assert(buflen == BUFFER_SIZE);
+
+	printf("Buffer: '%.*s'\n", 8, buffer + (BUFFER_SIZE - 8));
+
+	assert( buffer[BUFFER_SIZE - 6] == '3' && 
+		buffer[BUFFER_SIZE - 5] == (uint8_t) '\xef' && 
+		buffer[BUFFER_SIZE - 4] == (uint8_t) '\xbf' && 
+		buffer[BUFFER_SIZE - 3] == (uint8_t) '\xbd' && 
+		buffer[BUFFER_SIZE - 2] == '4');
+
+	free(buffer);
+*/
+
+	parserutils_inputstream_destroy(stream);
+
+	assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
+
diff --git a/test/testrunner.pl b/test/testrunner.pl
new file mode 100644
index 0000000..1c6c66d
--- /dev/null
+++ b/test/testrunner.pl
@@ -0,0 +1,152 @@
+#!/bin/perl
+#
+# Testcase runner
+#
+# Usage: testrunner <directory> [<executable extension>]
+#
+# Operates upon INDEX files described in the README.
+# Locates and executes testcases, feeding data files to programs
+# as appropriate.
+# Logs testcase output to <directory>/log.
+# Aborts the test sequence (exit status 1) on the first failure: a
+# testcase fails if it produces no output, if its last line of output
+# starts with "FAIL", or if it ends with a non-zero wait status.
+#
+
+use warnings;
+use strict;
+use File::Spec;
+use IPC::Open3;
+
+if (@ARGV < 1) {
+	print "Usage: testrunner.pl <directory> [<exeext>]\n";
+	# Being invoked incorrectly is an error; do not report success
+	exit(1);
+}
+
+# Get directory
+my $directory = shift @ARGV;
+
+# Get EXE extension (if any)
+my $exeext = (@ARGV > 0) ? shift @ARGV : "";
+
+# Open log file and /dev/null
+open(LOG, ">$directory/log") or die "Failed opening test log";
+open(NULL, "+<", File::Spec->devnull) or die "Failed opening /dev/null";
+
+# Open testcase index
+open(TINDEX, "<$directory/INDEX") or die "Failed opening test INDEX";
+
+# Parse testcase index, looking for testcases
+while (my $line = <TINDEX>) {
+	# Skip blank lines and comments
+	next if ($line =~ /^\s*(#.*)?$/);
+
+	# Found one; decompose into its tab-separated fields
+	my ($test, $desc, $data) = split /\t+/, $line;
+
+	# Strip whitespace (description and data directory are optional)
+	$desc = "" unless (defined $desc);
+	$test =~ s/^\s+|\s+$//g;
+	$desc =~ s/^\s+|\s+$//g;
+	$data =~ s/^\s+|\s+$//g if ($data);
+
+	# Append EXE extension to binary name
+	$test = $test . $exeext;
+
+	print "Test: $desc\n";
+
+	if ($data) {
+		# Testcase has external data files
+
+		# Open datafile index
+		open(DINDEX, "<$directory/data/$data/INDEX") or
+			die "Failed opening $directory/data/$data/INDEX";
+
+		# Run the testcase once per datafile listed in the index
+		while (my $dentry = <DINDEX>) {
+			next if ($dentry =~ /^\s*(#.*)?$/);
+
+			# Only the filename field is needed
+			my ($dtest) = split /\t+/, $dentry;
+			$dtest =~ s/^\s+|\s+$//g;
+
+			run_test("$test [$data/$dtest]",
+					"$directory/$test",
+					"$directory/data/Aliases",
+					"$directory/data/$data/$dtest");
+		}
+
+		close(DINDEX);
+	} else {
+		# Testcase has no external data files
+		run_test($test, "$directory/$test", "$directory/data/Aliases");
+	}
+
+	print "\n";
+}
+
+# Clean up
+close(TINDEX);
+
+close(NULL);
+close(LOG);
+
+# Run a single testcase, copying its output to the log.
+# Terminates the runner with exit status 1 if the testcase fails.
+#
+#   $label    Text identifying the testcase in the progress display
+#   @command  Program to run, followed by its arguments
+sub run_test
+{
+	my ($label, @command) = @_;
+
+	print LOG "Running @command\n";
+
+	# Make message fit on an 80 column terminal
+	my $msg = "    ==> $label";
+	my $padding = 80 - length($msg) - 8;
+	$msg = $msg . "." x $padding if ($padding > 0);
+
+	print $msg;
+
+	# Run testcase
+	# NOTE(review): IPC::Open3 documents "<&NULL" as the way to hand an
+	# existing handle to the child; "&<NULL" presumably leaves the child
+	# with an idle pipe on stdin instead of /dev/null -- confirm.
+	my $pid = open3("&<NULL", \*OUT, \*ERR, @command);
+
+	my $last = "FAIL";
+
+	# Marshal testcase output to log file
+	# NOTE(review): stderr is only drained after a failure, so a child that
+	# fills the stderr pipe before closing stdout would presumably stall.
+	while (my $output = <OUT>) {
+		print LOG "    $output";
+		$last = $output;
+	}
+
+	# Wait for child to finish, keeping its wait status if it was reaped
+	my $status = (waitpid($pid, 0) == $pid) ? $? : 0;
+
+	# Output alone is not proof of success: a testcase that crashes or
+	# exits non-zero after printing something else has failed as well
+	my $result = substr($last, 0, 4);
+	$result = "FAIL" if ($status != 0);
+
+	print $result . "\n";
+
+	return if ($result ne "FAIL");
+
+	# Bail, noisily, on failure
+
+	# Write any stderr output to the log
+	while (my $errors = <ERR>) {
+		print LOG "    $errors";
+	}
+	print LOG "    Wait status: $status\n" if ($status != 0);
+
+	print "\n\nFailure detected: consult log file\n\n\n";
+
+	exit(1);
+}
diff --git a/test/testutils.h b/test/testutils.h
new file mode 100644
index 0000000..c91c5b8
--- /dev/null
+++ b/test/testutils.h
@@ -0,0 +1,123 @@
+#ifndef test_testutils_h_
+#define test_testutils_h_
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef UNUSED
+#define UNUSED(x) ((void) (x))	/* also valid for const-qualified objects */
+#endif
+
+/* Replacement for the standard assert(): a failed assertion prints a line
+ * starting with "FAIL" (expression text and line number only) on stdout
+ * and exits with EXIT_FAILURE, which is what the testrunner checks for. */
+void __assert2(const char *expr, const char *function,
+		const char *file, int line);
+
+void __assert2(const char *expr, const char *function,
+		const char *file, int line)
+{
+	UNUSED(function);
+	UNUSED(file);
+	printf("FAIL - %s at line %d\n", expr, line);
+	exit(EXIT_FAILURE);
+}
+
+/* Discard any <assert.h> definition rather than redefine it in place */
+#undef assert
+#define assert(expr) \
+  ((void) ((expr) || (__assert2 (#expr, __func__, __FILE__, __LINE__), 0)))
+
+/* Per-line callback for parse_testfile(); returning false stops parsing */
+typedef bool (*line_func)(const char *data, size_t datalen, void *pw);
+
+static size_t parse_strlen(const char *str, size_t limit);
+bool parse_testfile(const char *filename, line_func callback, void *pw);
+size_t parse_filesize(const char *filename);
+
+/**
+ * Feed a testcase datafile to a callback, one line at a time
+ *
+ * Input is read in chunks of at most 299 bytes; empty lines are skipped
+ * and processing stops as soon as the callback returns false.
+ *
+ * \param filename  Name of file to parse
+ * \param callback  Function invoked with each line and its length
+ * \param pw        Client-specific private data, passed to \a callback
+ * \return true if the whole file was processed, false otherwise.
+ */
+bool parse_testfile(const char *filename, line_func callback, void *pw)
+{
+	char line[300];
+	bool ok = true;
+	FILE *input = fopen(filename, "rb");
+
+	if (input == NULL) {
+		printf("Failed opening %s\n", filename);
+		return false;
+	}
+
+	while (ok && fgets(line, sizeof line, input) != NULL) {
+		if (line[0] == '\n')
+			continue;
+
+		ok = callback(line, parse_strlen(line, sizeof line), pw);
+	}
+
+	fclose(input);
+
+	return ok;
+}
+
+/**
+ * Measure a '\n'-terminated line. Only '\n' ends the line: NUL bytes
+ * are counted like any other byte.
+ *
+ * \param str    Line to measure (may be NULL)
+ * \param limit  Size, in bytes, of the buffer holding \a str
+ * \return Length including the '\n', or \a limit when no '\n' occurs in
+ *         the first limit - 1 bytes; 0 if \a str is NULL or \a limit is 0
+ */
+static size_t parse_strlen(const char *str, size_t limit)
+{
+	size_t len = 0;
+
+	/* A zero limit would wrap "limit - 1" and leave the scan unbounded */
+	if (str == NULL || limit == 0)
+		return 0;
+
+	while (len < limit - 1 && str[len] != '\n')
+		len++;
+
+	/* Account for the '\n' (or for the last byte when none was seen) */
+	return len + 1;
+}
+
+/**
+ * Read the size of a file
+ *
+ * \param filename  Name of file to read size of
+ * \return File size (in bytes), or 0 if it cannot be opened or measured
+ */
+size_t parse_filesize(const char *filename)
+{
+	FILE *fp = fopen(filename, "rb");
+	long end = -1;
+
+	if (fp == NULL) {
+		printf("Failed opening %s\n", filename);
+		return 0;
+	}
+
+	/* ftell() signals failure with -1L; keep that out of the size_t */
+	if (fseek(fp, 0, SEEK_END) == 0)
+		end = ftell(fp);
+
+	fclose(fp);
+
+	return (end < 0) ? 0 : (size_t) end;
+}
+
+
+#endif
-- 
cgit v1.2.3