From 3479055b4a609032a1775871cc685fd7dd33ab32 Mon Sep 17 00:00:00 2001
From: John Mark Bell <jmb@netsurf-browser.org>
Date: Tue, 3 Mar 2009 18:08:01 +0000
Subject: Rationalise dom_string (some consideration is required as to what
 happens wrt interning -- lwc_strings should probably be used). Purge charset
 handling -- a) documents are always converted to utf-8; b) use parserutils
 for utf-8 handling. Fix Hubbub binding to compile.

svn path=/trunk/dom/; revision=6682
---
 bindings/hubbub/parser.c          |  64 ++--
 bindings/xml/xmlbinding.c         |   5 +-
 bindings/xml/xmlparser.c          |  36 +--
 include/dom/bootstrap/implpriv.h  |   5 +-
 include/dom/core/document.h       |   3 +
 include/dom/core/implementation.h |   1 -
 include/dom/core/string.h         |  19 +-
 src/core/attr.c                   |   2 +-
 src/core/document.c               | 113 ++-----
 src/core/document.h               |   6 -
 src/core/implementation.c         |   4 +-
 src/core/node.c                   |  21 +-
 src/core/string.c                 | 632 ++++++++------------------------------
 src/utils/Makefile                |   2 +-
 src/utils/namespace.c             |  13 +-
 src/utils/utf16.c                 | 239 --------------
 src/utils/utf16.h                 |  38 ---
 src/utils/utf8.c                  | 368 ----------------------
 src/utils/utf8.h                  |  38 ---
 19 files changed, 202 insertions(+), 1407 deletions(-)
 delete mode 100644 src/utils/utf16.c
 delete mode 100644 src/utils/utf16.h
 delete mode 100644 src/utils/utf8.c
 delete mode 100644 src/utils/utf8.h

diff --git a/bindings/hubbub/parser.c b/bindings/hubbub/parser.c
index 9473438..7b5e6ab 100644
--- a/bindings/hubbub/parser.c
+++ b/bindings/hubbub/parser.c
@@ -20,7 +20,6 @@
  */
 struct dom_hubbub_parser {
 	hubbub_parser *parser;		/**< Hubbub parser instance */
-	const uint8_t *buffer;		/**< Parser buffer pointer */
 
 	struct dom_document *doc;	/**< DOM Document we're building */
 
@@ -35,9 +34,8 @@ struct dom_hubbub_parser {
 	void *mctx;			/**< Pointer to client data */
 };
 
-static void __dom_hubbub_buffer_handler(const uint8_t *buffer, size_t len, 
+static hubbub_error __dom_hubbub_token_handler(const hubbub_token *token, 
 		void *pw);
-static void __dom_hubbub_token_handler(const hubbub_token *token, void *pw);
 
 static bool __initialised;
 
@@ -63,6 +61,8 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases,
 	dom_exception err;
 	hubbub_error e;
 
+	UNUSED(int_enc);
+
 	if (__initialised == false) {
 		e = hubbub_initialise(aliases, (hubbub_alloc) alloc, pw);
 		if (e != HUBBUB_OK) {
@@ -80,23 +80,11 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases,
 		return NULL;
 	}
 
-	parser->parser = hubbub_parser_create(enc, int_enc, 
-			(hubbub_alloc) alloc, pw);
-	if (parser->parser == NULL) {
-		alloc(parser, 0, pw);
-		msg(DOM_MSG_CRITICAL, mctx, "Failed to create hubbub parser");
-		return NULL;
-	}
-
-	params.buffer_handler.handler = __dom_hubbub_buffer_handler;
-	params.buffer_handler.pw = parser;
-	e = hubbub_parser_setopt(parser->parser, HUBBUB_PARSER_BUFFER_HANDLER,
-			&params);
+	e = hubbub_parser_create(enc, true, (hubbub_alloc) alloc, pw,
+			&parser->parser);
 	if (e != HUBBUB_OK) {
-		hubbub_parser_destroy(parser->parser);
 		alloc(parser, 0, pw);
-		msg(DOM_MSG_CRITICAL, mctx, 
-				"Failed registering hubbub buffer handler");
+		msg(DOM_MSG_CRITICAL, mctx, "Failed to create hubbub parser");
 		return NULL;
 	}
 
@@ -118,8 +106,7 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases,
 
 	/* Get DOM implementation */
 	/* Create string representation of the features we want */
-	err = dom_string_create_from_ptr_no_doc(alloc, pw,
-			DOM_STRING_UTF8,
+	err = dom_string_create(alloc, pw,
 			(const uint8_t *) "HTML", SLEN("HTML"), &features);
 	if (err != DOM_NO_ERR) {
 		hubbub_parser_destroy(parser->parser);
@@ -202,17 +189,7 @@ struct dom_document *dom_hubbub_parser_get_document(dom_hubbub_parser *parser)
 	return (parser->complete ? parser->doc : NULL);
 }
 
-void __dom_hubbub_buffer_handler(const uint8_t *buffer, size_t len, 
-		void *pw)
-{
-	dom_hubbub_parser *parser = (dom_hubbub_parser *) pw;
-
-	UNUSED(len);
-
-	parser->buffer = buffer;
-}
-
-void __dom_hubbub_token_handler(const hubbub_token *token, void *pw)
+hubbub_error __dom_hubbub_token_handler(const hubbub_token *token, void *pw)
 {
 	dom_hubbub_parser *parser = (dom_hubbub_parser *) pw;
 	static const char *token_names[] = {
@@ -221,55 +198,58 @@ void __dom_hubbub_token_handler(const hubbub_token *token, void *pw)
 	};
 	size_t i;
 
+	UNUSED(parser);
+
 	printf("%s: ", token_names[token->type]);
 
 	switch (token->type) {
 	case HUBBUB_TOKEN_DOCTYPE:
 		printf("'%.*s' (%svalid)\n",
 				(int) token->data.doctype.name.len,
-				parser->buffer + 
-					token->data.doctype.name.data_off,
-				token->data.doctype.correct ? "" : "in");
+				token->data.doctype.name.ptr,
+				token->data.doctype.force_quirks ? "in" : "");
 		break;
 	case HUBBUB_TOKEN_START_TAG:
 		printf("'%.*s' %s\n",
 				(int) token->data.tag.name.len,
-				parser->buffer + token->data.tag.name.data_off,
+				token->data.tag.name.ptr,
 				(token->data.tag.n_attributes > 0) ?
 						"attributes:" : "");
 		for (i = 0; i < token->data.tag.n_attributes; i++) {
 			printf("\t'%.*s' = '%.*s'\n",
 					(int) token->data.tag.attributes[i].name.len,
-					parser->buffer + token->data.tag.attributes[i].name.data_off,
+					token->data.tag.attributes[i].name.ptr,
 					(int) token->data.tag.attributes[i].value.len,
-					parser->buffer + token->data.tag.attributes[i].value.data_off);
+					token->data.tag.attributes[i].value.ptr);
 		}
 		break;
 	case HUBBUB_TOKEN_END_TAG:
 		printf("'%.*s' %s\n",
 				(int) token->data.tag.name.len,
-				parser->buffer + token->data.tag.name.data_off,
+				token->data.tag.name.ptr,
 				(token->data.tag.n_attributes > 0) ?
 						"attributes:" : "");
 		for (i = 0; i < token->data.tag.n_attributes; i++) {
 			printf("\t'%.*s' = '%.*s'\n",
 					(int) token->data.tag.attributes[i].name.len,
-					parser->buffer + token->data.tag.attributes[i].name.data_off,
+					token->data.tag.attributes[i].name.ptr,
 					(int) token->data.tag.attributes[i].value.len,
-					parser->buffer + token->data.tag.attributes[i].value.data_off);
+					token->data.tag.attributes[i].value.ptr);
 		}
 		break;
 	case HUBBUB_TOKEN_COMMENT:
 		printf("'%.*s'\n", (int) token->data.comment.len,
-				parser->buffer + token->data.comment.data_off);
+				token->data.comment.ptr);
 		break;
 	case HUBBUB_TOKEN_CHARACTER:
 		printf("'%.*s'\n", (int) token->data.character.len,
-				parser->buffer + token->data.character.data_off);
+				token->data.character.ptr);
 		break;
 	case HUBBUB_TOKEN_EOF:
 		printf("\n");
 		break;
 	}
+
+	return HUBBUB_OK;
 }
 
diff --git a/bindings/xml/xmlbinding.c b/bindings/xml/xmlbinding.c
index 2bbfb7b..b03b7af 100644
--- a/bindings/xml/xmlbinding.c
+++ b/bindings/xml/xmlbinding.c
@@ -38,7 +38,6 @@ static dom_exception xml_dom_implementation_create_document(
 		struct dom_string *qname,
 		struct dom_document_type *doctype,
 		struct dom_document **doc,
-		dom_string_charset charset,
 		dom_alloc alloc, void *pw);
 static dom_exception xml_dom_implementation_get_feature(
 		struct dom_implementation *impl,
@@ -237,7 +236,6 @@ dom_exception xml_dom_implementation_create_document_type(
  * \param qname      The qualified name of the document element
  * \param doctype    The type of document to create
  * \param doc        Pointer to location to receive result
- * \param charset    The charset to use for strings in the document
  * \param alloc      Memory (de)allocation function
  * \param pw         Pointer to client-specific private data
  * \return DOM_NO_ERR on success,
@@ -274,14 +272,13 @@ dom_exception xml_dom_implementation_create_document(
 		struct dom_string *qname,
 		struct dom_document_type *doctype,
 		struct dom_document **doc,
-		dom_string_charset charset,
 		dom_alloc alloc, void *pw)
 {
 	struct dom_document *d;
 	dom_exception err;
 
 	/* Create document object */
-	err = dom_document_create(impl, charset, alloc, pw, &d);
+	err = dom_document_create(impl, alloc, pw, &d);
 	if (err != DOM_NO_ERR)
 		return err;
 
diff --git a/bindings/xml/xmlparser.c b/bindings/xml/xmlparser.c
index 743a826..9e3786f 100644
--- a/bindings/xml/xmlparser.c
+++ b/bindings/xml/xmlparser.c
@@ -181,8 +181,7 @@ dom_xml_parser *dom_xml_parser_create(const char *enc, const char *int_enc,
 	parser->complete = false;
 
 	/* Create key for user data registration */
-	err = dom_string_create_from_ptr_no_doc((dom_alloc) alloc, pw,
-			DOM_STRING_UTF8, 
+	err = dom_string_create((dom_alloc) alloc, pw,
 			(const uint8_t *) "__xmlnode", SLEN("__xmlnode"),
 			&parser->udkey);
 	if (err != DOM_NO_ERR) {
@@ -194,8 +193,7 @@ dom_xml_parser *dom_xml_parser_create(const char *enc, const char *int_enc,
 
 	/* Get DOM implementation */
 	/* Create a string representation of the features we want */
-	err = dom_string_create_from_ptr_no_doc((dom_alloc) alloc, pw,
-			DOM_STRING_UTF8,
+	err = dom_string_create((dom_alloc) alloc, pw,
 			(const uint8_t *) "XML", SLEN("XML"), &features);
 	if (err != DOM_NO_ERR) {
 		dom_string_unref(parser->udkey);
@@ -329,7 +327,6 @@ void xml_parser_start_document(void *ctx)
 			/* qname */ NULL,
 			/* doctype */ NULL,
 			&doc,
-			DOM_STRING_UTF8,
 			(dom_alloc) parser->alloc,
 			parser->pw);
 	if (err != DOM_NO_ERR) {
@@ -650,9 +647,8 @@ void xml_parser_add_element_node(dom_xml_parser *parser,
 		struct dom_string *tag_name;
 
 		/* Create tag name DOM string */
-		err = dom_string_create_from_const_ptr(parser->doc,
-				child->name,
-				strlen((const char *) child->name),
+		err = dom_document_create_string(parser->doc,
+				child->name, strlen((const char *) child->name),
 				&tag_name);
 		if (err != DOM_NO_ERR) {
 			parser->msg(DOM_MSG_CRITICAL, parser->mctx,
@@ -684,7 +680,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser,
 		uint8_t qnamebuf[qnamelen + 1 /* '\0' */];
 
 		/* Create namespace DOM string */
-		err = dom_string_create_from_const_ptr(parser->doc,
+		err = dom_document_create_string(parser->doc,
 				child->ns->href,
 				strlen((const char *) child->ns->href),
 				&namespace);
@@ -703,7 +699,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser,
 			(const char *) child->name);
 
 		/* Create qname DOM string */
-		err = dom_string_create_from_ptr(parser->doc,
+		err = dom_document_create_string(parser->doc,
 				qnamebuf,
 				qnamelen,
 				&qname);
@@ -742,7 +738,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser,
 			struct dom_string *name;
 
 			/* Create attribute name DOM string */
-			err = dom_string_create_from_const_ptr(parser->doc,
+			err = dom_document_create_string(parser->doc,
 					a->name,
 					strlen((const char *) a->name),
 					&name);
@@ -776,7 +772,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser,
 			uint8_t qnamebuf[qnamelen + 1 /* '\0' */];
 
 			/* Create namespace DOM string */
-			err = dom_string_create_from_const_ptr(parser->doc,
+			err = dom_document_create_string(parser->doc,
 					a->ns->href,
 					strlen((const char *) a->ns->href),
 					&namespace);
@@ -795,7 +791,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser,
 				(const char *) a->name);
 
 			/* Create qname DOM string */
-			err = dom_string_create_from_ptr(parser->doc,
+			err = dom_document_create_string(parser->doc,
 					qnamebuf,
 					qnamelen,
 					&qname);
@@ -904,7 +900,7 @@ void xml_parser_add_text_node(dom_xml_parser *parser, struct dom_node *parent,
 	dom_exception err;
 
 	/* Create DOM string data for text node */
-	err = dom_string_create_from_const_ptr(parser->doc, child->content,
+	err = dom_document_create_string(parser->doc, child->content,
 			strlen((const char *) child->content), &data);
 	if (err != DOM_NO_ERR) {
 		parser->msg(DOM_MSG_CRITICAL, parser->mctx,
@@ -965,7 +961,7 @@ void xml_parser_add_cdata_section(dom_xml_parser *parser,
 	dom_exception err;
 
 	/* Create DOM string data for cdata section */
-	err = dom_string_create_from_const_ptr(parser->doc, child->content,
+	err = dom_document_create_string(parser->doc, child->content,
 			strlen((const char *) child->content), &data);
 	if (err != DOM_NO_ERR) {
 		parser->msg(DOM_MSG_CRITICAL, parser->mctx,
@@ -1027,7 +1023,7 @@ void xml_parser_add_entity_reference(dom_xml_parser *parser,
 	dom_exception err;
 
 	/* Create name of entity reference */
-	err = dom_string_create_from_const_ptr(parser->doc, child->name,
+	err = dom_document_create_string(parser->doc, child->name,
 			strlen((const char *) child->name), &name);
 	if (err != DOM_NO_ERR) {
 		parser->msg(DOM_MSG_CRITICAL, parser->mctx,
@@ -1094,7 +1090,7 @@ void xml_parser_add_comment(dom_xml_parser *parser, struct dom_node *parent,
 	dom_exception err;
 
 	/* Create DOM string data for comment */
-	err = dom_string_create_from_const_ptr(parser->doc, child->content,
+	err = dom_document_create_string(parser->doc, child->content,
 			strlen((const char *) child->content), &data);
 	if (err != DOM_NO_ERR) {
 		parser->msg(DOM_MSG_CRITICAL, parser->mctx,
@@ -1156,7 +1152,7 @@ void xml_parser_add_document_type(dom_xml_parser *parser,
 	dom_exception err;
 
 	/* Create qname for doctype */
-	err = dom_string_create_from_const_ptr(parser->doc, dtd->name,
+	err = dom_document_create_string(parser->doc, dtd->name,
 			strlen((const char *) dtd->name), &qname);
 	if (err != DOM_NO_ERR) {
 		parser->msg(DOM_MSG_CRITICAL, parser->mctx,
@@ -1165,7 +1161,7 @@ void xml_parser_add_document_type(dom_xml_parser *parser,
 	}
 
 	/* Create public ID for doctype */
-	err = dom_string_create_from_const_ptr(parser->doc,
+	err = dom_document_create_string(parser->doc,
 			dtd->ExternalID,
 			(dtd->ExternalID == NULL) ? 0
 				: strlen((const char *) dtd->ExternalID),
@@ -1178,7 +1174,7 @@ void xml_parser_add_document_type(dom_xml_parser *parser,
 	}
 
 	/* Create system ID for doctype */
-	err = dom_string_create_from_const_ptr(parser->doc,
+	err = dom_document_create_string(parser->doc,
 			dtd->SystemID,
 			(dtd->SystemID == NULL) ? 0
 				: strlen((const char *) dtd->SystemID),
diff --git a/include/dom/bootstrap/implpriv.h b/include/dom/bootstrap/implpriv.h
index 97806a8..c99a9d2 100644
--- a/include/dom/bootstrap/implpriv.h
+++ b/include/dom/bootstrap/implpriv.h
@@ -94,7 +94,6 @@ struct dom_implementation {
 	 * \param qname      The qualified name of the document element
 	 * \param doctype    The type of document to create
 	 * \param doc        Pointer to location to receive result
-	 * \param charset    The charset to use for strings in the document
 	 * \param alloc      Memory (de)allocation function
 	 * \param pw         Pointer to client-specific private data
 	 * \return DOM_NO_ERR on success,
@@ -130,7 +129,6 @@ struct dom_implementation {
 			struct dom_string *qname,
 			struct dom_document_type *doctype,
 			struct dom_document **doc,
-			dom_string_charset charset,
 			dom_alloc alloc, void *pw);
 
 	/**
@@ -251,8 +249,7 @@ dom_exception dom_register_source(struct dom_implementation_source *source,
 
 /* Create a DOM document */
 dom_exception dom_document_create(struct dom_implementation *impl,
-		dom_string_charset charset, dom_alloc alloc, void *pw, 
-		struct dom_document **doc);
+		dom_alloc alloc, void *pw, struct dom_document **doc);
 
 /* Set a document's buffer */
 void dom_document_set_buffer(struct dom_document *doc, uint8_t *buffer, 
diff --git a/include/dom/core/document.h b/include/dom/core/document.h
index cce8e4b..6a5fd9f 100644
--- a/include/dom/core/document.h
+++ b/include/dom/core/document.h
@@ -9,6 +9,7 @@
 #define dom_core_document_h_
 
 #include <stdbool.h>
+#include <stdint.h>
 
 #include <dom/core/exceptions.h>
 
@@ -98,5 +99,7 @@ dom_exception dom_document_rename_node(struct dom_document *doc,
 		struct dom_node *node,
 		struct dom_string *namespace, struct dom_string *qname,
 		struct dom_node **result);
+dom_exception dom_document_create_string(struct dom_document *doc,
+		const uint8_t *data, size_t len, struct dom_string **result);
 
 #endif
diff --git a/include/dom/core/implementation.h b/include/dom/core/implementation.h
index a51493f..5e26432 100644
--- a/include/dom/core/implementation.h
+++ b/include/dom/core/implementation.h
@@ -37,7 +37,6 @@ dom_exception dom_implementation_create_document(
 		struct dom_string *namespace, struct dom_string *qname,
 		struct dom_document_type *doctype,
 		struct dom_document **doc,
-		dom_string_charset charset,
 		dom_alloc alloc, void *pw);
 
 dom_exception dom_implementation_get_feature(
diff --git a/include/dom/core/string.h b/include/dom/core/string.h
index e3dfa30..8da9dd7 100644
--- a/include/dom/core/string.h
+++ b/include/dom/core/string.h
@@ -14,33 +14,16 @@
 #include <dom/functypes.h>
 #include <dom/core/exceptions.h>
 
-struct dom_document;
 struct dom_string;
 
-typedef enum {
-	DOM_STRING_UTF8,
-	DOM_STRING_UTF16
-} dom_string_charset;
-
 /* Claim a reference on a DOM string */
 void dom_string_ref(struct dom_string *str);
 /* Release a reference on a DOM string */
 void dom_string_unref(struct dom_string *str);
 
-/* Create a DOM string from an offset into the document buffer */
-dom_exception dom_string_create_from_off(struct dom_document *doc,
-		uint32_t off, size_t len, struct dom_string **str);
 /* Create a DOM string from a string of characters */
-dom_exception dom_string_create_from_ptr(struct dom_document *doc,
-		const uint8_t *ptr, size_t len, struct dom_string **str);
-/* Create a DOM string from a constant string of characters */
-dom_exception dom_string_create_from_const_ptr(struct dom_document *doc,
+dom_exception dom_string_create(dom_alloc alloc, void *pw,
 		const uint8_t *ptr, size_t len, struct dom_string **str);
-/* Create a DOM string from a string of characters that does not belong
- * to a document */
-dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw,
-		dom_string_charset charset, const uint8_t *ptr, size_t len, 
-		struct dom_string **str);
 
 /* Case sensitively compare two DOM strings */
 int dom_string_cmp(struct dom_string *s1, struct dom_string *s2);
diff --git a/src/core/attr.c b/src/core/attr.c
index a82f117..5a85ac0 100644
--- a/src/core/attr.c
+++ b/src/core/attr.c
@@ -180,7 +180,7 @@ dom_exception dom_attr_get_value(struct dom_attr *attr,
 	struct dom_string *value, *temp;
 	dom_exception err;
 
-	err = dom_string_create_from_const_ptr(a->owner, 
+	err = dom_document_create_string(a->owner, 
 			(const uint8_t *) "", SLEN(""), &value);
 	if (err != DOM_NO_ERR) {
 		return err;
diff --git a/src/core/document.c b/src/core/document.c
index 3e06541..74283f9 100644
--- a/src/core/document.c
+++ b/src/core/document.c
@@ -56,8 +56,6 @@ struct dom_doc_nnm {
 struct dom_document {
 	struct dom_node base;		/**< Base node */
 
-	dom_string_charset charset;	/**< Charset of strings in document */
-
 	struct dom_implementation *impl;	/**< Owning implementation */
 
 	struct dom_doc_nl *nodelists;	/**< List of active nodelists */
@@ -73,7 +71,6 @@ struct dom_document {
 /** Interned node name strings, indexed by node type */
 /* Index 0 is unused */
 static struct dom_string *__nodenames_utf8[DOM_NODE_TYPE_COUNT + 1];
-static struct dom_string *__nodenames_utf16[DOM_NODE_TYPE_COUNT + 1];
 
 /**
  * Initialise the document module
@@ -102,27 +99,6 @@ dom_exception _dom_document_initialise(dom_alloc alloc, void *pw)
 		{ "#document-fragment",	18 },	/* Document fragment */
 		{ NULL,			0 }	/* Notation */
 	};
-
-	/** \todo This assumes Little Endian */
-	static struct {
-		const char *name;
-		size_t len;
-	} names_utf16[DOM_NODE_TYPE_COUNT + 1] = {
-		{ NULL,			0 },	/* Unused */
-		{ NULL,			0 },	/* Element */
-		{ NULL,			0 },	/* Attr */
-		{ "#\0t\0e\0x\0t\0",	10 },	/* Text */
-		{ "#\0c\0d\0a\0t\0a\0-\0s\0e\0c\0t\0i\0o\0n\0",	28 },	/* CDATA section */
-		{ NULL,			0 },	/* Entity reference */
-		{ NULL,			0 },	/* Entity */
-		{ NULL,			0 },	/* Processing instruction */
-		{ "#\0c\0o\0m\0m\0e\0n\0t\0",		16 },	/* Comment */
-		{ "#\0d\0o\0c\0u\0m\0e\0n\0t\0",		18 },	/* Document */
-		{ NULL,			0 },	/* Document type */
-		{ "#\0d\0o\0c\0u\0m\0e\0n\0t\0-\0f\0r\0a\0g\0m\0e\0n\0t\0",	36 },	/* Document fragment */
-		{ NULL,			0 }	/* Notation */
-	};
-
 	dom_exception err;
 
 	/* Initialise interned node names */
@@ -130,13 +106,11 @@ dom_exception _dom_document_initialise(dom_alloc alloc, void *pw)
 		if (names_utf8[i].name == NULL) {
 			/* Nothing to intern; skip this entry */
 			__nodenames_utf8[i] = NULL;
-			__nodenames_utf16[i] = NULL;
 			continue;
 		}
 
 		/* Make string */
-		err = dom_string_create_from_ptr_no_doc(alloc, pw,
-				DOM_STRING_UTF8,
+		err = dom_string_create(alloc, pw,
 				(const uint8_t *) names_utf8[i].name,
 				names_utf8[i].len, &__nodenames_utf8[i]);
 		if (err != DOM_NO_ERR) {
@@ -144,29 +118,10 @@ dom_exception _dom_document_initialise(dom_alloc alloc, void *pw)
 			for (int j = 0; j < i; j++) {
 				if (__nodenames_utf8[j] != NULL) {
 					dom_string_unref(__nodenames_utf8[j]);
-					dom_string_unref(__nodenames_utf16[j]);
 				}
 			}
 			return err;
 		}
-
-		err = dom_string_create_from_ptr_no_doc(alloc, pw,
-				DOM_STRING_UTF16,
-				(const uint8_t *) names_utf16[i].name,
-				names_utf16[i].len, &__nodenames_utf16[i]);
-		if (err != DOM_NO_ERR) {
-			/* Failed, clean up strings we've created so far */
-			for (int j = 0; j < i; j++) {
-				if (__nodenames_utf8[j] != NULL) {
-					dom_string_unref(__nodenames_utf8[j]);
-					dom_string_unref(__nodenames_utf16[j]);
-				}
-			}
-
-			dom_string_unref(__nodenames_utf8[i]);
-
-			return err;
-		}
 	}
 
 	return DOM_NO_ERR;
@@ -182,7 +137,6 @@ dom_exception _dom_document_finalise(void)
 	for (int i = 0; i <= DOM_NODE_TYPE_COUNT; i++) {
 		if (__nodenames_utf8[i] != NULL) {
 			dom_string_unref(__nodenames_utf8[i]);
-			dom_string_unref(__nodenames_utf16[i]);
 		}
 	}
 
@@ -193,7 +147,6 @@ dom_exception _dom_document_finalise(void)
  * Create a Document
  *
  * \param impl     The DOM implementation owning the document
- * \param charset  The charset used for strings in the document
  * \param alloc    Memory (de)allocation function
  * \param pw       Pointer to client-specific private data
  * \param doc      Pointer to location to receive created document
@@ -204,8 +157,7 @@ dom_exception _dom_document_finalise(void)
  * The returned document will already be referenced.
  */
 dom_exception dom_document_create(struct dom_implementation *impl,
-		dom_string_charset charset, dom_alloc alloc, void *pw, 
-		struct dom_document **doc)
+		dom_alloc alloc, void *pw, struct dom_document **doc)
 {
 	struct dom_document *d;
 	dom_exception err;
@@ -233,7 +185,6 @@ dom_exception dom_document_create(struct dom_implementation *impl,
 	}
 
 	/* Initialise remaining type-specific data */
-	d->charset = charset;
 	if (impl != NULL)
 		dom_implementation_ref(impl);
 	d->impl = impl;
@@ -241,8 +192,7 @@ dom_exception dom_document_create(struct dom_implementation *impl,
 	d->nodelists = NULL;
 	d->maps = NULL;
 
-	d->nodenames = (charset == DOM_STRING_UTF8) ? __nodenames_utf8 
-						    : __nodenames_utf16;
+	d->nodenames = __nodenames_utf8;
 
 	*doc = d;
 
@@ -1047,55 +997,30 @@ dom_exception dom_document_rename_node(struct dom_document *doc,
 	return DOM_NOT_SUPPORTED_ERR;
 }
 
-/*                                                                         */
-/* ----------------------------------------------------------------------- */
-/*                                                                         */
-
 /**
- * Acquire a pointer to the base of the document buffer
- *
- * \param doc  Document to retrieve pointer from
- * \return Pointer to document buffer
+ * Create a DOM string, using a document's allocation context
  *
- * The document buffer is _not_ reference counted (as it is an implicit part
- * of the document). It is destroyed with the document, and thus after all
- * users have been destroyed.
- */
-const uint8_t *dom_document_get_base(struct dom_document *doc)
-{
-	UNUSED(doc);
-
-	return NULL;
-}
-
-/**
- * Set the document buffer pointer
+ * \param doc     The document
+ * \param data    Pointer to string data
+ * \param len     Length, in bytes, of string
+ * \param result  Pointer to location to receive result
+ * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
  *
- * \param doc         Document to set buffer pointer of
- * \param buffer      Pointer to buffer
- * \param buffer_len  Length of buffer, in bytes
+ * The returned string will already be referenced, so there is no need
+ * to explicitly reference it.
  *
- * By calling this, ownership of the buffer is transferred to the document.
- * It should be called once per document node.
+ * The string of characters passed in will be copied for use by the
+ * returned DOM string.
  */
-void dom_document_set_buffer(struct dom_document *doc, uint8_t *buffer,
-		size_t buffer_len)
+dom_exception dom_document_create_string(struct dom_document *doc,
+		const uint8_t *data, size_t len, struct dom_string **result)
 {
-	UNUSED(doc);
-	UNUSED(buffer);
-	UNUSED(buffer_len);
+	return dom_string_create(doc->alloc, doc->pw, data, len, result);
 }
 
-/**
- * Retrieve the character set used to encode strings in the document
- *
- * \param doc  The document to get the charset of
- * \return The charset in use
- */
-dom_string_charset dom_document_get_charset(struct dom_document *doc)
-{
-	return doc->charset;
-}
+/*                                                                         */
+/* ----------------------------------------------------------------------- */
+/*                                                                         */
 
 /**
  * (De)allocate memory with a document's context
diff --git a/src/core/document.h b/src/core/document.h
index 6982b74..c5c13ac 100644
--- a/src/core/document.h
+++ b/src/core/document.h
@@ -27,12 +27,6 @@ dom_exception _dom_document_finalise(void);
 /* Destroy a document */
 void dom_document_destroy(struct dom_document *doc);
 
-/* Get base of document buffer */
-const uint8_t *dom_document_get_base(struct dom_document *doc);
-
-/* Get the document character set */ 
-dom_string_charset dom_document_get_charset(struct dom_document *doc);
-
 /* (De)allocate memory */
 void *dom_document_alloc(struct dom_document *doc, void *ptr, size_t size);
 
diff --git a/src/core/implementation.c b/src/core/implementation.c
index 9738b7c..e37b27d 100644
--- a/src/core/implementation.c
+++ b/src/core/implementation.c
@@ -94,7 +94,6 @@ dom_exception dom_implementation_create_document_type(
  * \param qname      The qualified name of the document element
  * \param doctype    The type of document to create
  * \param doc        Pointer to location to receive result
- * \param charset    The charset to use for strings in the document
  * \param alloc      Memory (de)allocation function
  * \param pw         Pointer to client-specific private data
  * \return DOM_NO_ERR on success,
@@ -127,11 +126,10 @@ dom_exception dom_implementation_create_document(
 		struct dom_string *namespace, struct dom_string *qname,
 		struct dom_document_type *doctype,
 		struct dom_document **doc,
-		dom_string_charset charset,
 		dom_alloc alloc, void *pw)
 {
 	return impl->create_document(impl, namespace, qname, doctype, doc,
-			charset, alloc, pw);
+			alloc, pw);
 }
 
 /**
diff --git a/src/core/node.c b/src/core/node.c
index 2284e4f..0eebfb0 100644
--- a/src/core/node.c
+++ b/src/core/node.c
@@ -306,15 +306,8 @@ dom_exception dom_node_get_node_name(struct dom_node *node,
 		struct dom_string *colon;
 		dom_exception err;
 
-		/* ugh! */
-		/** \todo Assumes little endian */
-		err = dom_string_create_from_const_ptr(node->owner,	
-			(const uint8_t *) (
-				(dom_document_get_charset(node->owner) == 
-					DOM_STRING_UTF8) ? ":" : ":\0"),
-			(dom_document_get_charset(node->owner) == 
-					DOM_STRING_UTF8) ? 1 : 2,
-			&colon);
+		err = dom_document_create_string(node->owner, 
+				(const uint8_t *) ":", SLEN(":"), &colon);
 		if (err != DOM_NO_ERR) {
 			return err;
 		}
@@ -1639,7 +1632,7 @@ bool _dom_node_readonly(const struct dom_node *node)
  * \param previous  Previous node in sibling list, or NULL if none
  * \param next      Next node in sibling list, or NULL if none
  */
-inline void _dom_node_attach(struct dom_node *node, struct dom_node *parent, 
+void _dom_node_attach(struct dom_node *node, struct dom_node *parent, 
 		struct dom_node *previous, struct dom_node *next)
 {
 	_dom_node_attach_range(node, node, parent, previous, next);
@@ -1650,7 +1643,7 @@ inline void _dom_node_attach(struct dom_node *node, struct dom_node *parent,
  *
  * \param node  The node to detach
  */
-inline void _dom_node_detach(struct dom_node *node)
+void _dom_node_detach(struct dom_node *node)
 {
 	_dom_node_detach_range(node, node);
 }
@@ -1666,7 +1659,7 @@ inline void _dom_node_detach(struct dom_node *node)
  *
  * The range is assumed to be a linked list of sibling nodes.
  */
-inline void _dom_node_attach_range(struct dom_node *first, 
+void _dom_node_attach_range(struct dom_node *first, 
 		struct dom_node *last,
 		struct dom_node *parent, 
 		struct dom_node *previous, 
@@ -1697,7 +1690,7 @@ inline void _dom_node_attach_range(struct dom_node *first,
  *
  * The range is assumed to be a linked list of sibling nodes.
  */
-inline void _dom_node_detach_range(struct dom_node *first, 
+void _dom_node_detach_range(struct dom_node *first, 
 		struct dom_node *last)
 {
 	if (first->previous != NULL)
@@ -1727,7 +1720,7 @@ inline void _dom_node_detach_range(struct dom_node *first,
  * we want to perform any special replacement-related behaviour 
  * at a later date.
  */
-inline void _dom_node_replace(struct dom_node *old,
+void _dom_node_replace(struct dom_node *old,
 		struct dom_node *replacement)
 {
 	struct dom_node *first, *last;
diff --git a/src/core/string.c b/src/core/string.c
index 8ec44aa..2540e26 100644
--- a/src/core/string.c
+++ b/src/core/string.c
@@ -9,62 +9,37 @@
 #include <inttypes.h>
 #include <string.h>
 
+#include <parserutils/charset/utf8.h>
+
 #include <dom/core/string.h>
 
 #include "core/document.h"
 #include "utils/utils.h"
-#include "utils/utf8.h"
-#include "utils/utf16.h"
 
 /**
  * A DOM string
  *
- * DOM strings store either a pointer to allocated data, a pointer
- * to constant data or an offset into a document buffer.
- *
- * They are reference counted so freeing is performed correctly.
+ * Strings are reference counted so destruction is performed correctly.
  */
 struct dom_string {
-	enum { DOM_STRING_PTR,
-	       DOM_STRING_CONST_PTR,
-	       DOM_STRING_OFFSET,
-	       DOM_STRING_PTR_NODOC
-	} type;				/**< String type */
-
-	dom_string_charset charset;	/**< Charset of string */
-
-	union {
-		uint8_t *ptr;
-		const uint8_t *cptr;
-		uint32_t offset;
-	} data;				/**< Type-specific data */
+	uint8_t *ptr;			/**< Pointer to string data */
 
 	size_t len;			/**< Byte length of string */
 
-	union {
-		struct dom_document *doc;	/**< Owning document */
-		struct {
-			dom_alloc alloc;	/**< Memory (de)allocation
-						 * function */
-			void *pw;	/**< Client-specific data */
-		} nodoc;
-	} ctx;				/**< Allocation context */
+	dom_alloc alloc;		/**< Memory (de)allocation function */
+	void *pw;			/**< Client-specific data */
 
 	uint32_t refcnt;		/**< Reference count */
 };
 
 static struct dom_string empty_string = { 
-	.type = DOM_STRING_CONST_PTR,
-	.charset = DOM_STRING_UTF8,
-	.data.ptr = NULL,
+	.ptr = NULL,
 	.len = 0,
-	.ctx.doc = NULL,
+	.alloc = NULL,
+	.pw = NULL,
 	.refcnt = 1
 };
 
-static dom_exception __dom_string_get_data(struct dom_string *str,
-		const uint8_t **data, size_t *len);
-
 /**
  * Claim a reference on a DOM string
  *
@@ -86,155 +61,18 @@ void dom_string_ref(struct dom_string *str)
 void dom_string_unref(struct dom_string *str)
 {
 	if (--str->refcnt == 0) {
-		if (str->type == DOM_STRING_PTR_NODOC) {
-			str->ctx.nodoc.alloc(str->data.ptr, 0,
-					str->ctx.nodoc.pw);
-
-			str->ctx.nodoc.alloc(str, 0, str->ctx.nodoc.pw);
-		} else {
-			if (str->type == DOM_STRING_PTR) {
-				dom_document_alloc(str->ctx.doc,
-						str->data.ptr, 0);
-			}
-
-			dom_document_alloc(str->ctx.doc, str, 0);
+		if (str->alloc != NULL) {
+			str->alloc(str->ptr, 0, str->pw);
+			str->alloc(str, 0, str->pw);
 		}
 	}
 }
 
-/**
- * Create a DOM string from an offset into the document buffer
- *
- * \param doc  The document in which the string resides
- * \param off  Offset from start of document buffer
- * \param len  Length, in bytes, of string
- * \param str  Pointer to location to receive pointer to new string
- * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
- *
- * The returned string will already be referenced, so there is no need
- * to explicitly reference it.
- */
-dom_exception dom_string_create_from_off(struct dom_document *doc,
-		uint32_t off, size_t len, struct dom_string **str)
-{
-	struct dom_string *ret;
-
-	ret = dom_document_alloc(doc, NULL, sizeof(struct dom_string));
-	if (ret == NULL)
-		return DOM_NO_MEM_ERR;
-
-	ret->type = DOM_STRING_OFFSET;
-
-	ret->charset = dom_document_get_charset(doc);
-
-	ret->data.offset = off;
-
-	ret->len = len;
-
-	ret->ctx.doc = doc;
-
-	ret->refcnt = 1;
-
-	*str = ret;
-
-	return DOM_NO_ERR;
-}
-
 /**
  * Create a DOM string from a string of characters
  *
- * \param doc  The document in which the string resides
- * \param ptr  Pointer to string of characters
- * \param len  Length, in bytes, of string of characters
- * \param str  Pointer to location to receive pointer to new string
- * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
- *
- * The returned string will already be referenced, so there is no need
- * to explicitly reference it.
- *
- * The string of characters passed in will be copied for use by the
- * returned DOM string.
- */
-dom_exception dom_string_create_from_ptr(struct dom_document *doc,
-		const uint8_t *ptr, size_t len, struct dom_string **str)
-{
-	struct dom_string *ret;
-
-	ret = dom_document_alloc(doc, NULL, sizeof(struct dom_string));
-	if (ret == NULL)
-		return DOM_NO_MEM_ERR;
-
-	ret->data.ptr = dom_document_alloc(doc, NULL, len);
-	if (ret->data.ptr == NULL) {
-		dom_document_alloc(doc, ret, 0);
-		return DOM_NO_MEM_ERR;
-	}
-
-	ret->type = DOM_STRING_PTR;
-
-	ret->charset = dom_document_get_charset(doc);
-
-	memcpy(ret->data.ptr, ptr, len);
-
-	ret->len = len;
-
-	ret->ctx.doc = doc;
-
-	ret->refcnt = 1;
-
-	*str = ret;
-
-	return DOM_NO_ERR;
-}
-
-/**
- * Create a DOM string from a constant string of characters
- *
- * \param doc  The document in which the string resides
- * \param ptr  Pointer to string of characters
- * \param len  Length, in bytes, of string of characters
- * \param str  Pointer to location to receive pointer to new string
- * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
- *
- * The returned string will already be referenced, so there is no need
- * to explicitly reference it.
- *
- * The string of characters passed in will _not_ be copied for use by the
- * returned DOM string.
- */
-dom_exception dom_string_create_from_const_ptr(struct dom_document *doc,
-		const uint8_t *ptr, size_t len, struct dom_string **str)
-{
-	struct dom_string *ret;
-
-	ret = dom_document_alloc(doc, NULL, sizeof(struct dom_string));
-	if (ret == NULL)
-		return DOM_NO_MEM_ERR;
-
-	ret->type = DOM_STRING_CONST_PTR;
-
-	ret->charset = dom_document_get_charset(doc);
-
-	ret->data.cptr = ptr;
-
-	ret->len = len;
-
-	ret->ctx.doc = doc;
-
-	ret->refcnt = 1;
-
-	*str = ret;
-
-	return DOM_NO_ERR;
-}
-
-/**
- * Create a DOM string from a string of characters that does not belong
- * to a document
- *
  * \param alloc    Memory (de)allocation function
  * \param pw       Pointer to client-specific private data
- * \param charset  The charset of the string
  * \param ptr      Pointer to string of characters
  * \param len      Length, in bytes, of string of characters
  * \param str      Pointer to location to receive result
@@ -243,12 +81,11 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc,
  * The returned string will already be referenced, so there is no need
  * to explicitly reference it.
  *
- * The string of characters passed in will be copied for use by the
+ * The string of characters passed in will be copied for use by the 
  * returned DOM string.
  */
-dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw,
-		dom_string_charset charset, const uint8_t *ptr, size_t len, 
-		struct dom_string **str)
+dom_exception dom_string_create(dom_alloc alloc, void *pw,
+		const uint8_t *ptr, size_t len, struct dom_string **str)
 {
 	struct dom_string *ret;
 
@@ -256,22 +93,18 @@ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw,
 	if (ret == NULL)
 		return DOM_NO_MEM_ERR;
 
-	ret->data.ptr = alloc(NULL, len, pw);
-	if (ret->data.ptr == NULL) {
+	ret->ptr = alloc(NULL, len, pw);
+	if (ret->ptr == NULL) {
 		alloc(ret, 0, pw);
 		return DOM_NO_MEM_ERR;
 	}
 
-	ret->type = DOM_STRING_PTR_NODOC;
-
-	ret->charset = charset;
-
-	memcpy(ret->data.ptr, ptr, len);
+	memcpy(ret->ptr, ptr, len);
 
 	ret->len = len;
 
-	ret->ctx.nodoc.alloc = alloc;
-	ret->ctx.nodoc.pw = pw;
+	ret->alloc = alloc;
+	ret->pw = pw;
 
 	ret->refcnt = 1;
 
@@ -291,48 +124,16 @@ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw,
  */
 int dom_string_cmp(struct dom_string *s1, struct dom_string *s2)
 {
-	const uint8_t *d1 = NULL;
-	const uint8_t *d2 = NULL;
-	size_t l1, l2;
-	dom_exception err;
-
-	err = __dom_string_get_data(s1, &d1, &l1);
-	if (err != DOM_NO_ERR)
-		return 1; /* arbitrary */
+	if (s1 == NULL)
+		s1 = &empty_string;
 
-	err = __dom_string_get_data(s2, &d2, &l2);
-	if (err != DOM_NO_ERR)
-		return 1; /* arbitrary */
+	if (s2 == NULL)
+		s2 = &empty_string;
 
-	while (l1 > 0 && l2 > 0) {
-		uint32_t c1, c2;
-		size_t cl1, cl2;
-		charset_error err;
-
-		err = (s1->charset == DOM_STRING_UTF8) 
-				? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) 
-				: _dom_utf16_to_ucs4(d1, l1, &c1, &cl1);
-		if (err != CHARSET_OK) {
-		}
+	if (s1->len != s2->len)
+		return 1;
 
-		err = (s2->charset == DOM_STRING_UTF8)
-				? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2)
-				: _dom_utf16_to_ucs4(d2, l2, &c2, &cl2);
-		if (err != CHARSET_OK) {
-		}
-
-		if (c1 != c2) {
-			return (int)(c1 - c2);
-		}
-
-		d1 += cl1;
-		d2 += cl2;
-
-		l1 -= cl1;
-		l2 -= cl2;
-	}
-
-	return (int)(l1 - l2);
+	return memcmp(s1->ptr, s2->ptr, s1->len);
 }
 
 /**
@@ -349,31 +150,28 @@ int dom_string_icmp(struct dom_string *s1, struct dom_string *s2)
 	const uint8_t *d1 = NULL;
 	const uint8_t *d2 = NULL;
 	size_t l1, l2;
-	dom_exception err;
 
-	err = __dom_string_get_data(s1, &d1, &l1);
-	if (err != DOM_NO_ERR)
-		return 1; /* arbitrary */
+	if (s1 == NULL)
+		s1 = &empty_string;
+	if (s2 == NULL)
+		s2 = &empty_string;
 
-	err = __dom_string_get_data(s2, &d2, &l2);
-	if (err != DOM_NO_ERR)
-		return 1; /* arbitrary */
+	d1 = s1->ptr;
+	d2 = s2->ptr;
+	l1 = s1->len;
+	l2 = s2->len;
 
 	while (l1 > 0 && l2 > 0) {
 		uint32_t c1, c2;
 		size_t cl1, cl2;
-		charset_error err;
+		parserutils_error err;
 
-		err = (s1->charset == DOM_STRING_UTF8) 
-				? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) 
-				: _dom_utf16_to_ucs4(d1, l1, &c1, &cl1);
-		if (err != CHARSET_OK) {
+		err = parserutils_charset_utf8_to_ucs4(d1, l1, &c1, &cl1); 
+		if (err != PARSERUTILS_OK) {
 		}
 
-		err = (s2->charset == DOM_STRING_UTF8)
-				? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2)
-				: _dom_utf16_to_ucs4(d2, l2, &c2, &cl2);
-		if (err != CHARSET_OK) {
+		err = parserutils_charset_utf8_to_ucs4(d2, l2, &c2, &cl2);
+		if (err != PARSERUTILS_OK) {
 		}
 
 		/** \todo improved lower-casing algorithm */
@@ -403,20 +201,19 @@ uint32_t dom_string_index(struct dom_string *str, uint32_t chr)
 	const uint8_t *s;
 	size_t clen, slen;
 	uint32_t c, index;
-	charset_error err;
+	parserutils_error err;
 
-	__dom_string_get_data(str, &s, &slen);
+	if (str == NULL)
+		str = &empty_string;
+
+	s = str->ptr;
+	slen = str->len;
 
 	index = 0;
 
 	while (slen > 0) {
-		if (str->charset == DOM_STRING_UTF8) {
-			err = _dom_utf8_to_ucs4(s, slen, &c, &clen);
-		} else {
-			err = _dom_utf16_to_ucs4(s, slen, &c, &clen);
-		}
-
-		if (err != CHARSET_OK) {
+		err = parserutils_charset_utf8_to_ucs4(s, slen, &c, &clen);
+		if (err != PARSERUTILS_OK) {
 			return (uint32_t) -1;
 		}
 
@@ -444,28 +241,25 @@ uint32_t dom_string_rindex(struct dom_string *str, uint32_t chr)
 	const uint8_t *s;
 	size_t clen, slen;
-	uint32_t c, index;
-	charset_error err;
+	uint32_t c, index, prev;
+	parserutils_error err;
+
+	if (str == NULL)
+		str = &empty_string;
 
-	__dom_string_get_data(str, &s, &slen);
+	s = str->ptr;
+	slen = str->len;
 
 	index = dom_string_length(str);
 
 	while (slen > 0) {
-		if (str->charset == DOM_STRING_UTF8) {
-			err = _dom_utf8_prev(s, slen, &clen);
-			if (err == CHARSET_OK) {
-				err = _dom_utf8_to_ucs4(s + clen, slen - clen, 
-						&c, &clen);
-			}
-		} else {
-			err = _dom_utf16_prev(s, slen, &clen);
-			if (err == CHARSET_OK) {
-				err = _dom_utf16_to_ucs4(s + clen, slen - clen,
-						&c, &clen);
-			}
+		/* utf8_prev wants a uint32_t out-param; clen is a size_t */
+		err = parserutils_charset_utf8_prev(s, slen, &prev);
+		if (err == PARSERUTILS_OK) {
+			err = parserutils_charset_utf8_to_ucs4(s + prev,
+					slen - prev, &c, &clen);
 		}
 
-		if (err != CHARSET_OK) {
+		if (err != PARSERUTILS_OK) {
 			return (uint32_t) -1;
 		}
 
@@ -478,7 +272,6 @@ uint32_t dom_string_rindex(struct dom_string *str, uint32_t chr)
 	}
 
 	return (uint32_t) -1;
-
 }
 
 /**
@@ -489,20 +282,14 @@ uint32_t dom_string_rindex(struct dom_string *str, uint32_t chr)
  */
 uint32_t dom_string_length(struct dom_string *str)
 {
-	const uint8_t *s;
-	size_t slen;
-	uint32_t clen;
-	charset_error err;
-
-	__dom_string_get_data(str, &s, &slen);
+	size_t clen;
+	parserutils_error err;
 
-	if (str->charset == DOM_STRING_UTF8) {
-		err = _dom_utf8_length(s, slen, &clen);
-	} else {
-		err = _dom_utf16_length(s, slen, &clen);
-	}
+	if (str == NULL)
+		str = &empty_string;
 
-	if (err != CHARSET_OK) {
+	err = parserutils_charset_utf8_length(str->ptr, str->len, &clen);
+	if (err != PARSERUTILS_OK) {
 		return 0;
 	}
 
@@ -527,60 +314,28 @@ dom_exception dom_string_concat(struct dom_string *s1, struct dom_string *s2,
 		struct dom_string **result)
 {
 	struct dom_string *concat;
-	const uint8_t *s;
-	size_t slen;
 
-	if (s1->type == DOM_STRING_PTR_NODOC) {
-		concat = s1->ctx.nodoc.alloc(NULL, 
-				sizeof(struct dom_string), s1->ctx.nodoc.pw);
-	} else {
-		concat = dom_document_alloc(s1->ctx.doc, 
-				NULL, sizeof(struct dom_string));
-	}
+	concat = s1->alloc(NULL, sizeof(struct dom_string), s1->pw);
 
 	if (concat == NULL) {
 		return DOM_NO_MEM_ERR;
 	}
 
-	/** \todo support attempted concatenation of mismatched charsets */
+	concat->ptr = s1->alloc(NULL, s1->len + s2->len, s1->pw);
+	if (concat->ptr == NULL) {
+		s1->alloc(concat, 0, s1->pw);
 
-	if (s1->type == DOM_STRING_PTR_NODOC) {
-		concat->data.ptr = s1->ctx.nodoc.alloc(NULL, 
-				s1->len + s2->len, s1->ctx.nodoc.pw);
-	} else {
-		concat->data.ptr = dom_document_alloc(s1->ctx.doc, 
-				NULL, s1->len + s2->len);
-	}
-	if (concat->data.ptr == NULL) {
-		if (s1->type == DOM_STRING_PTR_NODOC) {
-			s1->ctx.nodoc.alloc(concat, 0, s1->ctx.nodoc.pw);
-		} else {
-			dom_document_alloc(s1->ctx.doc, concat, 0);
-		}
 		return DOM_NO_MEM_ERR;
 	}
 
-	concat->type = (s1->type == DOM_STRING_PTR_NODOC) 
-			? DOM_STRING_PTR_NODOC : DOM_STRING_PTR;
-
-	concat->charset = s1->charset;
-
-	__dom_string_get_data(s1, &s, &slen);
+	memcpy(concat->ptr, s1->ptr, s1->len);
 
-	memcpy(concat->data.ptr, s, slen);
-
-	__dom_string_get_data(s2, &s, &slen);
-
-	memcpy(concat->data.ptr + s1->len, s, slen);
+	memcpy(concat->ptr + s1->len, s2->ptr, s2->len);
 
 	concat->len = s1->len + s2->len;
 
-	if (concat->type == DOM_STRING_PTR_NODOC) {
-		concat->ctx.nodoc.alloc = s1->ctx.nodoc.alloc;
-		concat->ctx.nodoc.pw = s1->ctx.nodoc.pw;
-	} else {
-		concat->ctx.doc = s1->ctx.doc;
-	}
+	concat->alloc = s1->alloc;
+	concat->pw = s1->pw;
 
 	concat->refcnt = 1;
 
@@ -607,12 +362,10 @@ dom_exception dom_string_concat(struct dom_string *s1, struct dom_string *s2,
 dom_exception dom_string_substr(struct dom_string *str, 
 		uint32_t i1, uint32_t i2, struct dom_string **result)
 {
-	const uint8_t *s;
-	size_t slen;
-	size_t b1, b2;
-	charset_error err;
-
-	__dom_string_get_data(str, &s, &slen);
+	const uint8_t *s = str->ptr;
+	size_t slen = str->len;
+	uint32_t b1, b2;
+	parserutils_error err;
 
 	/* Initialise the byte index of the start to 0 */
 	b1 = 0;
@@ -621,13 +374,9 @@ dom_exception dom_string_substr(struct dom_string *str,
 
 	/* Calculate the byte index of the start */
 	while (i1 > 0) {
-		if (str->charset == DOM_STRING_UTF8) {
-			err = _dom_utf8_next(s, slen - b1, b1, &b1);
-		} else {
-			err = _dom_utf16_next(s, slen - b1, b1, &b1);
-		}
-
-		if (err != CHARSET_OK) {
+		/* len is the buffer's end offset, not the bytes remaining */
+		err = parserutils_charset_utf8_next(s, slen, b1, &b1);
+		if (err != PARSERUTILS_OK) {
 			return DOM_NO_MEM_ERR;
 		}
 
@@ -639,13 +388,10 @@ dom_exception dom_string_substr(struct dom_string *str,
 
 	/* Calculate the byte index of the end */
 	while (i2 > 0) {
-		if (str->charset == DOM_STRING_UTF8) {
-			err = _dom_utf8_next(s, slen - b2, b2, &b2);
-		} else {
-			err = _dom_utf16_next(s, slen - b2, b2, &b2);
-		}
+		/* len is the buffer's end offset, not the bytes remaining */
+		err = parserutils_charset_utf8_next(s, slen, b2, &b2);
 
-		if (err != CHARSET_OK) {
+		if (err != PARSERUTILS_OK) {
 			return DOM_NO_MEM_ERR;
 		}
 
@@ -653,14 +399,7 @@ dom_exception dom_string_substr(struct dom_string *str,
 	}
 
 	/* Create a string from the specified byte range */
-	return (str->type == DOM_STRING_PTR_NODOC)
-			? dom_string_create_from_ptr_no_doc(
-					str->ctx.nodoc.alloc,
-					str->ctx.nodoc.pw,
-					str->charset, 
-					s + b1, b2 - b1, result)
-			: dom_string_create_from_ptr(str->ctx.doc,
-					s + b1, b2 - b1, result);
+	return dom_string_create(str->alloc, str->pw, s + b1, b2 - b1, result);
 }
 
 /**
@@ -688,11 +427,12 @@ dom_exception dom_string_insert(struct dom_string *target,
 	const uint8_t *t, *s;
 	uint32_t tlen, slen, clen;
 	uint32_t ins = 0;
-	charset_error err;
-
-	__dom_string_get_data(target, &t, &tlen);
+	parserutils_error err;
 
-	__dom_string_get_data(source, &s, &slen);
+	t = target->ptr;
+	tlen = target->len;
+	s = source->ptr;
+	slen = source->len;
 
 	clen = dom_string_length(target);
 
@@ -706,13 +446,10 @@ dom_exception dom_string_insert(struct dom_string *target,
 		ins = tlen;
 	} else {
 		while (offset > 0) {
-			if (target->charset == DOM_STRING_UTF8) {
-				err = _dom_utf8_next(t, tlen - ins, ins, &ins);
-			} else {
-				err = _dom_utf16_next(t, tlen - ins, ins, &ins);
-			}
+			/* len is the end offset of t, not bytes remaining */
+			err = parserutils_charset_utf8_next(t, tlen, ins, &ins);
 
-			if (err != CHARSET_OK) {
+			if (err != PARSERUTILS_OK) {
 				return DOM_NO_MEM_ERR;
 			}
 
@@ -721,65 +458,36 @@ dom_exception dom_string_insert(struct dom_string *target,
 	}
 
 	/* Allocate result string */
-	if (target->type == DOM_STRING_PTR_NODOC) {
-		res = target->ctx.nodoc.alloc(NULL, sizeof(struct dom_string), 
-				target->ctx.nodoc.pw);
-	} else {
-		res = dom_document_alloc(target->ctx.doc, 
-				NULL, sizeof(struct dom_string));
-	}
-
+	res = target->alloc(NULL, sizeof(struct dom_string), target->pw);
 	if (res == NULL) {
 		return DOM_NO_MEM_ERR;
 	}
 
-	/** \todo support insertion of a string from a different charset  */
-
 	/* Allocate data buffer for result contents */
-	if (target->type == DOM_STRING_PTR_NODOC) {
-		res->data.ptr = target->ctx.nodoc.alloc(NULL, 
-				tlen + slen, target->ctx.nodoc.pw);
-	} else {
-		res->data.ptr = dom_document_alloc(target->ctx.doc, 
-				NULL, tlen + slen);
-	}
-	if (res->data.ptr == NULL) {
-		if (target->type == DOM_STRING_PTR_NODOC) {
-			target->ctx.nodoc.alloc(res, 0, target->ctx.nodoc.pw);
-		} else {
-			dom_document_alloc(target->ctx.doc, res, 0);
-		}
+	res->ptr = target->alloc(NULL, tlen + slen, target->pw);
+	if (res->ptr == NULL) {
+		target->alloc(res, 0, target->pw);
 		return DOM_NO_MEM_ERR;
 	}
 
-	/* Populate result members */
-	res->type = (target->type == DOM_STRING_PTR_NODOC) 
-			? DOM_STRING_PTR_NODOC : DOM_STRING_PTR;
-
-	res->charset = target->charset;
-
 	/* Copy initial portion of target, if any, into result */
 	if (ins > 0) {
-		memcpy(res->data.ptr, t, ins);
+		memcpy(res->ptr, t, ins);
 	}
 
 	/* Copy inserted data into result */
-	memcpy(res->data.ptr + ins, s, slen);
+	memcpy(res->ptr + ins, s, slen);
 
 	/* Copy remainder of target, if any, into result */
 	if (tlen - ins > 0) {
-		memcpy(res->data.ptr + ins + slen, t + ins, tlen - ins);
+		memcpy(res->ptr + ins + slen, t + ins, tlen - ins);
 	}
 
 	res->len = tlen + slen;
 
-	if (res->type == DOM_STRING_PTR_NODOC) {
-		res->ctx.nodoc.alloc = target->ctx.nodoc.alloc;
-		res->ctx.nodoc.pw = target->ctx.nodoc.pw;
-	} else {
-		res->ctx.doc = target->ctx.doc;
-	}
-
+	res->alloc = target->alloc;
+	res->pw = target->pw;
+	
 	res->refcnt = 1;
 
 	*result = res;
@@ -811,11 +519,12 @@ dom_exception dom_string_replace(struct dom_string *target,
 	const uint8_t *t, *s;
 	uint32_t tlen, slen;
 	uint32_t b1, b2;
-	charset_error err;
-
-	__dom_string_get_data(target, &t, &tlen);
+	parserutils_error err;
 
-	__dom_string_get_data(source, &s, &slen);
+	t = target->ptr;
+	tlen = target->len;
+	s = source->ptr;
+	slen = source->len;
 
 	/* Initialise the byte index of the start to 0 */
 	b1 = 0;
@@ -824,13 +533,9 @@ dom_exception dom_string_replace(struct dom_string *target,
 
 	/* Calculate the byte index of the start */
 	while (i1 > 0) {
-		if (target->charset == DOM_STRING_UTF8) {
-			err = _dom_utf8_next(s, slen - b1, b1, &b1);
-		} else {
-			err = _dom_utf16_next(s, slen - b1, b1, &b1);
-		}
+		err = parserutils_charset_utf8_next(s, slen - b1, b1, &b1);
 
-		if (err != CHARSET_OK) {
+		if (err != PARSERUTILS_OK) {
 			return DOM_NO_MEM_ERR;
 		}
 
@@ -842,13 +547,9 @@ dom_exception dom_string_replace(struct dom_string *target,
 
 	/* Calculate the byte index of the end */
 	while (i2 > 0) {
-		if (target->charset == DOM_STRING_UTF8) {
-			err = _dom_utf8_next(s, slen - b2, b2, &b2);
-		} else {
-			err = _dom_utf16_next(s, slen - b2, b2, &b2);
-		}
+		err = parserutils_charset_utf8_next(s, slen - b2, b2, &b2);
 
-		if (err != CHARSET_OK) {
+		if (err != PARSERUTILS_OK) {
 			return DOM_NO_MEM_ERR;
 		}
 
@@ -856,66 +557,38 @@ dom_exception dom_string_replace(struct dom_string *target,
 	}
 
 	/* Allocate result string */
-	if (target->type == DOM_STRING_PTR_NODOC) {
-		res = target->ctx.nodoc.alloc(NULL, sizeof(struct dom_string), 
-				target->ctx.nodoc.pw);
-	} else {
-		res = dom_document_alloc(target->ctx.doc, 
-				NULL, sizeof(struct dom_string));
-	}
+	res = target->alloc(NULL, sizeof(struct dom_string), target->pw);
 
 	if (res == NULL) {
 		return DOM_NO_MEM_ERR;
 	}
 
-	/** \todo support insertion of a string from a different charset  */
-
 	/* Allocate data buffer for result contents */
-	if (target->type == DOM_STRING_PTR_NODOC) {
-		res->data.ptr = target->ctx.nodoc.alloc(NULL, 
-				tlen + slen - (b2 - b1), target->ctx.nodoc.pw);
-	} else {
-		res->data.ptr = dom_document_alloc(target->ctx.doc, 
-				NULL, tlen + slen - (b2 - b1));
-	}
-	if (res->data.ptr == NULL) {
-		if (target->type == DOM_STRING_PTR_NODOC) {
-			target->ctx.nodoc.alloc(res, 0, target->ctx.nodoc.pw);
-		} else {
-			dom_document_alloc(target->ctx.doc, res, 0);
-		}
+	res->ptr = target->alloc(NULL, tlen + slen - (b2 - b1), target->pw);
+	if (res->ptr == NULL) {
+		target->alloc(res, 0, target->pw);
 		return DOM_NO_MEM_ERR;
 	}
 
-	/* Populate result members */
-	res->type = (target->type == DOM_STRING_PTR_NODOC) 
-			? DOM_STRING_PTR_NODOC : DOM_STRING_PTR;
-
-	res->charset = target->charset;
-
 	/* Copy initial portion of target, if any, into result */
 	if (b1 > 0) {
-		memcpy(res->data.ptr, t, b1);
+		memcpy(res->ptr, t, b1);
 	}
 
 	/* Copy replacement data into result */
 	if (slen > 0) {
-		memcpy(res->data.ptr + b1, s, slen);
+		memcpy(res->ptr + b1, s, slen);
 	}
 
 	/* Copy remainder of target, if any, into result */
 	if (tlen - b2 > 0) {
-		memcpy(res->data.ptr + b1 + slen, t + b2, tlen - b2);
+		memcpy(res->ptr + b1 + slen, t + b2, tlen - b2);
 	}
 
 	res->len = tlen + slen - (b2 - b1);
 
-	if (res->type == DOM_STRING_PTR_NODOC) {
-		res->ctx.nodoc.alloc = target->ctx.nodoc.alloc;
-		res->ctx.nodoc.pw = target->ctx.nodoc.pw;
-	} else {
-		res->ctx.doc = target->ctx.doc;
-	}
+	res->alloc = target->alloc;
+	res->pw = target->pw;
 
 	res->refcnt = 1;
 
@@ -940,19 +613,8 @@ dom_exception dom_string_replace(struct dom_string *target,
 dom_exception dom_string_dup(struct dom_string *str, 
 		struct dom_string **result)
 {
-	const uint8_t *s;
-	size_t slen;
-
-	__dom_string_get_data(str, &s, &slen);
-
-	return str->type == DOM_STRING_PTR_NODOC 
-			? dom_string_create_from_ptr_no_doc(
-				str->ctx.nodoc.alloc,
-				str->ctx.nodoc.pw,
-				str->charset,
-				s, slen, result) 
-			: dom_string_create_from_ptr(str->ctx.doc,
-					s, slen, result);
+	return dom_string_create(str->alloc, str->pw, str->ptr, str->len, 
+			result);
 }
 
 /**
@@ -963,12 +625,10 @@ dom_exception dom_string_dup(struct dom_string *str,
  */
 uint32_t dom_string_hash(struct dom_string *str)
 {
-	const uint8_t *s;
-	size_t slen;
+	const uint8_t *s = str->ptr;
+	size_t slen = str->len;
 	uint32_t hash = 0x01000193;
 
-	__dom_string_get_data(str, &s, &slen);
-
 	while (slen > 0) {
 		hash *= 0x01000193;
 		hash ^= *s;
@@ -980,47 +640,3 @@ uint32_t dom_string_hash(struct dom_string *str)
 	return hash;
 }
 
-/*                                                                           */
-/*---------------------------------------------------------------------------*/
-/*                                                                           */
-
-/**
- * Get a pointer to the string of characters within a DOM string
- *
- * \param str   Pointer to DOM string to retrieve pointer from
- * \param data  Pointer to location to receive data
- * \param len   Pointer to location to receive byte length of data
- * \return DOM_NO_ERR on success
- *
- * The caller must have previously claimed a reference on the DOM string.
- * The returned pointer must not be freed.
- */
-dom_exception __dom_string_get_data(struct dom_string *str,
-		const uint8_t **data, size_t *len)
-{
-	/* Assume that a NULL str pointer indicates the empty string */
-	if (str == NULL)
-		str = &empty_string;
-
-	switch (str->type) {
-	case DOM_STRING_PTR:
-		*data = str->data.ptr;
-		break;
-	case DOM_STRING_CONST_PTR:
-		*data = str->data.cptr;
-		break;
-	case DOM_STRING_OFFSET:
-		*data = dom_document_get_base(str->ctx.doc) +
-				str->data.offset;
-		break;
-	case DOM_STRING_PTR_NODOC:
-		*data = str->data.ptr;
-		break;
-	}
-
-	*len = str->len;
-
-	return DOM_NO_ERR;
-}
-
-
diff --git a/src/utils/Makefile b/src/utils/Makefile
index ac87ded..29369ae 100644
--- a/src/utils/Makefile
+++ b/src/utils/Makefile
@@ -22,7 +22,7 @@
 CFLAGS += -I$(CURDIR)
 
 # Objects
-OBJS = namespace utf8 utf16
+OBJS = namespace
 
 .PHONY: clean debug distclean export release setup test
 
diff --git a/src/utils/namespace.c b/src/utils/namespace.c
index 9c0d214..8002b8e 100644
--- a/src/utils/namespace.c
+++ b/src/utils/namespace.c
@@ -32,14 +32,13 @@ dom_exception _dom_namespace_initialise(dom_alloc alloc, void *pw)
 {
 	dom_exception err;
 
-	err = dom_string_create_from_ptr_no_doc(alloc, pw,
-		DOM_STRING_UTF8, (const uint8_t *) "xml", SLEN("xml"), &xml);
+	err = dom_string_create(alloc, pw,
+		(const uint8_t *) "xml", SLEN("xml"), &xml);
 	if (err != DOM_NO_ERR) {
 		return err;
 	}
 
-	err = dom_string_create_from_ptr_no_doc(alloc, pw,
-		DOM_STRING_UTF8,
+	err = dom_string_create(alloc, pw,
 		(const uint8_t *) "http://www.w3.org/XML/1998/namespace", 
 		SLEN("http://www.w3.org/XML/1998/namespace"),
 		&xml_ns);
@@ -48,8 +47,7 @@ dom_exception _dom_namespace_initialise(dom_alloc alloc, void *pw)
 		return err;
 	}
 
-	err = dom_string_create_from_ptr_no_doc(alloc, pw,
-		DOM_STRING_UTF8, 
+	err = dom_string_create(alloc, pw,
 		(const uint8_t *) "xmlns", SLEN("xmlns"), &xmlns);
 	if (err != DOM_NO_ERR) {
 		dom_string_unref(xml_ns);
@@ -57,8 +55,7 @@ dom_exception _dom_namespace_initialise(dom_alloc alloc, void *pw)
 		return err;
 	}
 
-	err = dom_string_create_from_ptr_no_doc(alloc, pw,
-		DOM_STRING_UTF8,
+	err = dom_string_create(alloc, pw,
 		(const uint8_t *) "http://www.w3.org/2000/xmlns",
 		SLEN("http://www.w3.org/2000/xmlns"),
 		&xmlns_ns);
diff --git a/src/utils/utf16.c b/src/utils/utf16.c
deleted file mode 100644
index 8917328..0000000
--- a/src/utils/utf16.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- *                http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-/** \file
- * UTF-16 manipulation functions (implementation).
- */
-
-#include <stdbool.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "utils/utf16.h"
-
-/**
- * Convert a UTF-16 sequence into a single UCS4 character
- *
- * \param s     The sequence to process
- * \param len   Length of sequence
- * \param ucs4  Pointer to location to receive UCS4 character (host endian)
- * \param clen  Pointer to location to receive byte length of UTF-16 sequence
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_to_ucs4(const uint8_t *s, size_t len,
-		uint32_t *ucs4, size_t *clen)
-{
-	const uint16_t *ss = (const uint16_t *) (const void *) s;
-
-	if (s == NULL || ucs4 == NULL || clen == NULL)
-		return CHARSET_BADPARM;
-
-	if (len < 2)
-		return CHARSET_NEEDDATA;
-
-	if (*ss < 0xD800 || *ss > 0xDFFF) {
-		*ucs4 = *ss;
-		*clen = 2;
-	} else if (0xD800 <= *ss && *ss <= 0xBFFF) {
-		if (len < 4)
-			return CHARSET_NEEDDATA;
-
-		if (0xDC00 <= ss[1] && ss[1] <= 0xE000) {
-			*ucs4 = (((s[0] >> 6) & 0x1f) + 1) |
-					((s[0] & 0x3f) | (s[1] & 0x3ff));
-			*clen = 4;
-		} else {
-			return CHARSET_INVALID;
-		}
-	}
-
-	return CHARSET_OK;
-}
-
-/**
- * Convert a single UCS4 character into a UTF-16 sequence
- *
- * \param ucs4  The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
- * \param s     Pointer to 4 byte long output buffer
- * \param len   Pointer to location to receive length of multibyte sequence
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
-		size_t *len)
-{
-	uint16_t *ss = (uint16_t *) (void *) s;
-	uint32_t l = 0;
-
-	if (s == NULL || len == NULL)
-		return CHARSET_BADPARM;
-	else if (ucs4 < 0x10000) {
-		*ss = (uint16_t) ucs4;
-		l = 2;
-	} else if (ucs4 < 0x110000) {
-		ss[0] = 0xD800 | (((ucs4 >> 16) & 0x1f) - 1) | (ucs4 >> 10);
-		ss[1] = 0xDC00 | (ucs4 & 0x3ff);
-		l = 4;
-	} else {
-		return CHARSET_INVALID;
-	}
-
-	*len = l;
-
-	return CHARSET_OK;
-}
-
-/**
- * Calculate the length (in characters) of a bounded UTF-16 string
- *
- * \param s    The string
- * \param max  Maximum length
- * \param len  Pointer to location to receive length of string
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_length(const uint8_t *s, size_t max,
-		size_t *len)
-{
-	const uint16_t *ss = (const uint16_t *) (const void *) s;
-	const uint16_t *end = (const uint16_t *) (const void *) (s + max);
-	int l = 0;
-
-	if (s == NULL || len == NULL)
-		return CHARSET_BADPARM;
-
-	while (ss < end) {
-		if (*ss < 0xD800 || 0xDFFF < *ss)
-			ss++;
-		else
-			ss += 2;
-
-		l++;
-	}
-
-	*len = l;
-
-	return CHARSET_OK;
-}
-
-/**
- * Calculate the length (in bytes) of a UTF-16 character
- *
- * \param s    Pointer to start of character
- * \param len  Pointer to location to receive length
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_char_byte_length(const uint8_t *s,
-		size_t *len)
-{
-	const uint16_t *ss = (const uint16_t *) (const void *) s;
-
-	if (s == NULL || len == NULL)
-		return CHARSET_BADPARM;
-
-	if (*ss < 0xD800 || 0xDFFF < *ss)
-		*len = 2;
-	else
-		*len = 4;
-
-	return CHARSET_OK;
-}
-
-/**
- * Find previous legal UTF-16 char in string
- *
- * \param s        The string
- * \param off      Offset in the string to start at
- * \param prevoff  Pointer to location to receive offset of first byte of
- *                 previous legal character
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_prev(const uint8_t *s, uint32_t off,
-		uint32_t *prevoff)
-{
-	const uint16_t *ss = (const uint16_t *) (const void *) s;
-
-	if (s == NULL || prevoff == NULL)
-		return CHARSET_BADPARM;
-
-	if (off < 2)
-		*prevoff = 0;
-	else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF)
-		*prevoff = off - 2;
-	else
-		*prevoff = (off < 4) ? 0 : off - 4;
-
-	return CHARSET_OK;
-}
-
-/**
- * Find next legal UTF-16 char in string
- *
- * \param s        The string (assumed valid)
- * \param len      Maximum offset in string
- * \param off      Offset in the string to start at
- * \param nextoff  Pointer to location to receive offset of first byte of
- *                 next legal character
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_next(const uint8_t *s, uint32_t len,
-		uint32_t off, uint32_t *nextoff)
-{
-	const uint16_t *ss = (const uint16_t *) (const void *) s;
-
-	if (s == NULL || off >= len || nextoff == NULL)
-		return CHARSET_BADPARM;
-
-	if (len - off < 4)
-		*nextoff = len;
-	else if (ss[1] < 0xD800 || ss[1] > 0xDBFF)
-		*nextoff = off + 2;
-	else
-		*nextoff = (len - off < 6) ? len : off + 4;
-
-	return CHARSET_OK;
-}
-
-/**
- * Find next legal UTF-16 char in string
- *
- * \param s        The string (assumed to be of dubious validity)
- * \param len      Maximum offset in string
- * \param off      Offset in the string to start at
- * \param nextoff  Pointer to location to receive offset of first byte of
- *                 next legal character
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_next_paranoid(const uint8_t *s,
-		uint32_t len, uint32_t off, uint32_t *nextoff)
-{
-	const uint16_t *ss = (const uint16_t *) (const void *) s;
-
-	if (s == NULL || off >= len || nextoff == NULL)
-		return CHARSET_BADPARM;
-
-	while (1) {
-		if (len - off < 4) {
-			return CHARSET_NEEDDATA;
-		} else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) {
-			*nextoff = off + 2;
-			break;
-		} else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) {
-			if (len - off < 6)
-				return CHARSET_NEEDDATA;
-
-			if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) {
-				*nextoff = off + 4;
-				break;
-			} else {
-				ss++;
-				off += 2;
-			}
-		}
-	}
-
-	return CHARSET_OK;
-}
-
diff --git a/src/utils/utf16.h b/src/utils/utf16.h
deleted file mode 100644
index 7b9e15f..0000000
--- a/src/utils/utf16.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * This file is part of libdom.
- * Licensed under the MIT License,
- *                http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-/** \file
- * UTF-16 manipulation functions (interface).
- */
-
-#ifndef dom_utils_utf16_h_
-#define dom_utils_utf16_h_
-
-#include <inttypes.h>
-
-#include "utils/charset_errors.h"
-
-inline charset_error _dom_utf16_to_ucs4(const uint8_t *s, size_t len,
-		uint32_t *ucs4, size_t *clen);
-inline charset_error _dom_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
-		size_t *len);
-
-inline charset_error _dom_utf16_length(const uint8_t *s, size_t max,
-		size_t *len);
-inline charset_error _dom_utf16_char_byte_length(const uint8_t *s,
-		size_t *len);
-
-inline charset_error _dom_utf16_prev(const uint8_t *s, uint32_t off,
-		uint32_t *prevoff);
-inline charset_error _dom_utf16_next(const uint8_t *s, uint32_t len,
-		uint32_t off, uint32_t *nextoff);
-
-inline charset_error _dom_utf16_next_paranoid(const uint8_t *s,
-		uint32_t len, uint32_t off, uint32_t *nextoff);
-
-#endif
-
diff --git a/src/utils/utf8.c b/src/utils/utf8.c
deleted file mode 100644
index b80f04e..0000000
--- a/src/utils/utf8.c
+++ /dev/null
@@ -1,368 +0,0 @@
-/*
- * This file is part of libdom.
- * Licensed under the MIT License,
- *                http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-/** \file
- * UTF-8 manipulation functions (implementation).
- */
-
-#include <stdbool.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "utils/utf8.h"
-
-/** Number of continuation bytes for a given start byte */
-static const uint8_t numContinuations[256] = {
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-	3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
-};
-
-/**
- * Convert a UTF-8 multibyte sequence into a single UCS4 character
- *
- * Encoding of UCS values outside the UTF-16 plane has been removed from
- * RFC3629. This function conforms to RFC2279, however.
- *
- * \param s     The sequence to process
- * \param len   Length of sequence
- * \param ucs4  Pointer to location to receive UCS4 character (host endian)
- * \param clen  Pointer to location to receive byte length of UTF-8 sequence
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_to_ucs4(const uint8_t *s, size_t len,
-		uint32_t *ucs4, size_t *clen)
-{
-	if (s == NULL || ucs4 == NULL || clen == NULL)
-		return CHARSET_BADPARM;
-
-	if (len == 0)
-		return CHARSET_NEEDDATA;
-
-	if (*s < 0x80) {
-		*ucs4 = *s;
-		*clen = 1;
-	} else if ((*s & 0xE0) == 0xC0) {
-		if (len < 2)
-			return CHARSET_NEEDDATA;
-		else if ((*(s+1) & 0xC0) != 0x80)
-			return CHARSET_INVALID;
-		else {
-			*ucs4 = ((*s & 0x1F) << 6) | (*(s+1) & 0x3F);
-			*clen = 2;
-		}
-	} else if ((*s & 0xF0) == 0xE0) {
-		if (len < 3)
-			return CHARSET_NEEDDATA;
-		else if ((*(s+1) & 0xC0) != 0x80 ||
-				(*(s+2) & 0xC0) != 0x80)
-			return CHARSET_INVALID;
-		else {
-			*ucs4 = ((*s & 0x0F) << 12) |
-				((*(s+1) & 0x3F) << 6) |
-				(*(s+2) & 0x3F);
-			*clen = 3;
-		}
-	} else if ((*s & 0xF8) == 0xF0) {
-		if (len < 4)
-			return CHARSET_NEEDDATA;
-		else if ((*(s+1) & 0xC0) != 0x80 ||
-				(*(s+2) & 0xC0) != 0x80 ||
-				(*(s+3) & 0xC0) != 0x80)
-			return CHARSET_INVALID;
-		else {
-			*ucs4 = ((*s & 0x0F) << 18) |
-				((*(s+1) & 0x3F) << 12) |
-				((*(s+2) & 0x3F) << 6) |
-				(*(s+3) & 0x3F);
-			*clen = 4;
-		}
-	} else if ((*s & 0xFC) == 0xF8) {
-		if (len < 5)
-			return CHARSET_NEEDDATA;
-		else if ((*(s+1) & 0xC0) != 0x80 ||
-				(*(s+2) & 0xC0) != 0x80 ||
-				(*(s+3) & 0xC0) != 0x80 ||
-				(*(s+4) & 0xC0) != 0x80)
-			return CHARSET_INVALID;
-		else {
-			*ucs4 = ((*s & 0x0F) << 24) |
-				((*(s+1) & 0x3F) << 18) |
-				((*(s+2) & 0x3F) << 12) |
-				((*(s+3) & 0x3F) << 6) |
-				(*(s+4) & 0x3F);
-			*clen = 5;
-		}
-	} else if ((*s & 0xFE) == 0xFC) {
-		if (len < 6)
-			return CHARSET_NEEDDATA;
-		else if ((*(s+1) & 0xC0) != 0x80 ||
-				(*(s+2) & 0xC0) != 0x80 ||
-				(*(s+3) & 0xC0) != 0x80 ||
-				(*(s+4) & 0xC0) != 0x80 ||
-				(*(s+5) & 0xC0) != 0x80)
-			return CHARSET_INVALID;
-		else {
-			*ucs4 = ((*s & 0x0F) << 28) |
-				((*(s+1) & 0x3F) << 24) |
-				((*(s+2) & 0x3F) << 18) |
-				((*(s+3) & 0x3F) << 12) |
-				((*(s+4) & 0x3F) << 6) |
-				(*(s+5) & 0x3F);
-			*clen = 6;
-		}
-	} else {
-		return CHARSET_INVALID;
-	}
-
-	return CHARSET_OK;
-}
-
-/**
- * Convert a single UCS4 character into a UTF-8 multibyte sequence
- *
- * Encoding of UCS values outside the UTF-16 plane has been removed from
- * RFC3629. This function conforms to RFC2279, however.
- *
- * \param ucs4  The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
- * \param s     Pointer to 6 byte long output buffer
- * \param len   Pointer to location to receive length of multibyte sequence
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
-		size_t *len)
-{
-	uint32_t l = 0;
-
-	if (s == NULL || len == NULL)
-		return CHARSET_BADPARM;
-	else if (ucs4 < 0x80) {
-		*s = (uint8_t) ucs4;
-		l = 1;
-	} else if (ucs4 < 0x800) {
-		*s = 0xC0 | ((ucs4 >> 6) & 0x1F);
-		*(s+1) = 0x80 | (ucs4 & 0x3F);
-		l = 2;
-	} else if (ucs4 < 0x10000) {
-		*s = 0xE0 | ((ucs4 >> 12) & 0xF);
-		*(s+1) = 0x80 | ((ucs4 >> 6) & 0x3F);
-		*(s+2) = 0x80 | (ucs4 & 0x3F);
-		l = 3;
-	} else if (ucs4 < 0x200000) {
-		*s = 0xF0 | ((ucs4 >> 18) & 0x7);
-		*(s+1) = 0x80 | ((ucs4 >> 12) & 0x3F);
-		*(s+2) = 0x80 | ((ucs4 >> 6) & 0x3F);
-		*(s+3) = 0x80 | (ucs4 & 0x3F);
-		l = 4;
-	} else if (ucs4 < 0x4000000) {
-		*s = 0xF8 | ((ucs4 >> 24) & 0x3);
-		*(s+1) = 0x80 | ((ucs4 >> 18) & 0x3F);
-		*(s+2) = 0x80 | ((ucs4 >> 12) & 0x3F);
-		*(s+3) = 0x80 | ((ucs4 >> 6) & 0x3F);
-		*(s+4) = 0x80 | (ucs4 & 0x3F);
-		l = 5;
-	} else if (ucs4 <= 0x7FFFFFFF) {
-		*s = 0xFC | ((ucs4 >> 30) & 0x1);
-		*(s+1) = 0x80 | ((ucs4 >> 24) & 0x3F);
-		*(s+2) = 0x80 | ((ucs4 >> 18) & 0x3F);
-		*(s+3) = 0x80 | ((ucs4 >> 12) & 0x3F);
-		*(s+4) = 0x80 | ((ucs4 >> 6) & 0x3F);
-		*(s+5) = 0x80 | (ucs4 & 0x3F);
-		l = 6;
-	} else {
-		return CHARSET_INVALID;
-	}
-
-	*len = l;
-
-	return CHARSET_OK;
-}
-
-/**
- * Calculate the length (in characters) of a bounded UTF-8 string
- *
- * \param s    The string
- * \param max  Maximum length
- * \param len  Pointer to location to receive length of string
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_length(const uint8_t *s, size_t max,
-		size_t *len)
-{
-	const uint8_t *end = s + max;
-	int l = 0;
-
-	if (s == NULL || len == NULL)
-		return CHARSET_BADPARM;
-
-	while (s < end) {
-		if ((*s & 0x80) == 0x00)
-			s += 1;
-		else if ((*s & 0xE0) == 0xC0)
-			s += 2;
-		else if ((*s & 0xF0) == 0xE0)
-			s += 3;
-		else if ((*s & 0xF8) == 0xF0)
-			s += 4;
-		else if ((*s & 0xFC) == 0xF8)
-			s += 5;
-		else if ((*s & 0xFE) == 0xFC)
-			s += 6;
-		else
-			return CHARSET_INVALID;
-		l++;
-	}
-
-	*len = l;
-
-	return CHARSET_OK;
-}
-
-/**
- * Calculate the length (in bytes) of a UTF-8 character
- *
- * \param s    Pointer to start of character
- * \param len  Pointer to location to receive length
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_char_byte_length(const uint8_t *s,
-		size_t *len)
-{
-	if (s == NULL || len == NULL)
-		return CHARSET_BADPARM;
-
-	*len = numContinuations[s[0]] + 1 /* Start byte */;
-
-	return CHARSET_OK;
-}
-
-/**
- * Find previous legal UTF-8 char in string
- *
- * \param s        The string
- * \param off      Offset in the string to start at
- * \param prevoff  Pointer to location to receive offset of first byte of
- *                 previous legal character
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_prev(const uint8_t *s, uint32_t off,
-		uint32_t *prevoff)
-{
-	if (s == NULL || prevoff == NULL)
-		return CHARSET_BADPARM;
-
-	while (off != 0 && (s[--off] & 0xC0) == 0x80)
-		/* do nothing */;
-
-	*prevoff = off;
-
-	return CHARSET_OK;
-}
-
-/**
- * Find next legal UTF-8 char in string
- *
- * \param s        The string (assumed valid)
- * \param len      Maximum offset in string
- * \param off      Offset in the string to start at
- * \param nextoff  Pointer to location to receive offset of first byte of
- *                 next legal character
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_next(const uint8_t *s, uint32_t len,
-		uint32_t off, uint32_t *nextoff)
-{
-	if (s == NULL || off >= len || nextoff == NULL)
-		return CHARSET_BADPARM;
-
-	/* Skip current start byte (if present - may be mid-sequence) */
-	if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0)
-		off++;
-
-	while (off < len && (s[off] & 0xC0) == 0x80)
-		off++;
-
-	*nextoff = off;
-
-	return CHARSET_OK;
-}
-
-/**
- * Find next legal UTF-8 char in string
- *
- * \param s        The string (assumed to be of dubious validity)
- * \param len      Maximum offset in string
- * \param off      Offset in the string to start at
- * \param nextoff  Pointer to location to receive offset of first byte of
- *                 next legal character
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_next_paranoid(const uint8_t *s, uint32_t len,
-		uint32_t off, uint32_t *nextoff)
-{
-	bool valid;
-
-	if (s == NULL || off >= len || nextoff == NULL)
-		return CHARSET_BADPARM;
-
-	/* Skip current start byte (if present - may be mid-sequence) */
-	if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0)
-		off++;
-
-	while (1) {
-		/* Find next possible start byte */
-		while (off < len && (s[off] & 0xC0) == 0x80)
-			off++;
-
-		/* Ran off end of data */
-		if (off == len || off + numContinuations[s[off]] >= len)
-			return CHARSET_NEEDDATA;
-
-		/* Found if start byte is ascii,
-		 * or next n bytes are valid continuations */
-		valid = true;
-
-		switch (numContinuations[s[off]]) {
-		case 5:
-			valid &= ((s[off + 5] & 0xC0) == 0x80);
-		case 4:
-			valid &= ((s[off + 4] & 0xC0) == 0x80);
-		case 3:
-			valid &= ((s[off + 3] & 0xC0) == 0x80);
-		case 2:
-			valid &= ((s[off + 2] & 0xC0) == 0x80);
-		case 1:
-			valid &= ((s[off + 1] & 0xC0) == 0x80);
-		case 0:
-			valid &= (s[off + 0] < 0x80);
-		}
-
-		if (valid)
-			break;
-
-		/* Otherwise, skip this (invalid) start byte and try again */
-		off++;
-	}
-
-	*nextoff = off;
-
-	return CHARSET_OK;
-}
-
diff --git a/src/utils/utf8.h b/src/utils/utf8.h
deleted file mode 100644
index 154dbb8..0000000
--- a/src/utils/utf8.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * This file is part of libdom.
- * Licensed under the MIT License,
- *                http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-/** \file
- * UTF-8 manipulation functions (interface).
- */
-
-#ifndef dom_utils_utf8_h_
-#define dom_utils_utf8_h_
-
-#include <inttypes.h>
-
-#include "utils/charset_errors.h"
-
-inline charset_error _dom_utf8_to_ucs4(const uint8_t *s, size_t len,
-		uint32_t *ucs4, size_t *clen);
-inline charset_error _dom_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
-		size_t *len);
-
-inline charset_error _dom_utf8_length(const uint8_t *s, size_t max,
-		size_t *len);
-inline charset_error _dom_utf8_char_byte_length(const uint8_t *s,
-		size_t *len);
-
-inline charset_error _dom_utf8_prev(const uint8_t *s, uint32_t off,
-		uint32_t *prevoff);
-inline charset_error _dom_utf8_next(const uint8_t *s, uint32_t len,
-		uint32_t off, uint32_t *nextoff);
-
-inline charset_error _dom_utf8_next_paranoid(const uint8_t *s, uint32_t len,
-		uint32_t off, uint32_t *nextoff);
-
-#endif
-
-- 
cgit v1.2.3