From f3c02943d778e9b00064bf0e103aaecb06ab5e01 Mon Sep 17 00:00:00 2001
From: John Mark Bell <jmb@netsurf-browser.org>
Date: Wed, 3 Oct 2007 23:44:32 +0000
Subject: Make the dom string class more useful. Purge all trace of
 dom_string_get_data() from outside the dom string implementation. Port
 affected code to new, more useful, APIs. This also fixes the interned node
 name strings mentioned in the previous commit.

svn path=/trunk/dom/; revision=3621
---
 src/core/attr.c     | 104 +++----------
 src/core/document.c | 184 ++++++++++++++++-------
 src/core/document.h |   5 +
 src/core/element.c  |   2 +-
 src/core/node.c     |  48 +++---
 src/core/string.c   | 422 ++++++++++++++++++++++++++++++++++++++++++++++------
 6 files changed, 564 insertions(+), 201 deletions(-)

(limited to 'src/core')

diff --git a/src/core/attr.c b/src/core/attr.c
index 232f7ba..a82f117 100644
--- a/src/core/attr.c
+++ b/src/core/attr.c
@@ -177,120 +177,62 @@ dom_exception dom_attr_get_value(struct dom_attr *attr,
 {
 	struct dom_node *a = (struct dom_node *) attr;
 	struct dom_node *c;
-	uint8_t *rep;
-	size_t rep_len;
-	size_t rep_alloc;
+	struct dom_string *value, *temp;
 	dom_exception err;
 
-#define CHUNK 128
-
-	rep = dom_document_alloc(a->owner, NULL, CHUNK);
-	if (rep == NULL)
-		return DOM_NO_MEM_ERR;
-
-	rep_len = 0;
-	rep_alloc = CHUNK;
+	err = dom_string_create_from_const_ptr(a->owner, 
+			(const uint8_t *) "", SLEN(""), &value);
+	if (err != DOM_NO_ERR) {
+		return err;
+	}
 
 	/* Traverse children, building a string representation as we go */
 	for (c = a->first_child; c != NULL; c = c->next) {
 		if (c->type == DOM_TEXT_NODE && c->value != NULL) {
-			const uint8_t *data;
-			size_t len;
-
-			err = dom_string_get_data(c->value, &data, &len);
+			/* Append to existing value */
+			err = dom_string_concat(value, c->value, &temp);
 			if (err != DOM_NO_ERR) {
-				dom_document_alloc(a->owner, rep, 0);
+				dom_string_unref(value);
 				return err;
 			}
 
-			/* Extend buffer, if necessary */
-			if (rep_len + len >= rep_alloc) {
-				uint8_t *temp;
-				size_t required = (rep_len + len) - rep_alloc;
-
-				/* Round required up to a chunk boundary */
-				required = 
-					(required + CHUNK - 1) & ~(CHUNK - 1);
-
-				temp = dom_document_alloc(a->owner, rep, 
-						rep_alloc + required);
-				if (temp == NULL) {
-					dom_document_alloc(a->owner, rep, 0);
-					return DOM_NO_MEM_ERR;
-				}
-
-				rep = temp;
-				rep_alloc += required;
-			}
-
-			/* Copy text into buffer */
-			memcpy(rep + rep_len, data, len);
+			/* Finished with previous value */
+			dom_string_unref(value);
 
-			/* And fix up length information */
-			rep_len += len;
+			/* Claim new value */
+			value = temp;
 		} else if (c->type == DOM_ENTITY_REFERENCE_NODE) {
 			struct dom_string *tr;
-			const uint8_t *data;
-			size_t len;
 
 			/* Get textual representation of entity */
 			err = dom_entity_reference_get_textual_representation(
 					(struct dom_entity_reference *) c,
 					&tr);
 			if (err != DOM_NO_ERR) {
-				dom_document_alloc(a->owner, rep, 0);
+				dom_string_unref(value);
 				return err;
 			}
 
-			err = dom_string_get_data(tr, &data, &len);
+			/* Append to existing value */
+			err = dom_string_concat(value, tr, &temp);
 			if (err != DOM_NO_ERR) {
 				dom_string_unref(tr);
-				dom_document_alloc(a->owner, rep, 0);
+				dom_string_unref(value);
 				return err;
 			}
 
-			/* Extend buffer, if necessary */
-			if (rep_len + len >= rep_alloc) {
-				uint8_t *temp;
-				size_t required = (rep_len + len) - rep_alloc;
-
-				/* Round required up to a chunk boundary */
-				required = 
-					(required + CHUNK - 1) & ~(CHUNK - 1);
-
-				temp = dom_document_alloc(a->owner, rep, 
-						rep_alloc + required);
-				if (temp == NULL) {
-					dom_document_alloc(a->owner, rep, 0);
-					return DOM_NO_MEM_ERR;
-				}
-
-				rep = temp;
-				rep_alloc += required;
-			}
-
-			/* Copy text into buffer */
-			memcpy(rep + rep_len, data, len);
-
-			/* And fix up length information */
-			rep_len += len;
-
 			/* No longer need textual representation */
 			dom_string_unref(tr);
-		}
-	}
 
-#undef CHUNK
+			/* Finished with previous value */
+			dom_string_unref(value);
 
-	/* Create DOMString */
-	err = dom_string_create_from_ptr(a->owner, rep, rep_len, result);
-	if (err != DOM_NO_ERR) {
-		dom_document_alloc(a->owner, rep, 0);
-		return err;
+			/* Claim new value */
+			value = temp;
+		}
 	}
 
-	/* Cleanup */
-	dom_document_alloc(a->owner, rep, 0);
+	*result = value;
 
 	return DOM_NO_ERR;
 }
diff --git a/src/core/document.c b/src/core/document.c
index e188868..42d2686 100644
--- a/src/core/document.c
+++ b/src/core/document.c
@@ -64,14 +64,131 @@ struct dom_document {
 
 	struct dom_doc_nnm *maps;	/**< List of active namednodemaps */
 
-	/** Interned node name strings, indexed by node type */
-	/* Index 0 is unused */
-	struct dom_string *nodenames[DOM_NODE_TYPE_COUNT + 1];
+	struct dom_string **nodenames;	/**< Interned nodenames */
 
 	dom_alloc alloc;		/**< Memory (de)allocation function */
 	void *pw;			/**< Pointer to client data */
 };
 
+/** Interned node name strings, one table per charset, indexed by node type */
+/* Index 0 is unused; node types without a fixed name keep a NULL entry */
+static struct dom_string *__nodenames_utf8[DOM_NODE_TYPE_COUNT + 1];
+static struct dom_string *__nodenames_utf16[DOM_NODE_TYPE_COUNT + 1];
+
+/**
+ * Initialise the document module by interning the fixed node names
+ *
+ * \param alloc  Memory (de)allocation function used to create the strings
+ * \param pw     Pointer to client-specific private data
+ * \return DOM_NO_ERR on success, otherwise the string creation error
+ */
+dom_exception _dom_document_initialise(dom_alloc alloc, void *pw)
+{
+	static struct {
+		const char *name;
+		size_t len;
+	} names_utf8[DOM_NODE_TYPE_COUNT + 1] = {
+		{ NULL,			0 },	/* Unused */
+		{ NULL,			0 },	/* Element */
+		{ NULL,			0 },	/* Attr */
+		{ "#text",		5 },	/* Text */
+		{ "#cdata-section",	14 },	/* CDATA section */
+		{ NULL,			0 },	/* Entity reference */
+		{ NULL,			0 },	/* Entity */
+		{ NULL,			0 },	/* Processing instruction */
+		{ "#comment",		8 },	/* Comment */
+		{ "#document",		9 },	/* Document */
+		{ NULL,			0 },	/* Document type */
+		{ "#document-fragment",	18 },	/* Document fragment */
+		{ NULL,			0 }	/* Notation */
+	};
+
+	/** \todo This assumes Little Endian (each code unit is spelt low byte first) */
+	static struct {
+		const char *name;
+		size_t len;
+	} names_utf16[DOM_NODE_TYPE_COUNT + 1] = {
+		{ NULL,			0 },	/* Unused */
+		{ NULL,			0 },	/* Element */
+		{ NULL,			0 },	/* Attr */
+		{ "#\0t\0e\0x\0t\0",	10 },	/* Text */
+		{ "#\0c\0d\0a\0t\0a\0-\0s\0e\0c\0t\0i\0o\0n\0",	28 },	/* CDATA section */
+		{ NULL,			0 },	/* Entity reference */
+		{ NULL,			0 },	/* Entity */
+		{ NULL,			0 },	/* Processing instruction */
+		{ "#\0c\0o\0m\0m\0e\0n\0t\0",		16 },	/* Comment */
+		{ "#\0d\0o\0c\0u\0m\0e\0n\0t\0",		18 },	/* Document */
+		{ NULL,			0 },	/* Document type */
+		{ "#\0d\0o\0c\0u\0m\0e\0n\0t\0-\0f\0r\0a\0g\0m\0e\0n\0t\0",	36 },	/* Document fragment */
+		{ NULL,			0 }	/* Notation */
+	};
+
+	dom_exception err;
+
+	/* Intern every fixed node name twice: once as UTF-8, once as UTF-16 */
+	for (int i = 0; i <= DOM_NODE_TYPE_COUNT; i++) {
+		if (names_utf8[i].name == NULL) {
+			/* Nothing to intern; skip this entry */
+			__nodenames_utf8[i] = NULL;
+			__nodenames_utf16[i] = NULL;
+			continue;
+		}
+
+		/* Make the UTF-8 string; the UTF-16 twin is made further down */
+		err = dom_string_create_from_ptr_no_doc(alloc, pw,
+				DOM_STRING_UTF8,
+				(const uint8_t *) names_utf8[i].name,
+				names_utf8[i].len, &__nodenames_utf8[i]);
+		if (err != DOM_NO_ERR) {
+			/* Failed, unref the pairs made so far (slots are not reset to NULL) */
+			for (int j = 0; j < i; j++) {
+				if (__nodenames_utf8[j] != NULL) {
+					dom_string_unref(__nodenames_utf8[j]);
+					dom_string_unref(__nodenames_utf16[j]);
+				}
+			}
+			return err;
+		}
+
+		err = dom_string_create_from_ptr_no_doc(alloc, pw,
+				DOM_STRING_UTF16,
+				(const uint8_t *) names_utf16[i].name,
+				names_utf16[i].len, &__nodenames_utf16[i]);
+		if (err != DOM_NO_ERR) {
+			/* Failed, unref the pairs made so far (slots are not reset to NULL) */
+			for (int j = 0; j < i; j++) {
+				if (__nodenames_utf8[j] != NULL) {
+					dom_string_unref(__nodenames_utf8[j]);
+					dom_string_unref(__nodenames_utf16[j]);
+				}
+			}
+
+			dom_string_unref(__nodenames_utf8[i]);	/* UTF-8 half of this pair */
+
+			return err;
+		}
+	}
+
+	return DOM_NO_ERR;
+}
+
+/**
+ * Finalise the document module, unreferencing the interned node names
+ *
+ * \return DOM_NO_ERR (this function has no failure path).
+ */
+dom_exception _dom_document_finalise(void)
+{
+	for (int i = 0; i <= DOM_NODE_TYPE_COUNT; i++) {
+		if (__nodenames_utf8[i] != NULL) {	/* only the UTF-8 slot is tested */
+			dom_string_unref(__nodenames_utf8[i]);
+			dom_string_unref(__nodenames_utf16[i]);
+		}
+	}
+
+	return DOM_NO_ERR;
+}
+
 /**
  * Create a Document
  *
@@ -90,21 +207,6 @@ dom_exception dom_document_create(struct dom_implementation *impl,
 		dom_string_charset charset, dom_alloc alloc, void *pw, 
 		struct dom_document **doc)
 {
-	static const char *names[DOM_NODE_TYPE_COUNT + 1] = {
-		NULL,			/* Unused */
-		NULL,			/* Element */
-		NULL,			/* Attr */
-		"#text",		/* Text */
-		"#cdata-section",	/* CDATA section */
-		NULL,			/* Entity reference */
-		NULL,			/* Entity */
-		NULL,			/* Processing instruction */
-		"#comment",		/* Comment */
-		"#document",		/* Document */
-		NULL,			/* Document type */
-		"#document-fragment",	/* Document fragment */
-		NULL			/* Notation */
-	};
 	struct dom_document *d;
 	dom_exception err;
 
@@ -114,34 +216,9 @@ dom_exception dom_document_create(struct dom_implementation *impl,
 		return DOM_NO_MEM_ERR;
 
 	/* Set up document allocation context - must be first */
-	d->charset = charset;
 	d->alloc = alloc;
 	d->pw = pw;
 
-	/* Initialise interned node names */
-	for (int i = 0; i <= DOM_NODE_TYPE_COUNT; i++) {
-		if (names[i] == NULL) {
-			/* Nothing to intern; skip this entry */
-			d->nodenames[i] = NULL;
-			continue;
-		}
-
-		/* Make string */
-		err = dom_string_create_from_const_ptr(d,
-				(const uint8_t *) names[i],
-				strlen(names[i]), &d->nodenames[i]);
-		if (err != DOM_NO_ERR) {
-			/* Failed, clean up strings we've created so far */
-			for (int j = 0; j < i; j++) {
-				if (d->nodenames[i] != NULL)
-					dom_string_unref(d->nodenames[i]);
-			}
-			/* And destroy document */
-			alloc(d, 0, pw);
-			return err;
-		}
-	}
-
 	/* Initialise base class -- the Document has no parent, so
 	 * destruction will be attempted as soon as its reference count
 	 * reaches zero. Documents own themselves (this simplifies the 
@@ -150,17 +227,13 @@ dom_exception dom_document_create(struct dom_implementation *impl,
 	err = dom_node_initialise(&d->base, d, DOM_DOCUMENT_NODE,
 			NULL, NULL, NULL, NULL);
 	if (err != DOM_NO_ERR) {
-		/* Clean up interned strings */
-		for (int i = 0; i <= DOM_NODE_TYPE_COUNT; i++) {
-			if (d->nodenames[i] != NULL)
-				dom_string_unref(d->nodenames[i]);
-		}
-		/* And document */
+		/* Clean up document */
 		alloc(d, 0, pw);
 		return err;
 	}
 
 	/* Initialise remaining type-specific data */
+	d->charset = charset;
 	if (impl != NULL)
 		dom_implementation_ref(impl);
 	d->impl = impl;
@@ -168,6 +241,9 @@ dom_exception dom_document_create(struct dom_implementation *impl,
 	d->nodelists = NULL;
 	d->maps = NULL;
 
+	d->nodenames = (charset == DOM_STRING_UTF8) ? __nodenames_utf8 
+						    : __nodenames_utf16;
+
 	*doc = d;
 
 	return DOM_NO_ERR;
@@ -224,12 +300,6 @@ void dom_document_destroy(struct dom_document *doc)
 	doc->nodelists = NULL;
 	doc->maps = NULL;
 
-	/* Clean up interned strings */
-	for (int i = 0; i <= DOM_NODE_TYPE_COUNT; i++) {
-		if (doc->nodenames[i] != NULL)
-			dom_string_unref(doc->nodenames[i]);
-	}
-
 	/* Finalise base class */
 	dom_node_finalise(doc, &doc->base);
 
@@ -569,7 +639,7 @@ dom_exception dom_document_create_element_ns(struct dom_document *doc,
 	}
 
 	/* Divide QName into prefix/localname pair */
-	err = _dom_namespace_split_qname(qname, doc, &prefix, &localname);
+	err = _dom_namespace_split_qname(qname, &prefix, &localname);
 	if (err != DOM_NO_ERR) {
 		return err;
 	}
@@ -630,7 +700,7 @@ dom_exception dom_document_create_attribute_ns(struct dom_document *doc,
 	}
 
 	/* Divide QName into prefix/localname pair */
-	err = _dom_namespace_split_qname(qname, doc, &prefix, &localname);
+	err = _dom_namespace_split_qname(qname, &prefix, &localname);
 	if (err != DOM_NO_ERR) {
 		return err;
 	}
diff --git a/src/core/document.h b/src/core/document.h
index 5149f2e..6982b74 100644
--- a/src/core/document.h
+++ b/src/core/document.h
@@ -19,6 +19,11 @@ struct dom_namednodemap;
 struct dom_node;
 struct dom_nodelist;
 
+/* Initialise the document module */
+dom_exception _dom_document_initialise(dom_alloc alloc, void *pw);
+/* Finalise the document module */
+dom_exception _dom_document_finalise(void);
+
 /* Destroy a document */
 void dom_document_destroy(struct dom_document *doc);
 
diff --git a/src/core/element.c b/src/core/element.c
index 2e95a9f..37e3a7e 100644
--- a/src/core/element.c
+++ b/src/core/element.c
@@ -597,7 +597,7 @@ dom_exception dom_element_set_attribute_ns(struct dom_element *element,
 	}
 
 	/* Decompose QName */
-	err = _dom_namespace_split_qname(qname, e->owner, &prefix, &localname);
+	err = _dom_namespace_split_qname(qname, &prefix, &localname);
 	if (err != DOM_NO_ERR) {
 		return err;
 	}
diff --git a/src/core/node.c b/src/core/node.c
index 8eff2ec..2284e4f 100644
--- a/src/core/node.c
+++ b/src/core/node.c
@@ -303,28 +303,43 @@ dom_exception dom_node_get_node_name(struct dom_node *node,
 	if ((node->type == DOM_ELEMENT_NODE ||
 			node->type == DOM_ATTRIBUTE_NODE) &&
 			node->prefix != NULL) {
-		const uint8_t *prefix, *localname;
-		size_t prefix_len, local_len;
+		struct dom_string *colon;
 		dom_exception err;
 
-		dom_string_get_data(node->prefix, &prefix, &prefix_len);
-
-		dom_string_get_data(node->name, &localname, &local_len);
+		/* ugh! */
+		/** \todo Assumes little endian */
+		err = dom_string_create_from_const_ptr(node->owner,	
+			(const uint8_t *) (
+				(dom_document_get_charset(node->owner) == 
+					DOM_STRING_UTF8) ? ":" : ":\0"),
+			(dom_document_get_charset(node->owner) == 
+					DOM_STRING_UTF8) ? 1 : 2,
+			&colon);
+		if (err != DOM_NO_ERR) {
+			return err;
+		}
 
-		uint8_t qname[prefix_len + 1 /* : */ + local_len + 1 /* \0 */];
+		/* Prefix + : */
+		err = dom_string_concat(node->prefix, colon, &node_name);
+		if (err != DOM_NO_ERR) {
+			dom_string_unref(colon);
+			return err;
+		}
 
-		sprintf((char *) qname, "%.*s:%.*s", 
-			prefix_len, (const char *) prefix, 
-			local_len, (const char *) localname);
+		/* Finished with colon */
+		dom_string_unref(colon);
 
-		/* Create the string */
-		err = dom_string_create_from_ptr(node->owner, qname, 
-				prefix_len + 1 + local_len, &node_name);
+		/* Prefix + : + Localname */
+		err = dom_string_concat(node_name, node->name, &colon);
 		if (err != DOM_NO_ERR) {
+			dom_string_unref(node_name);
 			return err;
 		}
 
-		/* QName is referenced on exit from constructor */
+		/* Finished with intermediate node name */
+		dom_string_unref(node_name);
+
+		node_name = colon;
 	} else {
 		dom_string_ref(node->name);
 
@@ -1128,13 +1143,8 @@ dom_exception dom_node_set_prefix(struct dom_node *node,
 
 	/* Set the prefix */
 	if (prefix != NULL) {
-		const uint8_t *data;
-		size_t len;
-
-		dom_string_get_data(prefix, &data, &len);
-
 		/* Empty string is treated as NULL */
-		if (len == 0) {
+		if (dom_string_length(prefix) == 0) {
 			node->prefix = NULL;
 		} else {
 			dom_string_ref(prefix);
diff --git a/src/core/string.c b/src/core/string.c
index faa3c85..1e3817c 100644
--- a/src/core/string.c
+++ b/src/core/string.c
@@ -62,6 +62,9 @@ static struct dom_string empty_string = {
 	.refcnt = 1
 };
 
+static dom_exception __dom_string_get_data(struct dom_string *str,
+		const uint8_t **data, size_t *len);
+
 /**
  * Claim a reference on a DOM string
  *
@@ -277,45 +280,6 @@ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw,
 	return DOM_NO_ERR;
 }
 
-/**
- * Get a pointer to the string of characters within a DOM string
- *
- * \param str   Pointer to DOM string to retrieve pointer from
- * \param data  Pointer to location to receive data
- * \param len   Pointer to location to receive byte length of data
- * \return DOM_NO_ERR on success
- *
- * The caller must have previously claimed a reference on the DOM string.
- * The returned pointer must not be freed.
- */
-dom_exception dom_string_get_data(struct dom_string *str,
-		const uint8_t **data, size_t *len)
-{
-	/* Assume that a NULL str pointer indicates the empty string */
-	if (str == NULL)
-		str = &empty_string;
-
-	switch (str->type) {
-	case DOM_STRING_PTR:
-		*data = str->data.ptr;
-		break;
-	case DOM_STRING_CONST_PTR:
-		*data = str->data.cptr;
-		break;
-	case DOM_STRING_OFFSET:
-		*data = dom_document_get_base(str->ctx.doc) +
-				str->data.offset;
-		break;
-	case DOM_STRING_PTR_NODOC:
-		*data = str->data.ptr;
-		break;
-	}
-
-	*len = str->len;
-
-	return DOM_NO_ERR;
-}
-
 /**
  * Case sensitively compare two DOM strings
  *
@@ -332,11 +296,11 @@ int dom_string_cmp(struct dom_string *s1, struct dom_string *s2)
 	size_t l1, l2;
 	dom_exception err;
 
-	err = dom_string_get_data(s1, &d1, &l1);
+	err = __dom_string_get_data(s1, &d1, &l1);
 	if (err != DOM_NO_ERR)
 		return 1; /* arbitrary */
 
-	err = dom_string_get_data(s2, &d2, &l2);
+	err = __dom_string_get_data(s2, &d2, &l2);
 	if (err != DOM_NO_ERR)
 		return 1; /* arbitrary */
 
@@ -387,11 +351,11 @@ int dom_string_icmp(struct dom_string *s1, struct dom_string *s2)
 	size_t l1, l2;
 	dom_exception err;
 
-	err = dom_string_get_data(s1, &d1, &l1);
+	err = __dom_string_get_data(s1, &d1, &l1);
 	if (err != DOM_NO_ERR)
 		return 1; /* arbitrary */
 
-	err = dom_string_get_data(s2, &d2, &l2);
+	err = __dom_string_get_data(s2, &d2, &l2);
 	if (err != DOM_NO_ERR)
 		return 1; /* arbitrary */
 
@@ -427,3 +391,375 @@ int dom_string_icmp(struct dom_string *s1, struct dom_string *s2)
 	return (int)(l1 - l2);
 }
 
+/**
+ * Get the index of the first occurrence of a character in a dom string
+ *
+ * \param str  The string to search in (its charset field is read directly)
+ * \param chr  UCS4 value to look for
+ * \return Zero-based character index of the match, or -1 if none found or decoding fails
+ */
+uint32_t dom_string_index(struct dom_string *str, uint32_t chr)
+{
+	const uint8_t *s;
+	size_t clen, slen;
+	uint32_t c, index;
+	charset_error err;
+
+	__dom_string_get_data(str, &s, &slen);
+
+	index = 0;
+
+	while (slen > 0) {
+		if (str->charset == DOM_STRING_UTF8) {
+			err = _dom_utf8_to_ucs4(s, slen, &c, &clen);
+		} else {
+			err = _dom_utf16_to_ucs4(s, slen, &c, &clen);
+		}
+
+		if (err != CHARSET_OK) {
+			return (uint32_t) -1;
+		}
+
+		if (c == chr) {
+			return index;
+		}
+
+		s += clen;
+		slen -= clen;
+		index++;
+	}
+
+	return (uint32_t) -1;
+}
+
+/**
+ * Get the index of the last occurrence of a character in a dom string
+ *
+ * \param str  The string to search in (its charset field is read directly)
+ * \param chr  UCS4 value to look for
+ * \return Zero-based character index of the match, or -1 if none found or decoding fails
+ */
+uint32_t dom_string_rindex(struct dom_string *str, uint32_t chr)
+{
+	const uint8_t *s;
+	size_t clen, slen;
+	uint32_t c, index;
+	charset_error err;
+
+	__dom_string_get_data(str, &s, &slen);
+
+	index = dom_string_length(str);
+
+	while (slen > 0) {
+		if (str->charset == DOM_STRING_UTF8) {
+			err = _dom_utf8_prev(s, slen, &clen);
+			if (err == CHARSET_OK) {
+				err = _dom_utf8_to_ucs4(s + clen, slen - clen,
+						&c, &clen);
+			}
+		} else {
+			err = _dom_utf16_prev(s, slen, &clen);
+			if (err == CHARSET_OK) {
+				err = _dom_utf16_to_ucs4(s + clen, slen - clen,
+						&c, &clen);
+			}
+		}
+
+		if (err != CHARSET_OK) {
+			return (uint32_t) -1;
+		}
+
+		/* index holds the number of characters not yet examined; step
+		 * it down to the zero-based position of the one just decoded */
+		index--;
+		if (c == chr) {
+			return index;
+		}
+		slen -= clen;
+	}
+
+	return (uint32_t) -1;
+}
+
+/**
+ * Get the length, in characters, of a dom string
+ *
+ * \param str  The string to measure (its charset field is read directly)
+ * \return The length of the string, in characters; 0 if measuring fails
+ */
+uint32_t dom_string_length(struct dom_string *str)
+{
+	const uint8_t *s;
+	size_t slen;
+	uint32_t clen;
+	charset_error err;
+
+	__dom_string_get_data(str, &s, &slen);
+
+	if (str->charset == DOM_STRING_UTF8) {
+		err = _dom_utf8_length(s, slen, &clen);
+	} else {
+		err = _dom_utf16_length(s, slen, &clen);
+	}
+
+	if (err != CHARSET_OK) {
+		return 0;
+	}
+
+	return clen;
+}
+
+/**
+ * Concatenate two dom strings into a newly allocated string
+ *
+ * \param s1      The first string (must not be NULL; supplies allocator and charset)
+ * \param s2      The second string (must not be NULL; its len field is read directly)
+ * \param result  Pointer to location to receive result
+ * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
+ *
+ * The returned string will be allocated using the allocation details
+ * stored in ::s1, and takes its charset from ::s1 as well.
+ *
+ * The returned string is created with a reference count of one. The client
+ * should unreference it once it has finished with it.
+ */
+dom_exception dom_string_concat(struct dom_string *s1, struct dom_string *s2,
+		struct dom_string **result)
+{
+	struct dom_string *concat;
+	const uint8_t *s;
+	size_t slen;
+
+	if (s1->type == DOM_STRING_PTR_NODOC) {
+		concat = s1->ctx.nodoc.alloc(NULL, 
+				sizeof(struct dom_string), s1->ctx.nodoc.pw);
+	} else {
+		concat = dom_document_alloc(s1->ctx.doc, 
+				NULL, sizeof(struct dom_string));
+	}
+
+	if (concat == NULL) {
+		return DOM_NO_MEM_ERR;
+	}
+
+	/** \todo support attempted concatenation of mismatched charsets */
+
+	if (s1->type == DOM_STRING_PTR_NODOC) {
+		concat->data.ptr = s1->ctx.nodoc.alloc(NULL, 
+				s1->len + s2->len, s1->ctx.nodoc.pw);
+	} else {
+		concat->data.ptr = dom_document_alloc(s1->ctx.doc, 
+				NULL, s1->len + s2->len);
+	}
+	if (concat->data.ptr == NULL) {
+		if (s1->type == DOM_STRING_PTR_NODOC) {
+			s1->ctx.nodoc.alloc(concat, 0, s1->ctx.nodoc.pw);
+		} else {
+			dom_document_alloc(s1->ctx.doc, concat, 0);
+		}
+		return DOM_NO_MEM_ERR;
+	}
+
+	concat->type = (s1->type == DOM_STRING_PTR_NODOC) 
+			? DOM_STRING_PTR_NODOC : DOM_STRING_PTR;
+
+	concat->charset = s1->charset;
+
+	__dom_string_get_data(s1, &s, &slen);
+
+	memcpy(concat->data.ptr, s, slen);
+
+	__dom_string_get_data(s2, &s, &slen);
+
+	memcpy(concat->data.ptr + s1->len, s, slen);
+
+	concat->len = s1->len + s2->len;
+
+	if (concat->type == DOM_STRING_PTR_NODOC) {
+		concat->ctx.nodoc.alloc = s1->ctx.nodoc.alloc;
+		concat->ctx.nodoc.pw = s1->ctx.nodoc.pw;
+	} else {
+		concat->ctx.doc = s1->ctx.doc;
+	}
+
+	concat->refcnt = 1;
+
+	*result = concat;
+
+	return DOM_NO_ERR;
+}
+
+/**
+ * Extract a substring from a dom string
+ *
+ * \param str     The string to extract from (its charset field is read directly)
+ * \param i1      The character index of the start of the substring
+ * \param i2      The character index one past the end of the substring (not checked against i1)
+ * \param result  Pointer to location to receive result
+ * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion or if stepping over a character fails
+ *
+ * The returned string will be allocated using the allocation details
+ * stored in ::str.
+ *
+ * The returned string will have its reference count increased. The client
+ * should unreference it once it has finished with it.
+ */
+dom_exception dom_string_substr(struct dom_string *str, 
+		uint32_t i1, uint32_t i2, struct dom_string **result)
+{
+	const uint8_t *s;
+	size_t slen;
+	size_t b1, b2;
+	charset_error err;
+
+	__dom_string_get_data(str, &s, &slen);
+
+	/* Initialise the byte index of the start to 0 */
+	b1 = 0;
+	/* Make the end a character count relative to the start */
+	i2 -= i1;
+
+	/* Calculate the byte index of the start */
+	while (i1 > 0) {
+		if (str->charset == DOM_STRING_UTF8) {
+			err = _dom_utf8_next(s, slen, b1, &b1);
+		} else {
+			err = _dom_utf16_next(s, slen, b1, &b1);
+		}
+
+		if (err != CHARSET_OK) {
+			return DOM_NO_MEM_ERR;
+		}
+
+		i1--;
+	}
+
+	/* Initialise the byte index of the end to that of the start */
+	b2 = b1;
+
+	/* Calculate the byte index of the end */
+	while (i2 > 0) {
+		if (str->charset == DOM_STRING_UTF8) {
+			err = _dom_utf8_next(s, slen, b2, &b2);
+		} else {
+			err = _dom_utf16_next(s, slen, b2, &b2);
+		}
+
+		if (err != CHARSET_OK) {
+			return DOM_NO_MEM_ERR;
+		}
+
+		i2--;
+	}
+
+	/* Create a string from the byte range [b1, b2) in both branches */
+	return (str->type == DOM_STRING_PTR_NODOC)
+			? dom_string_create_from_ptr_no_doc(
+					str->ctx.nodoc.alloc,
+					str->ctx.nodoc.pw,
+					str->charset, 
+					s + b1, b2 - b1, result)
+			: dom_string_create_from_ptr(str->ctx.doc,
+					s + b1, b2 - b1, result);
+}
+
+/**
+ * Duplicate a dom string by creating a new string from its bytes
+ *
+ * \param str     The string to duplicate (its type and charset are read directly)
+ * \param result  Pointer to location to receive result
+ * \return The status of the underlying string creation call (DOM_NO_ERR on success)
+ *
+ * The returned string will be allocated using the allocation details
+ * stored in ::str.
+ *
+ * The returned string will have its reference count increased. The client
+ * should unreference it once it has finished with it.
+ */
+dom_exception dom_string_dup(struct dom_string *str, 
+		struct dom_string **result)
+{
+	const uint8_t *s;
+	size_t slen;
+
+	__dom_string_get_data(str, &s, &slen);
+
+	return str->type == DOM_STRING_PTR_NODOC 
+			? dom_string_create_from_ptr_no_doc(
+				str->ctx.nodoc.alloc,
+				str->ctx.nodoc.pw,
+				str->charset,
+				s, slen, result) 
+			: dom_string_create_from_ptr(str->ctx.doc,
+					s, slen, result);
+}
+
+/**
+ * Calculate a hash value from the raw bytes of a dom string
+ *
+ * \param str  The string to calculate a hash of (NULL hashes as the empty string)
+ * \return The hash value associated with the string
+ */
+uint32_t dom_string_hash(struct dom_string *str)
+{
+	const uint8_t *s;
+	size_t slen;
+	uint32_t hash = 0x01000193;	/* seed; the same constant is the multiplier below */
+
+	__dom_string_get_data(str, &s, &slen);
+
+	while (slen > 0) {
+		hash *= 0x01000193;	/* multiply first, then fold in the byte */
+		hash ^= *s;
+
+		s++;
+		slen--;
+	}
+
+	return hash;
+}
+
+/*                                                                           */
+/*---------------------------------------------------------------------------*/
+/*                                                                           */
+
+/**
+ * Get a pointer to the string of characters within a DOM string
+ *
+ * \param str   Pointer to DOM string to retrieve pointer from (NULL means the empty string)
+ * \param data  Pointer to location to receive data
+ * \param len   Pointer to location to receive byte length of data
+ * \return DOM_NO_ERR (this function has no failure path)
+ *
+ * The caller must have previously claimed a reference on the DOM string.
+ * The returned pointer must not be freed.
+ */
+dom_exception __dom_string_get_data(struct dom_string *str,
+		const uint8_t **data, size_t *len)
+{
+	/* Assume that a NULL str pointer indicates the empty string */
+	if (str == NULL)
+		str = &empty_string;
+
+	switch (str->type) {	/* no default: *data is left unset for any other type */
+	case DOM_STRING_PTR:
+		*data = str->data.ptr;
+		break;
+	case DOM_STRING_CONST_PTR:
+		*data = str->data.cptr;
+		break;
+	case DOM_STRING_OFFSET:
+		*data = dom_document_get_base(str->ctx.doc) +
+				str->data.offset;
+		break;
+	case DOM_STRING_PTR_NODOC:
+		*data = str->data.ptr;
+		break;
+	}
+
+	*len = str->len;
+
+	return DOM_NO_ERR;
+}
+
+
-- 
cgit v1.2.3