1 files changed, 337 insertions, 21 deletions
diff --git a/src/core/string.c b/src/core/string.c
index 806531a..e35f416 100644
--- a/src/core/string.c
+++ b/src/core/string.c
@@ -3,16 +3,17 @@
  * Licensed under the MIT License,
  *                http://www.opensource.org/licenses/mit-license.php
  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ * Copyright 2009 Bo Yang <struggleyb.nku@gmail.com>
  */
 
+#include <assert.h>
 #include <ctype.h>
 #include <inttypes.h>
 #include <string.h>
 
 #include <parserutils/charset/utf8.h>
 
-#include <dom/core/string.h>
-
+#include "core/string.h"
 #include "core/document.h"
 #include "utils/utils.h"
 
@@ -26,6 +27,10 @@ struct dom_string {
 
 	size_t len;			/**< Byte length of string */
 
+	lwc_string *intern;		/**< The lwc_string of this string */
+
+	lwc_context *context;	/**< The lwc_context for the lwc_string */
+
 	dom_alloc alloc;		/**< Memory (de)allocation function */
 	void *pw;			/**< Client-specific data */
 
@@ -40,6 +45,8 @@ static struct dom_string empty_string = {
 	.refcnt = 1
 };
 
+
+
 /**
  * Claim a reference on a DOM string
  *
@@ -60,8 +67,15 @@ void dom_string_ref(struct dom_string *str)
  */
 void dom_string_unref(struct dom_string *str)
 {
+	if (str == NULL)
+		return;
+	
 	if (--str->refcnt == 0) {
-		if (str->alloc != NULL) {
+		if (str->intern != NULL) {
+			lwc_context_unref(str->context);
+			lwc_context_string_unref(str->context, str->intern);
+			str->alloc(str, 0, str->pw);
+		} else if (str->alloc != NULL) {
 			str->alloc(str->ptr, 0, str->pw);
 			str->alloc(str, 0, str->pw);
 		}
@@ -71,11 +85,11 @@ void dom_string_unref(struct dom_string *str)
 /**
  * Create a DOM string from a string of characters
  *
- * \param alloc    Memory (de)allocation function
- * \param pw       Pointer to client-specific private data
- * \param ptr      Pointer to string of characters
- * \param len      Length, in bytes, of string of characters
- * \param str      Pointer to location to receive result
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data
+ * \param ptr    Pointer to string of characters
+ * \param len    Length, in bytes, of string of characters
+ * \param str    Pointer to location to receive result
  * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
  *
  * The returned string will already be referenced, so there is no need
@@ -89,7 +103,7 @@ dom_exception dom_string_create(dom_alloc alloc, void *pw,
 {
 	struct dom_string *ret;
 
-	if (ptr == NULL && len == 0) {
+	if (ptr == NULL || len == 0) {
 		dom_string_ref(&empty_string);
 
 		*str = &empty_string;
@@ -114,6 +128,9 @@ dom_exception dom_string_create(dom_alloc alloc, void *pw,
 	ret->alloc = alloc;
 	ret->pw = pw;
 
+	ret->intern = NULL;
+	ret->context = NULL;
+
 	ret->refcnt = 1;
 
 	*str = ret;
@@ -122,6 +139,152 @@ dom_exception dom_string_create(dom_alloc alloc, void *pw,
 }
 
 /**
+ * Clone a dom_string if necessary. This method is used to create a new string
+ * with a new allocator, but if the allocator is the same with the paramter 
+ * str, just ref the string.
+ *
+ * \param alloc  The new allocator for this string
+ * \param pw     The new pw for this string
+ * \param str    The source dom_string
+ * \param ret    The cloned dom_string
+ * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
+ *
+ * @note: When both the alloc and pw are the same as the str's, we need no
+ *	  real clone, just ref the source string is ok.
+ */
+dom_exception dom_string_clone(dom_alloc alloc, void *pw, 
+		struct dom_string *str, struct dom_string **ret)
+{
+	if (alloc == str->alloc && pw == str->pw) {
+		*ret = str;
+		dom_string_ref(str);
+		return DOM_NO_ERR;
+	}
+
+	if (str->intern != NULL) {
+		return _dom_string_create_from_lwcstring(alloc, pw,
+				str->context, str->intern, ret);
+	} else {
+		return dom_string_create(alloc, pw, str->ptr, str->len, ret);
+	}
+}
+
+/**
+ * Create a dom_string from a lwc_string
+ * 
+ * \param ctx  The lwc_context
+ * \param str  The lwc_string
+ * \param ret  The new dom_string
+ * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
+ */
+dom_exception _dom_string_create_from_lwcstring(dom_alloc alloc, void *pw,
+		lwc_context *ctx, lwc_string *str, struct dom_string **ret)
+{
+	dom_string *r;
+
+	if (str == NULL) {
+		*ret = NULL;
+		return DOM_NO_ERR;
+	}
+
+	r = alloc(NULL, sizeof(struct dom_string), pw);
+	if (r == NULL)
+		return DOM_NO_MEM_ERR;
+
+	if (str == NULL) {
+		*ret = &empty_string;
+		dom_string_ref(*ret);
+		return DOM_NO_ERR;
+	}
+
+	r->context = ctx;
+	r->intern = str;
+	r->ptr = (uint8_t *)lwc_string_data(str);
+	r->len = lwc_string_length(str);
+
+	r->alloc = alloc;
+	r->pw = pw;
+
+	r->refcnt = 1;
+
+	/* Ref the lwc_string */
+	lwc_context_ref(ctx);
+	lwc_context_string_ref(ctx, str);
+
+	*ret = r;
+	return DOM_NO_ERR;
+
+}
+
+/**
+ * Make the dom_string be interned in the lwc_context
+ *
+ * \param str     The dom_string to be interned
+ * \param ctx     The lwc_context to intern this dom_string
+ * \param lwcstr  The result lwc_string	
+ * \return DOM_NO_ERR on success, appropriate dom_exception on failure.
+ */
+dom_exception _dom_string_intern(struct dom_string *str, 
+		struct lwc_context_s *ctx, struct lwc_string_s **lwcstr)
+{
+	lwc_string *ret;
+	lwc_error lerr;
+
+	/* If this string is interned with the same context, do nothing */
+	if (str->context != NULL && str->context == ctx) {
+		*lwcstr = str->intern;
+		return DOM_NO_ERR;
+	}
+
+	lerr = lwc_context_intern(ctx, (const char *)str->ptr, str->len, &ret);
+	if (lerr != lwc_error_ok) {
+		return _dom_exception_from_lwc_error(lerr);
+	}
+
+	if (str->context != NULL) {
+		lwc_context_unref(str->context);
+		lwc_context_string_unref(str->context, str->intern);
+		str->ptr = NULL;
+	}
+
+	str->context = ctx;
+	str->intern = ret;
+	lwc_context_ref(ctx);
+	lwc_context_string_ref(ctx, ret);
+
+	if (str->ptr != NULL) {
+		str->alloc(str->ptr, 0, str->pw);
+	}
+
+	str->ptr = (uint8_t *) lwc_string_data(ret);
+
+	*lwcstr = ret;
+	return DOM_NO_ERR;
+}
+
+/**
+ * Get the internal lwc_string 
+ *
+ * \param str     The dom_string object
+ * \param ctx     The lwc_context which intern this dom_string
+ * \param lwcstr  The lwc_string of this dom-string
+ * \return DOM_NO_ERR on success, appropriate dom_exception on failure.
+ */
+dom_exception dom_string_get_intern(struct dom_string *str, 
+		struct lwc_context_s **ctx, struct lwc_string_s **lwcstr)
+{
+	*ctx = str->context;
+	*lwcstr = str->intern;
+
+	if (*ctx != NULL)
+		lwc_context_ref(*ctx);
+	if (*lwcstr != NULL)
+		lwc_context_string_ref(*ctx, *lwcstr);
+
+	return DOM_NO_ERR;
+}
+
+/**
  * Case sensitively compare two DOM strings
  *
  * \param s1  The first string to compare
@@ -132,12 +295,26 @@ dom_exception dom_string_create(dom_alloc alloc, void *pw,
  */
 int dom_string_cmp(struct dom_string *s1, struct dom_string *s2)
 {
+	bool ret;
+
 	if (s1 == NULL)
 		s1 = &empty_string;
 
 	if (s2 == NULL)
 		s2 = &empty_string;
 
+	if (s1->context == s2->context && s1->context != NULL) {
+		assert(s1->intern != NULL);
+		assert(s2->intern != NULL);
+		lwc_context_string_isequal(s1->context, s1->intern, 
+				s2->intern, &ret);
+		if (ret == true) {
+			return 0;
+		} else {
+			return -1;
+		}
+	}
+
 	if (s1->len != s2->len)
 		return 1;
 
@@ -164,6 +341,19 @@ int dom_string_icmp(struct dom_string *s1, struct dom_string *s2)
 	if (s2 == NULL)
 		s2 = &empty_string;
 
+	bool ret;
+	if (s1->context == s2->context && s1->context != NULL) {
+		assert(s1->intern != NULL);
+		assert(s2->intern != NULL);
+		lwc_context_string_caseless_isequal(s1->context, s1->intern, 
+				s2->intern, &ret);
+		if (ret == true) {
+			return 0;
+		} else {
+			return -1;
+		}
+	}
+
 	d1 = s1->ptr;
 	d2 = s2->ptr;
 	l1 = s1->len;
@@ -304,6 +494,56 @@ uint32_t dom_string_length(struct dom_string *str)
 	return clen;
 }
 
+/**
+ * Get the UCS4 character at position index
+ *
+ * \param index  The position of the charater
+ * \param ch     The UCS4 character
+ * \return DOM_NO_ERR on success, appropriate dom_exception on failure.
+ */
+dom_exception dom_string_at(struct dom_string *str, uint32_t index, 
+		uint32_t *ch)
+{
+	const uint8_t *s;
+	size_t clen, slen;
+	uint32_t c, i;
+	parserutils_error err;
+
+	if (str == NULL)
+		str = &empty_string;
+
+	s = str->ptr;
+	slen = str->len;
+
+	i = 0;
+
+	while (slen > 0) {
+		err = parserutils_charset_utf8_char_byte_length(s, &clen);
+		if (err != PARSERUTILS_OK) {
+			return (uint32_t) -1;
+		}
+
+		i++;
+		if (i == index + 1)
+			break;
+
+		s += clen;
+		slen -= clen;
+	}
+
+	if (i == index + 1) {
+		err = parserutils_charset_utf8_to_ucs4(s, slen, &c, &clen);
+		if (err != PARSERUTILS_OK) {
+			return (uint32_t) -1;
+		}
+
+		*ch = c;
+		return DOM_NO_ERR;
+	} else {
+		return DOM_DOMSTRING_SIZE_ERR;
+	}
+}
+
 /** 
  * Concatenate two dom strings 
  * 
@@ -322,16 +562,33 @@ dom_exception dom_string_concat(struct dom_string *s1, struct dom_string *s2,
 		struct dom_string **result)
 {
 	struct dom_string *concat;
+	dom_alloc alloc;
+	void *pw;
+
+	assert(s1 != NULL);
+	assert(s2 != NULL);
+
+	if (s1->alloc != NULL) {
+		alloc = s1->alloc;
+		pw = s1->pw;
+	} else if (s2->alloc != NULL) {
+		alloc = s2->alloc;
+		pw = s2->pw;
+	} else {
+		/* s1 == s2 == empty_string */
+		*result = &empty_string;
+		return DOM_NO_ERR;
+	}
 
-	concat = s1->alloc(NULL, sizeof(struct dom_string), s1->pw);
+	concat = alloc(NULL, sizeof(struct dom_string), pw);
 
 	if (concat == NULL) {
 		return DOM_NO_MEM_ERR;
 	}
 
-	concat->ptr = s1->alloc(NULL, s1->len + s2->len, s1->pw);
+	concat->ptr = alloc(NULL, s1->len + s2->len, pw);
 	if (concat->ptr == NULL) {
-		s1->alloc(concat, 0, s1->pw);
+		alloc(concat, 0, pw);
 
 		return DOM_NO_MEM_ERR;
 	}
@@ -342,8 +599,10 @@ dom_exception dom_string_concat(struct dom_string *s1, struct dom_string *s2,
 
 	concat->len = s1->len + s2->len;
 
-	concat->alloc = s1->alloc;
-	concat->pw = s1->pw;
+	concat->alloc = alloc;
+	concat->pw = pw;
+	concat->context = NULL;
+	concat->intern = NULL;
 
 	concat->refcnt = 1;
 
@@ -382,7 +641,7 @@ dom_exception dom_string_substr(struct dom_string *str,
 
 	/* Calculate the byte index of the start */
 	while (i1 > 0) {
-		err = parserutils_charset_utf8_next(s, slen - b1, b1, &b1);
+		err = parserutils_charset_utf8_next(s, slen, b1, &b1);
 		if (err != PARSERUTILS_OK) {
 			return DOM_NO_MEM_ERR;
 		}
@@ -395,7 +654,7 @@ dom_exception dom_string_substr(struct dom_string *str,
 
 	/* Calculate the byte index of the end */
 	while (i2 > 0) {
-		err = parserutils_charset_utf8_next(s, slen - b2, b2, &b2);
+		err = parserutils_charset_utf8_next(s, slen, b2, &b2);
 		if (err != PARSERUTILS_OK) {
 			return DOM_NO_MEM_ERR;
 		}
@@ -451,7 +710,7 @@ dom_exception dom_string_insert(struct dom_string *target,
 		ins = tlen;
 	} else {
 		while (offset > 0) {
-			err = parserutils_charset_utf8_next(t, tlen - ins, 
+			err = parserutils_charset_utf8_next(t, tlen, 
 					ins, &ins);
 
 			if (err != PARSERUTILS_OK) {
@@ -492,6 +751,8 @@ dom_exception dom_string_insert(struct dom_string *target,
 
 	res->alloc = target->alloc;
 	res->pw = target->pw;
+	res->intern = NULL;
+	res->context = NULL;
 	
 	res->refcnt = 1;
 
@@ -526,6 +787,9 @@ dom_exception dom_string_replace(struct dom_string *target,
 	uint32_t b1, b2;
 	parserutils_error err;
 
+	if (source == NULL)
+		source = &empty_string;
+
 	t = target->ptr;
 	tlen = target->len;
 	s = source->ptr;
@@ -538,7 +802,7 @@ dom_exception dom_string_replace(struct dom_string *target,
 
 	/* Calculate the byte index of the start */
 	while (i1 > 0) {
-		err = parserutils_charset_utf8_next(s, slen - b1, b1, &b1);
+		err = parserutils_charset_utf8_next(s, slen, b1, &b1);
 
 		if (err != PARSERUTILS_OK) {
 			return DOM_NO_MEM_ERR;
@@ -552,7 +816,7 @@ dom_exception dom_string_replace(struct dom_string *target,
 
 	/* Calculate the byte index of the end */
 	while (i2 > 0) {
-		err = parserutils_charset_utf8_next(s, slen - b2, b2, &b2);
+		err = parserutils_charset_utf8_next(s, slen, b2, &b2);
 
 		if (err != PARSERUTILS_OK) {
 			return DOM_NO_MEM_ERR;
@@ -594,6 +858,8 @@ dom_exception dom_string_replace(struct dom_string *target,
 
 	res->alloc = target->alloc;
 	res->pw = target->pw;
+	res->intern = NULL;
+	res->context = NULL;
 
 	res->refcnt = 1;
 
@@ -618,8 +884,13 @@ dom_exception dom_string_replace(struct dom_string *target,
 dom_exception dom_string_dup(struct dom_string *str, 
 		struct dom_string **result)
 {
-	return dom_string_create(str->alloc, str->pw, str->ptr, str->len, 
-			result);
+	if (str->intern != NULL) {
+		return _dom_string_create_from_lwcstring(str->alloc, str->pw,
+				str->context, str->intern, result);
+	} else {
+		return dom_string_create(str->alloc, str->pw, str->ptr,
+				str->len, result);
+	}
 }
 
 /**
@@ -645,3 +916,48 @@ uint32_t dom_string_hash(struct dom_string *str)
 	return hash;
 }
 
+/**
+ * Convert a lwc_error to a dom_exception
+ * 
+ * \param err  The input lwc_error
+ * \return the dom_exception
+ */
+dom_exception _dom_exception_from_lwc_error(lwc_error err)
+{
+	switch (err) {
+		case lwc_error_ok:
+			return DOM_NO_ERR;
+		case lwc_error_oom:
+			return DOM_NO_MEM_ERR;
+		case lwc_error_range:
+			return DOM_INDEX_SIZE_ERR;
+	}
+	assert ("Unknow lwc_error, can't convert to dom_exception");
+	/* Suppress compile errors */
+	return DOM_NO_ERR;
+}
+
+/**
+ * Compare the raw data of two lwc_strings for equality when the two strings
+ * belong to different lwc_context 
+ * 
+ * \param s1  The first lwc_string
+ * \param s2  The second lwc_string
+ * \return 0 for equal, non-zero otherwise
+ */
+int _dom_lwc_string_compare_raw(struct lwc_string_s *s1,
+		struct lwc_string_s *s2)
+{
+	const char *rs1, *rs2;
+	size_t len;
+
+	if (lwc_string_length(s1) != lwc_string_length(s2))
+		return -1;
+	
+	len = lwc_string_length(s1);
+	rs1 = lwc_string_data(s1);
+	rs2 = lwc_string_data(s2);
+
+	return memcmp(rs1, rs2, len);
+}
+