From e0e38d906c8974bb22a0368a9709af9590362927 Mon Sep 17 00:00:00 2001
From: John Mark Bell <jmb@netsurf-browser.org>
Date: Sun, 30 Sep 2007 21:10:50 +0000
Subject: DOM Strings are now capable of containing either UTF-8 or UTF-16
 encoded data. The charset used for strings within a document is specified at
 document creation time. Whilst it is possible to mix charsets within a
 document, it's not recommended.

Things that need fixing:

+ dom_string_get_data() doesn't return the charset. It would be better to let
  the client request the charset in which the data is returned.
+ Interned node name strings will break if the document is UTF-16
  (dom_document_create()). In fact, these could quite happily be globals,
  rather than allocating a set for each document.
+ Other uses of the DOM string constructors need checking for sanity.
+ DOM Strings need to gain more utility APIs (such as getting the character
  length of a string, string concatenation etc).


svn path=/trunk/dom/; revision=3614
---
 src/core/string.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 80 insertions(+), 13 deletions(-)

(limited to 'src/core/string.c')

diff --git a/src/core/string.c b/src/core/string.c
index d43c571..faa3c85 100644
--- a/src/core/string.c
+++ b/src/core/string.c
@@ -5,6 +5,7 @@
  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
  */
 
+#include <ctype.h>
 #include <inttypes.h>
 #include <string.h>
 
@@ -12,6 +13,8 @@
 
 #include "core/document.h"
 #include "utils/utils.h"
+#include "utils/utf8.h"
+#include "utils/utf16.h"
 
 /**
  * A DOM string
@@ -28,6 +31,8 @@ struct dom_string {
 	       DOM_STRING_PTR_NODOC
 	} type;				/**< String type */
 
+	dom_string_charset charset;	/**< Charset of string */
+
 	union {
 		uint8_t *ptr;
 		const uint8_t *cptr;
@@ -49,7 +54,8 @@ struct dom_string {
 };
 
 static struct dom_string empty_string = { 
-	.type = DOM_STRING_CONST_PTR, 
+	.type = DOM_STRING_CONST_PTR,
+	.charset = DOM_STRING_UTF8,
 	.data.ptr = NULL,
 	.len = 0,
 	.ctx.doc = NULL,
@@ -116,6 +122,8 @@ dom_exception dom_string_create_from_off(struct dom_document *doc,
 
 	ret->type = DOM_STRING_OFFSET;
 
+	ret->charset = dom_document_get_charset(doc);
+
 	ret->data.offset = off;
 
 	ret->len = len;
@@ -161,6 +169,8 @@ dom_exception dom_string_create_from_ptr(struct dom_document *doc,
 
 	ret->type = DOM_STRING_PTR;
 
+	ret->charset = dom_document_get_charset(doc);
+
 	memcpy(ret->data.ptr, ptr, len);
 
 	ret->len = len;
@@ -200,6 +210,8 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc,
 
 	ret->type = DOM_STRING_CONST_PTR;
 
+	ret->charset = dom_document_get_charset(doc);
+
 	ret->data.cptr = ptr;
 
 	ret->len = len;
@@ -217,11 +229,12 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc,
  * Create a DOM string from a string of characters that does not belong
  * to a document
  *
- * \param alloc  Memory (de)allocation function
- * \param pw     Pointer to client-specific private data
- * \param ptr    Pointer to string of characters
- * \param len    Length, in bytes, of string of characters
- * \param str    Pointer to location to receive result
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data
+ * \param charset  The charset of the string
+ * \param ptr      Pointer to string of characters
+ * \param len      Length, in bytes, of string of characters
+ * \param str      Pointer to location to receive result
  * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
  *
  * The returned string will already be referenced, so there is no need
@@ -231,7 +244,8 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc,
  * returned DOM string.
  */
 dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw,
-		const uint8_t *ptr, size_t len, struct dom_string **str)
+		dom_string_charset charset, const uint8_t *ptr, size_t len, 
+		struct dom_string **str)
 {
 	struct dom_string *ret;
 
@@ -247,6 +261,8 @@ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw,
 
 	ret->type = DOM_STRING_PTR_NODOC;
 
+	ret->charset = charset;
+
 	memcpy(ret->data.ptr, ptr, len);
 
 	ret->len = len;
@@ -324,10 +340,35 @@ int dom_string_cmp(struct dom_string *s1, struct dom_string *s2)
 	if (err != DOM_NO_ERR)
 		return 1; /* arbitrary */
 
-	if (l1 != l2)
-		return 1; /* arbitrary */
+	while (l1 > 0 && l2 > 0) {
+		uint32_t c1, c2;
+		size_t cl1, cl2;
+		charset_error err;
+
+		err = (s1->charset == DOM_STRING_UTF8) 
+				? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) 
+				: _dom_utf16_to_ucs4(d1, l1, &c1, &cl1);
+		if (err != CHARSET_OK) {
+		}
+
+		err = (s2->charset == DOM_STRING_UTF8)
+				? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2)
+				: _dom_utf16_to_ucs4(d2, l2, &c2, &cl2);
+		if (err != CHARSET_OK) {
+		}
+
+		if (c1 != c2) {
+			return (int)(c1 - c2);
+		}
 
-	return strncmp((const char *) d1, (const char *) d2, l1);
+		d1 += cl1;
+		d2 += cl2;
+
+		l1 -= cl1;
+		l2 -= cl2;
+	}
+
+	return (int)(l1 - l2);
 }
 
 /**
@@ -354,9 +395,35 @@ int dom_string_icmp(struct dom_string *s1, struct dom_string *s2)
 	if (err != DOM_NO_ERR)
 		return 1; /* arbitrary */
 
-	if (l1 != l2)
-		return 1; /* arbitrary */
+	while (l1 > 0 && l2 > 0) {
+		uint32_t c1, c2;
+		size_t cl1, cl2;
+		charset_error err;
+
+		err = (s1->charset == DOM_STRING_UTF8) 
+				? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) 
+				: _dom_utf16_to_ucs4(d1, l1, &c1, &cl1);
+		if (err != CHARSET_OK) {
+		}
+
+		err = (s2->charset == DOM_STRING_UTF8)
+				? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2)
+				: _dom_utf16_to_ucs4(d2, l2, &c2, &cl2);
+		if (err != CHARSET_OK) {
+		}
+
+		/** \todo improved lower-casing algorithm */
+		if (tolower(c1) != tolower(c2)) {
+			return (int)(tolower(c1) - tolower(c2));
+		}
+
+		d1 += cl1;
+		d2 += cl2;
+
+		l1 -= cl1;
+		l2 -= cl2;
+	}
 
-	return strncasecmp((const char *) d1, (const char *) d2, l1);
+	return (int)(l1 - l2);
 }
 
-- 
cgit v1.2.3