summaryrefslogtreecommitdiff
path: root/src/core
diff options
context:
space:
mode:
author John Mark Bell <jmb@netsurf-browser.org> 2007-09-30 21:10:50 +0000
committer John Mark Bell <jmb@netsurf-browser.org> 2007-09-30 21:10:50 +0000
commit e0e38d906c8974bb22a0368a9709af9590362927 (patch)
tree f7f4c1acff769e9a7f6cd0f1c037ba2c28a66593 /src/core
parent 49e419c9b75cc149e7f4c898c31aed33f4b2c960 (diff)
download libdom-e0e38d906c8974bb22a0368a9709af9590362927.tar.gz
libdom-e0e38d906c8974bb22a0368a9709af9590362927.tar.bz2
DOM Strings are now capable of containing either UTF-8 or UTF-16 encoded data.
The charset used for strings within a document is specified at document creation time. Whilst it is possible to mix charsets within a document, it's not recommended.

Things that need fixing:
+ dom_string_get_data() doesn't return the charset. Better would be to permit the client to request a charset for the data to be returned in.
+ Interned node name strings will break if the document is UTF-16 (dom_document_create()). In fact, these could quite happily be globals, rather than allocating a set for each document.
+ Other usage of dom string constructors needs checking for sanity.
+ DOM Strings need to gain more utility APIs (such as getting the character length of a string, string concatenation etc).

svn path=/trunk/dom/; revision=3614
Diffstat (limited to 'src/core')
-rw-r--r-- src/core/document.c 44
-rw-r--r-- src/core/document.h 5
-rw-r--r-- src/core/implementation.c 4
-rw-r--r-- src/core/string.c 93
4 files changed, 126 insertions, 20 deletions
diff --git a/src/core/document.c b/src/core/document.c
index 5148224..e188868 100644
--- a/src/core/document.c
+++ b/src/core/document.c
@@ -56,6 +56,8 @@ struct dom_doc_nnm {
struct dom_document {
struct dom_node base; /**< Base node */
+ dom_string_charset charset; /**< Charset of strings in document */
+
struct dom_implementation *impl; /**< Owning implementation */
struct dom_doc_nl *nodelists; /**< List of active nodelists */
@@ -73,10 +75,11 @@ struct dom_document {
/**
* Create a Document
*
- * \param impl The DOM implementation owning the document
- * \param alloc Memory (de)allocation function
- * \param pw Pointer to client-specific private data
- * \param doc Pointer to location to receive created document
+ * \param impl The DOM implementation owning the document
+ * \param charset The charset used for strings in the document
+ * \param alloc Memory (de)allocation function
+ * \param pw Pointer to client-specific private data
+ * \param doc Pointer to location to receive created document
* \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion.
*
* ::impl will have its reference count increased.
@@ -84,7 +87,8 @@ struct dom_document {
* The returned document will already be referenced.
*/
dom_exception dom_document_create(struct dom_implementation *impl,
- dom_alloc alloc, void *pw, struct dom_document **doc)
+ dom_string_charset charset, dom_alloc alloc, void *pw,
+ struct dom_document **doc)
{
static const char *names[DOM_NODE_TYPE_COUNT + 1] = {
NULL, /* Unused */
@@ -110,6 +114,7 @@ dom_exception dom_document_create(struct dom_implementation *impl,
return DOM_NO_MEM_ERR;
/* Set up document allocation context - must be first */
+ d->charset = charset;
d->alloc = alloc;
d->pw = pw;
@@ -994,6 +999,35 @@ const uint8_t *dom_document_get_base(struct dom_document *doc)
}
/**
+ * Set the document buffer pointer (currently an unimplemented stub)
+ *
+ * \param doc Document to set buffer pointer of
+ * \param buffer Pointer to buffer
+ * \param buffer_len Length of buffer, in bytes
+ *
+ * Documented contract: buffer ownership passes to the document; call once per
+ * document node. The body below only marks its arguments UNUSED for now.
+ */
+void dom_document_set_buffer(struct dom_document *doc, uint8_t *buffer,
+ size_t buffer_len)
+{
+ UNUSED(doc);
+ UNUSED(buffer);
+ UNUSED(buffer_len);
+}
+
+/**
+ * Retrieve the character set used to encode strings in the document
+ *
+ * \param doc The document to get the charset of (dereferenced unchecked)
+ * \return The value of the document's charset field
+ */
+dom_string_charset dom_document_get_charset(struct dom_document *doc)
+{
+ return doc->charset;
+}
+
+/**
* (De)allocate memory with a document's context
*
* \param doc The document context to allocate from
diff --git a/src/core/document.h b/src/core/document.h
index 367b1ec..5149f2e 100644
--- a/src/core/document.h
+++ b/src/core/document.h
@@ -12,12 +12,12 @@
#include <stddef.h>
#include <dom/core/node.h>
+#include <dom/core/string.h>
struct dom_document;
struct dom_namednodemap;
struct dom_node;
struct dom_nodelist;
-struct dom_string;
/* Destroy a document */
void dom_document_destroy(struct dom_document *doc);
@@ -25,6 +25,9 @@ void dom_document_destroy(struct dom_document *doc);
/* Get base of document buffer */
const uint8_t *dom_document_get_base(struct dom_document *doc);
+/* Get the document character set */
+dom_string_charset dom_document_get_charset(struct dom_document *doc);
+
/* (De)allocate memory */
void *dom_document_alloc(struct dom_document *doc, void *ptr, size_t size);
diff --git a/src/core/implementation.c b/src/core/implementation.c
index e37b27d..9738b7c 100644
--- a/src/core/implementation.c
+++ b/src/core/implementation.c
@@ -94,6 +94,7 @@ dom_exception dom_implementation_create_document_type(
* \param qname The qualified name of the document element
* \param doctype The type of document to create
* \param doc Pointer to location to receive result
+ * \param charset The charset to use for strings in the document
* \param alloc Memory (de)allocation function
* \param pw Pointer to client-specific private data
* \return DOM_NO_ERR on success,
@@ -126,10 +127,11 @@ dom_exception dom_implementation_create_document(
struct dom_string *namespace, struct dom_string *qname,
struct dom_document_type *doctype,
struct dom_document **doc,
+ dom_string_charset charset,
dom_alloc alloc, void *pw)
{
return impl->create_document(impl, namespace, qname, doctype, doc,
- alloc, pw);
+ charset, alloc, pw);
}
/**
diff --git a/src/core/string.c b/src/core/string.c
index d43c571..faa3c85 100644
--- a/src/core/string.c
+++ b/src/core/string.c
@@ -5,6 +5,7 @@
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
+#include <ctype.h>
#include <inttypes.h>
#include <string.h>
@@ -12,6 +13,8 @@
#include "core/document.h"
#include "utils/utils.h"
+#include "utils/utf8.h"
+#include "utils/utf16.h"
/**
* A DOM string
@@ -28,6 +31,8 @@ struct dom_string {
DOM_STRING_PTR_NODOC
} type; /**< String type */
+ dom_string_charset charset; /**< Charset of string */
+
union {
uint8_t *ptr;
const uint8_t *cptr;
@@ -49,7 +54,8 @@ struct dom_string {
};
static struct dom_string empty_string = {
- .type = DOM_STRING_CONST_PTR,
+ .type = DOM_STRING_CONST_PTR,
+ .charset = DOM_STRING_UTF8,
.data.ptr = NULL,
.len = 0,
.ctx.doc = NULL,
@@ -116,6 +122,8 @@ dom_exception dom_string_create_from_off(struct dom_document *doc,
ret->type = DOM_STRING_OFFSET;
+ ret->charset = dom_document_get_charset(doc);
+
ret->data.offset = off;
ret->len = len;
@@ -161,6 +169,8 @@ dom_exception dom_string_create_from_ptr(struct dom_document *doc,
ret->type = DOM_STRING_PTR;
+ ret->charset = dom_document_get_charset(doc);
+
memcpy(ret->data.ptr, ptr, len);
ret->len = len;
@@ -200,6 +210,8 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc,
ret->type = DOM_STRING_CONST_PTR;
+ ret->charset = dom_document_get_charset(doc);
+
ret->data.cptr = ptr;
ret->len = len;
@@ -217,11 +229,12 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc,
* Create a DOM string from a string of characters that does not belong
* to a document
*
- * \param alloc Memory (de)allocation function
- * \param pw Pointer to client-specific private data
- * \param ptr Pointer to string of characters
- * \param len Length, in bytes, of string of characters
- * \param str Pointer to location to receive result
+ * \param alloc Memory (de)allocation function
+ * \param pw Pointer to client-specific private data
+ * \param charset The charset of the string
+ * \param ptr Pointer to string of characters
+ * \param len Length, in bytes, of string of characters
+ * \param str Pointer to location to receive result
* \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
*
* The returned string will already be referenced, so there is no need
@@ -231,7 +244,8 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc,
* returned DOM string.
*/
dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw,
- const uint8_t *ptr, size_t len, struct dom_string **str)
+ dom_string_charset charset, const uint8_t *ptr, size_t len,
+ struct dom_string **str)
{
struct dom_string *ret;
@@ -247,6 +261,8 @@ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw,
ret->type = DOM_STRING_PTR_NODOC;
+ ret->charset = charset;
+
memcpy(ret->data.ptr, ptr, len);
ret->len = len;
@@ -324,10 +340,35 @@ int dom_string_cmp(struct dom_string *s1, struct dom_string *s2)
if (err != DOM_NO_ERR)
return 1; /* arbitrary */
- if (l1 != l2)
- return 1; /* arbitrary */
+ while (l1 > 0 && l2 > 0) {
+ uint32_t c1, c2;
+ size_t cl1, cl2;
+ charset_error err;
+
+ err = (s1->charset == DOM_STRING_UTF8)
+ ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1)
+ : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1);
+ if (err != CHARSET_OK) {
+ }
+
+ err = (s2->charset == DOM_STRING_UTF8)
+ ? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2)
+ : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2);
+ if (err != CHARSET_OK) {
+ }
+
+ if (c1 != c2) {
+ return (int)(c1 - c2);
+ }
- return strncmp((const char *) d1, (const char *) d2, l1);
+ d1 += cl1;
+ d2 += cl2;
+
+ l1 -= cl1;
+ l2 -= cl2;
+ }
+
+ return (int)(l1 - l2);
}
/**
@@ -354,9 +395,35 @@ int dom_string_icmp(struct dom_string *s1, struct dom_string *s2)
if (err != DOM_NO_ERR)
return 1; /* arbitrary */
- if (l1 != l2)
- return 1; /* arbitrary */
+ while (l1 > 0 && l2 > 0) {
+ uint32_t c1, c2;
+ size_t cl1, cl2;
+ charset_error err;
+
+ err = (s1->charset == DOM_STRING_UTF8)
+ ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1)
+ : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1);
+ if (err != CHARSET_OK) {
+ }
+
+ err = (s2->charset == DOM_STRING_UTF8)
+ ? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2)
+ : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2);
+ if (err != CHARSET_OK) {
+ }
+
+ /** \todo improved lower-casing algorithm */
+ if (tolower(c1) != tolower(c2)) {
+ return (int)(tolower(c1) - tolower(c2));
+ }
+
+ d1 += cl1;
+ d2 += cl2;
+
+ l1 -= cl1;
+ l2 -= cl2;
+ }
- return strncasecmp((const char *) d1, (const char *) d2, l1);
+ return (int)(l1 - l2);
}