From e0e38d906c8974bb22a0368a9709af9590362927 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sun, 30 Sep 2007 21:10:50 +0000 Subject: DOM Strings are now capable of containing either UTF-8 or UTF-16 encoded data. The charset used for strings within a document is specified at document creation time. Whilst it is possible to mix charsets within a document, it's not recommended. Things that need fixing: + dom_string_get_data() doesn't return the charset. It would be better to permit the client to request the charset in which the data is returned. + Interned node name strings will break if the document is UTF-16 (dom_document_create()). In fact, these could quite happily be globals, rather than allocating a set for each document. + Other uses of the DOM string constructors need checking for sanity. + DOM Strings need to gain more utility APIs (such as getting the character length of a string, string concatenation, etc.). svn path=/trunk/dom/; revision=3614 --- src/core/string.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 80 insertions(+), 13 deletions(-) (limited to 'src/core/string.c') diff --git a/src/core/string.c b/src/core/string.c index d43c571..faa3c85 100644 --- a/src/core/string.c +++ b/src/core/string.c @@ -5,6 +5,7 @@ * Copyright 2007 John-Mark Bell */ +#include #include #include @@ -12,6 +13,8 @@ #include "core/document.h" #include "utils/utils.h" +#include "utils/utf8.h" +#include "utils/utf16.h" /** * A DOM string @@ -28,6 +31,8 @@ struct dom_string { DOM_STRING_PTR_NODOC } type; /**< String type */ + dom_string_charset charset; /**< Charset of string */ + union { uint8_t *ptr; const uint8_t *cptr; @@ -49,7 +54,8 @@ struct dom_string { }; static struct dom_string empty_string = { - .type = DOM_STRING_CONST_PTR, + .type = DOM_STRING_CONST_PTR, + .charset = DOM_STRING_UTF8, .data.ptr = NULL, .len = 0, .ctx.doc = NULL, @@ -116,6 +122,8 @@ dom_exception dom_string_create_from_off(struct dom_document *doc, ret->type = 
DOM_STRING_OFFSET; + ret->charset = dom_document_get_charset(doc); + ret->data.offset = off; ret->len = len; @@ -161,6 +169,8 @@ dom_exception dom_string_create_from_ptr(struct dom_document *doc, ret->type = DOM_STRING_PTR; + ret->charset = dom_document_get_charset(doc); + memcpy(ret->data.ptr, ptr, len); ret->len = len; @@ -200,6 +210,8 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, ret->type = DOM_STRING_CONST_PTR; + ret->charset = dom_document_get_charset(doc); + ret->data.cptr = ptr; ret->len = len; @@ -217,11 +229,12 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, * Create a DOM string from a string of characters that does not belong * to a document * - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data - * \param ptr Pointer to string of characters - * \param len Length, in bytes, of string of characters - * \param str Pointer to location to receive result + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data + * \param charset The charset of the string + * \param ptr Pointer to string of characters + * \param len Length, in bytes, of string of characters + * \param str Pointer to location to receive result * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion * * The returned string will already be referenced, so there is no need @@ -231,7 +244,8 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, * returned DOM string. 
*/ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, - const uint8_t *ptr, size_t len, struct dom_string **str) + dom_string_charset charset, const uint8_t *ptr, size_t len, + struct dom_string **str) { struct dom_string *ret; @@ -247,6 +261,8 @@ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, ret->type = DOM_STRING_PTR_NODOC; + ret->charset = charset; + memcpy(ret->data.ptr, ptr, len); ret->len = len; @@ -324,10 +340,35 @@ int dom_string_cmp(struct dom_string *s1, struct dom_string *s2) if (err != DOM_NO_ERR) return 1; /* arbitrary */ - if (l1 != l2) - return 1; /* arbitrary */ + while (l1 > 0 && l2 > 0) { + uint32_t c1, c2; + size_t cl1, cl2; + charset_error err; + + err = (s1->charset == DOM_STRING_UTF8) + ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) + : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1); + if (err != CHARSET_OK) { + } + + err = (s2->charset == DOM_STRING_UTF8) + ? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2) + : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2); + if (err != CHARSET_OK) { + } + + if (c1 != c2) { + return (int)(c1 - c2); + } - return strncmp((const char *) d1, (const char *) d2, l1); + d1 += cl1; + d2 += cl2; + + l1 -= cl1; + l2 -= cl2; + } + + return (int)(l1 - l2); } /** @@ -354,9 +395,35 @@ int dom_string_icmp(struct dom_string *s1, struct dom_string *s2) if (err != DOM_NO_ERR) return 1; /* arbitrary */ - if (l1 != l2) - return 1; /* arbitrary */ + while (l1 > 0 && l2 > 0) { + uint32_t c1, c2; + size_t cl1, cl2; + charset_error err; + + err = (s1->charset == DOM_STRING_UTF8) + ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) + : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1); + if (err != CHARSET_OK) { + } + + err = (s2->charset == DOM_STRING_UTF8) + ? 
_dom_utf8_to_ucs4(d2, l2, &c2, &cl2) + : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2); + if (err != CHARSET_OK) { + } + + /** \todo improved lower-casing algorithm */ + if (tolower(c1) != tolower(c2)) { + return (int)(tolower(c1) - tolower(c2)); + } + + d1 += cl1; + d2 += cl2; + + l1 -= cl1; + l2 -= cl2; + } - return strncasecmp((const char *) d1, (const char *) d2, l1); + return (int)(l1 - l2); } -- cgit v1.2.3