From 613f88393960853513873756933bd23b93543a33 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sun, 30 Sep 2007 21:10:50 +0000 Subject: DOM Strings are now capable of containing either UTF-8 or UTF-16 encoded data. The charset used for strings within a document is specified at document creation time. Whilst it is possible to mix charsets within a document, it's not recommended. Things that need fixing: + dom_string_get_data() doesn't return the charset. Better would be to permit the client to request a charset for the data to be returned in. + Interned node name strings will break if the document is UTF-16 (dom_document_create()). In fact, these could quite happily be globals, rather than allocating a set for each document. + Other usage of dom string constructors need checking for sanity + DOM Strings need to gain more utility APIs (such as getting the character length of a string, string concatenation etc). svn path=/trunk/dom/; revision=3614 --- bindings/xml/xmlbinding.c | 5 +- bindings/xml/xmlparser.c | 3 + include/dom/bootstrap/implpriv.h | 11 +- include/dom/core/implementation.h | 3 +- include/dom/core/string.h | 8 +- src/core/document.c | 44 ++++- src/core/document.h | 5 +- src/core/implementation.c | 4 +- src/core/string.c | 93 ++++++++-- src/utils/Makefile | 2 +- src/utils/charset_errors.h | 19 ++ src/utils/namespace.c | 2 + src/utils/utf16.c | 239 +++++++++++++++++++++++++ src/utils/utf16.h | 38 ++++ src/utils/utf8.c | 368 ++++++++++++++++++++++++++++++++++++++ src/utils/utf8.h | 38 ++++ 16 files changed, 856 insertions(+), 26 deletions(-) create mode 100644 src/utils/charset_errors.h create mode 100644 src/utils/utf16.c create mode 100644 src/utils/utf16.h create mode 100644 src/utils/utf8.c create mode 100644 src/utils/utf8.h diff --git a/bindings/xml/xmlbinding.c b/bindings/xml/xmlbinding.c index fbdc4c9..7b2475e 100644 --- a/bindings/xml/xmlbinding.c +++ b/bindings/xml/xmlbinding.c @@ -39,6 +39,7 @@ static dom_exception 
xml_dom_implementation_create_document( struct dom_string *qname, struct dom_document_type *doctype, struct dom_document **doc, + dom_string_charset charset, dom_alloc alloc, void *pw); static dom_exception xml_dom_implementation_get_feature( struct dom_implementation *impl, @@ -237,6 +238,7 @@ dom_exception xml_dom_implementation_create_document_type( * \param qname The qualified name of the document element * \param doctype The type of document to create * \param doc Pointer to location to receive result + * \param charset The charset to use for strings in the document * \param alloc Memory (de)allocation function * \param pw Pointer to client-specific private data * \return DOM_NO_ERR on success, @@ -273,13 +275,14 @@ dom_exception xml_dom_implementation_create_document( struct dom_string *qname, struct dom_document_type *doctype, struct dom_document **doc, + dom_string_charset charset, dom_alloc alloc, void *pw) { struct dom_document *d; dom_exception err; /* Create document object */ - err = dom_document_create(impl, alloc, pw, &d); + err = dom_document_create(impl, charset, alloc, pw, &d); if (err != DOM_NO_ERR) return err; diff --git a/bindings/xml/xmlparser.c b/bindings/xml/xmlparser.c index 6f1516b..9541a7c 100644 --- a/bindings/xml/xmlparser.c +++ b/bindings/xml/xmlparser.c @@ -182,6 +182,7 @@ xml_parser *xml_parser_create(const char *enc, const char *int_enc, /* Create key for user data registration */ err = dom_string_create_from_ptr_no_doc((dom_alloc) alloc, pw, + DOM_STRING_UTF8, (const uint8_t *) "__xmlnode", SLEN("__xmlnode"), &parser->udkey); if (err != DOM_NO_ERR) { @@ -194,6 +195,7 @@ xml_parser *xml_parser_create(const char *enc, const char *int_enc, /* Get DOM implementation */ /* Create a string representation of the features we want */ err = dom_string_create_from_ptr_no_doc((dom_alloc) alloc, pw, + DOM_STRING_UTF8, (const uint8_t *) "XML", SLEN("XML"), &features); if (err != DOM_NO_ERR) { dom_string_unref(parser->udkey); @@ -327,6 +329,7 @@ 
void xml_parser_start_document(void *ctx) /* qname */ NULL, /* doctype */ NULL, &doc, + DOM_STRING_UTF8, (dom_alloc) parser->alloc, parser->pw); if (err != DOM_NO_ERR) { diff --git a/include/dom/bootstrap/implpriv.h b/include/dom/bootstrap/implpriv.h index 36359c5..97806a8 100644 --- a/include/dom/bootstrap/implpriv.h +++ b/include/dom/bootstrap/implpriv.h @@ -29,10 +29,10 @@ #include #include +#include struct dom_document; struct dom_document_type; -struct dom_string; /** * DOM Implementation @@ -94,6 +94,7 @@ struct dom_implementation { * \param qname The qualified name of the document element * \param doctype The type of document to create * \param doc Pointer to location to receive result + * \param charset The charset to use for strings in the document * \param alloc Memory (de)allocation function * \param pw Pointer to client-specific private data * \return DOM_NO_ERR on success, @@ -129,6 +130,7 @@ struct dom_implementation { struct dom_string *qname, struct dom_document_type *doctype, struct dom_document **doc, + dom_string_charset charset, dom_alloc alloc, void *pw); /** @@ -249,7 +251,12 @@ dom_exception dom_register_source(struct dom_implementation_source *source, /* Create a DOM document */ dom_exception dom_document_create(struct dom_implementation *impl, - dom_alloc alloc, void *pw, struct dom_document **doc); + dom_string_charset charset, dom_alloc alloc, void *pw, + struct dom_document **doc); + +/* Set a document's buffer */ +void dom_document_set_buffer(struct dom_document *doc, uint8_t *buffer, + size_t buffer_len); /* Create a DOM document type */ dom_exception dom_document_type_create(struct dom_string *qname, diff --git a/include/dom/core/implementation.h b/include/dom/core/implementation.h index 3f42ab7..a51493f 100644 --- a/include/dom/core/implementation.h +++ b/include/dom/core/implementation.h @@ -12,11 +12,11 @@ #include #include +#include struct dom_document; struct dom_document_type; struct dom_implementation; -struct dom_string; void 
dom_implementation_ref(struct dom_implementation *impl); void dom_implementation_unref(struct dom_implementation *impl); @@ -37,6 +37,7 @@ dom_exception dom_implementation_create_document( struct dom_string *namespace, struct dom_string *qname, struct dom_document_type *doctype, struct dom_document **doc, + dom_string_charset charset, dom_alloc alloc, void *pw); dom_exception dom_implementation_get_feature( diff --git a/include/dom/core/string.h b/include/dom/core/string.h index c9ffd97..935e2c2 100644 --- a/include/dom/core/string.h +++ b/include/dom/core/string.h @@ -17,6 +17,11 @@ struct dom_document; struct dom_string; +typedef enum { + DOM_STRING_UTF8, + DOM_STRING_UTF16 +} dom_string_charset; + /* Claim a reference on a DOM string */ void dom_string_ref(struct dom_string *str); /* Release a reference on a DOM string */ @@ -34,7 +39,8 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, /* Create a DOM string from a string of characters that does not belong * to a document */ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, - const uint8_t *ptr, size_t len, struct dom_string **str); + dom_string_charset charset, const uint8_t *ptr, size_t len, + struct dom_string **str); /* Get a pointer to the string of characters within a DOM string */ dom_exception dom_string_get_data(struct dom_string *str, diff --git a/src/core/document.c b/src/core/document.c index 5148224..e188868 100644 --- a/src/core/document.c +++ b/src/core/document.c @@ -56,6 +56,8 @@ struct dom_doc_nnm { struct dom_document { struct dom_node base; /**< Base node */ + dom_string_charset charset; /**< Charset of strings in document */ + struct dom_implementation *impl; /**< Owning implementation */ struct dom_doc_nl *nodelists; /**< List of active nodelists */ @@ -73,10 +75,11 @@ struct dom_document { /** * Create a Document * - * \param impl The DOM implementation owning the document - * \param alloc Memory (de)allocation function - * \param pw 
Pointer to client-specific private data - * \param doc Pointer to location to receive created document + * \param impl The DOM implementation owning the document + * \param charset The charset used for strings in the document + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data + * \param doc Pointer to location to receive created document * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion. * * ::impl will have its reference count increased. @@ -84,7 +87,8 @@ struct dom_document { * The returned document will already be referenced. */ dom_exception dom_document_create(struct dom_implementation *impl, - dom_alloc alloc, void *pw, struct dom_document **doc) + dom_string_charset charset, dom_alloc alloc, void *pw, + struct dom_document **doc) { static const char *names[DOM_NODE_TYPE_COUNT + 1] = { NULL, /* Unused */ @@ -110,6 +114,7 @@ dom_exception dom_document_create(struct dom_implementation *impl, return DOM_NO_MEM_ERR; /* Set up document allocation context - must be first */ + d->charset = charset; d->alloc = alloc; d->pw = pw; @@ -993,6 +998,35 @@ const uint8_t *dom_document_get_base(struct dom_document *doc) return NULL; } +/** + * Set the document buffer pointer + * + * \param doc Document to set buffer pointer of + * \param buffer Pointer to buffer + * \param buffer_len Length of buffer, in bytes + * + * By calling this, ownership of the buffer is transferred to the document. + * It should be called once per document node. 
+ */ +void dom_document_set_buffer(struct dom_document *doc, uint8_t *buffer, + size_t buffer_len) +{ + UNUSED(doc); + UNUSED(buffer); + UNUSED(buffer_len); +} + +/** + * Retrieve the character set used to encode strings in the document + * + * \param doc The document to get the charset of + * \return The charset in use + */ +dom_string_charset dom_document_get_charset(struct dom_document *doc) +{ + return doc->charset; +} + /** * (De)allocate memory with a document's context * diff --git a/src/core/document.h b/src/core/document.h index 367b1ec..5149f2e 100644 --- a/src/core/document.h +++ b/src/core/document.h @@ -12,12 +12,12 @@ #include #include +#include struct dom_document; struct dom_namednodemap; struct dom_node; struct dom_nodelist; -struct dom_string; /* Destroy a document */ void dom_document_destroy(struct dom_document *doc); @@ -25,6 +25,9 @@ void dom_document_destroy(struct dom_document *doc); /* Get base of document buffer */ const uint8_t *dom_document_get_base(struct dom_document *doc); +/* Get the document character set */ +dom_string_charset dom_document_get_charset(struct dom_document *doc); + /* (De)allocate memory */ void *dom_document_alloc(struct dom_document *doc, void *ptr, size_t size); diff --git a/src/core/implementation.c b/src/core/implementation.c index e37b27d..9738b7c 100644 --- a/src/core/implementation.c +++ b/src/core/implementation.c @@ -94,6 +94,7 @@ dom_exception dom_implementation_create_document_type( * \param qname The qualified name of the document element * \param doctype The type of document to create * \param doc Pointer to location to receive result + * \param charset The charset to use for strings in the document * \param alloc Memory (de)allocation function * \param pw Pointer to client-specific private data * \return DOM_NO_ERR on success, @@ -126,10 +127,11 @@ dom_exception dom_implementation_create_document( struct dom_string *namespace, struct dom_string *qname, struct dom_document_type *doctype, struct 
dom_document **doc, + dom_string_charset charset, dom_alloc alloc, void *pw) { return impl->create_document(impl, namespace, qname, doctype, doc, - alloc, pw); + charset, alloc, pw); } /** diff --git a/src/core/string.c b/src/core/string.c index d43c571..faa3c85 100644 --- a/src/core/string.c +++ b/src/core/string.c @@ -5,6 +5,7 @@ * Copyright 2007 John-Mark Bell */ +#include #include #include @@ -12,6 +13,8 @@ #include "core/document.h" #include "utils/utils.h" +#include "utils/utf8.h" +#include "utils/utf16.h" /** * A DOM string @@ -28,6 +31,8 @@ struct dom_string { DOM_STRING_PTR_NODOC } type; /**< String type */ + dom_string_charset charset; /**< Charset of string */ + union { uint8_t *ptr; const uint8_t *cptr; @@ -49,7 +54,8 @@ struct dom_string { }; static struct dom_string empty_string = { - .type = DOM_STRING_CONST_PTR, + .type = DOM_STRING_CONST_PTR, + .charset = DOM_STRING_UTF8, .data.ptr = NULL, .len = 0, .ctx.doc = NULL, @@ -116,6 +122,8 @@ dom_exception dom_string_create_from_off(struct dom_document *doc, ret->type = DOM_STRING_OFFSET; + ret->charset = dom_document_get_charset(doc); + ret->data.offset = off; ret->len = len; @@ -161,6 +169,8 @@ dom_exception dom_string_create_from_ptr(struct dom_document *doc, ret->type = DOM_STRING_PTR; + ret->charset = dom_document_get_charset(doc); + memcpy(ret->data.ptr, ptr, len); ret->len = len; @@ -200,6 +210,8 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, ret->type = DOM_STRING_CONST_PTR; + ret->charset = dom_document_get_charset(doc); + ret->data.cptr = ptr; ret->len = len; @@ -217,11 +229,12 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, * Create a DOM string from a string of characters that does not belong * to a document * - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data - * \param ptr Pointer to string of characters - * \param len Length, in bytes, of string of characters - * \param str Pointer to 
location to receive result + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data + * \param charset The charset of the string + * \param ptr Pointer to string of characters + * \param len Length, in bytes, of string of characters + * \param str Pointer to location to receive result * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion * * The returned string will already be referenced, so there is no need @@ -231,7 +244,8 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, * returned DOM string. */ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, - const uint8_t *ptr, size_t len, struct dom_string **str) + dom_string_charset charset, const uint8_t *ptr, size_t len, + struct dom_string **str) { struct dom_string *ret; @@ -247,6 +261,8 @@ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, ret->type = DOM_STRING_PTR_NODOC; + ret->charset = charset; + memcpy(ret->data.ptr, ptr, len); ret->len = len; @@ -324,10 +340,35 @@ int dom_string_cmp(struct dom_string *s1, struct dom_string *s2) if (err != DOM_NO_ERR) return 1; /* arbitrary */ - if (l1 != l2) - return 1; /* arbitrary */ + while (l1 > 0 && l2 > 0) { + uint32_t c1, c2; + size_t cl1, cl2; + charset_error err; + + err = (s1->charset == DOM_STRING_UTF8) + ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) + : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1); + if (err != CHARSET_OK) + return 1; /* arbitrary; c1/cl1 are unset on failure */ + + err = (s2->charset == DOM_STRING_UTF8) + ? 
_dom_utf8_to_ucs4(d2, l2, &c2, &cl2) + : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2); + if (err != CHARSET_OK) + return 1; /* arbitrary; c2/cl2 are unset on failure */ + + if (c1 != c2) { + return (int)(c1 - c2); + } - return strncmp((const char *) d1, (const char *) d2, l1); + d1 += cl1; + d2 += cl2; + + l1 -= cl1; + l2 -= cl2; + } + + return (int)(l1 - l2); } /** @@ -354,9 +395,35 @@ int dom_string_icmp(struct dom_string *s1, struct dom_string *s2) if (err != DOM_NO_ERR) return 1; /* arbitrary */ - if (l1 != l2) - return 1; /* arbitrary */ + while (l1 > 0 && l2 > 0) { + uint32_t c1, c2; + size_t cl1, cl2; + charset_error err; + + err = (s1->charset == DOM_STRING_UTF8) + ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) + : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1); + if (err != CHARSET_OK) + return 1; /* arbitrary; c1/cl1 are unset on failure */ + + err = (s2->charset == DOM_STRING_UTF8) + ? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2) + : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2); + if (err != CHARSET_OK) + return 1; /* arbitrary; c2/cl2 are unset on failure */ + + /** \todo improved lower-casing algorithm; only ASCII is folded because tolower() is undefined above UCHAR_MAX */ + c1 = (c1 < 0x80) ? (uint32_t) tolower((int) c1) : c1; + c2 = (c2 < 0x80) ? (uint32_t) tolower((int) c2) : c2; + if (c1 != c2) return (int)(c1 - c2); + + d1 += cl1; + d2 += cl2; + + l1 -= cl1; + l2 -= cl2; + } - return strncasecmp((const char *) d1, (const char *) d2, l1); + return (int)(l1 - l2); } diff --git a/src/utils/Makefile b/src/utils/Makefile index 29369ae..ac87ded 100644 --- a/src/utils/Makefile +++ b/src/utils/Makefile @@ -22,7 +22,7 @@ CFLAGS += -I$(CURDIR) # Objects -OBJS = namespace +OBJS = namespace utf8 utf16 .PHONY: clean debug distclean export release setup test diff --git a/src/utils/charset_errors.h b/src/utils/charset_errors.h new file mode 100644 index 0000000..7571c06 --- /dev/null +++ b/src/utils/charset_errors.h @@ -0,0 +1,19 @@ +/* + * This file is part of libdom. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef dom_utils_charset_errors_h_ +#define dom_utils_charset_errors_h_ + +typedef enum { + CHARSET_OK, /**< No error */ + CHARSET_BADPARM, /**< Bad parameters to argument */ + CHARSET_NEEDDATA, /**< Insufficient data for operation */ + CHARSET_INVALID /**< Invalid input data */ +} charset_error; + +#endif + diff --git a/src/utils/namespace.c b/src/utils/namespace.c index 25b56ee..8a53e45 100644 --- a/src/utils/namespace.c +++ b/src/utils/namespace.c @@ -29,6 +29,7 @@ dom_exception _dom_namespace_initialise(dom_alloc alloc, void *pw) dom_exception err; err = dom_string_create_from_ptr_no_doc(alloc, pw, + DOM_STRING_UTF8, (const uint8_t *) "http://www.w3.org/XML/1998/namespace", SLEN("http://www.w3.org/XML/1998/namespace"), &xml); @@ -37,6 +38,7 @@ dom_exception _dom_namespace_initialise(dom_alloc alloc, void *pw) } err = dom_string_create_from_ptr_no_doc(alloc, pw, + DOM_STRING_UTF8, (const uint8_t *) "http://www.w3.org/2000/xmlns", SLEN("http://www.w3.org/2000/xmlns"), &xmlns); diff --git a/src/utils/utf16.c b/src/utils/utf16.c new file mode 100644 index 0000000..8917328 --- /dev/null +++ b/src/utils/utf16.c @@ -0,0 +1,239 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/** \file + * UTF-16 manipulation functions (implementation). 
+ */ + +#include +#include +#include + +#include "utils/utf16.h" + +/** + * Convert a UTF-16 sequence into a single UCS4 character + * + * \param s The sequence to process + * \param len Length of sequence + * \param ucs4 Pointer to location to receive UCS4 character (host endian) + * \param clen Pointer to location to receive byte length of UTF-16 sequence + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf16_to_ucs4(const uint8_t *s, size_t len, + uint32_t *ucs4, size_t *clen) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || ucs4 == NULL || clen == NULL) + return CHARSET_BADPARM; + + if (len < 2) + return CHARSET_NEEDDATA; + + if (*ss < 0xD800 || *ss > 0xDFFF) { + *ucs4 = *ss; + *clen = 2; + } else if (*ss > 0xDBFF) { + return CHARSET_INVALID; /* lone low surrogate */ + } else if (len < 4) { + return CHARSET_NEEDDATA; + } else if (ss[1] < 0xDC00 || ss[1] > 0xDFFF) { + return CHARSET_INVALID; + } else { + /* Combine the 10 payload bits of each surrogate */ + *ucs4 = 0x10000 + (((uint32_t) (ss[0] & 0x3ff) << 10) | + (uint32_t) (ss[1] & 0x3ff)); + *clen = 4; + } + + return CHARSET_OK; +} + +/** + * Convert a single UCS4 character into a UTF-16 sequence + * + * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) + * \param s Pointer to 4 byte long output buffer + * \param len Pointer to location to receive length of multibyte sequence + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf16_from_ucs4(uint32_t ucs4, uint8_t *s, + size_t *len) +{ + uint16_t *ss = (uint16_t *) (void *) s; + uint32_t l = 0; + + if (s == NULL || len == NULL) + return CHARSET_BADPARM; + else if (ucs4 < 0x10000) { + *ss = (uint16_t) ucs4; + l = 2; + } else if (ucs4 < 0x110000) { + ss[0] = (uint16_t) (0xD800 | ((ucs4 - 0x10000) >> 10)); + ss[1] = (uint16_t) (0xDC00 | (ucs4 & 0x3ff)); + l = 4; + } else { + return CHARSET_INVALID; + } + + *len = l; + + return CHARSET_OK; +} + +/** + * Calculate the length (in characters) of a bounded UTF-16 string + * + * \param 
s The string + * \param max Maximum length + * \param len Pointer to location to receive length of string + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf16_length(const uint8_t *s, size_t max, + size_t *len) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + const uint16_t *end = (const uint16_t *) (const void *) (s + max); + int l = 0; + + if (s == NULL || len == NULL) + return CHARSET_BADPARM; + + while (ss < end) { + if (*ss < 0xD800 || 0xDFFF < *ss) + ss++; + else + ss += 2; + + l++; + } + + *len = l; + + return CHARSET_OK; +} + +/** + * Calculate the length (in bytes) of a UTF-16 character + * + * \param s Pointer to start of character + * \param len Pointer to location to receive length + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf16_char_byte_length(const uint8_t *s, + size_t *len) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || len == NULL) + return CHARSET_BADPARM; + + if (*ss < 0xD800 || 0xDFFF < *ss) + *len = 2; + else + *len = 4; + + return CHARSET_OK; +} + +/** + * Find previous legal UTF-16 char in string + * + * \param s The string + * \param off Offset in the string to start at + * \param prevoff Pointer to location to receive offset of first byte of + * previous legal character + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf16_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || prevoff == NULL) + return CHARSET_BADPARM; + + if (off < 2) + *prevoff = 0; + else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF) + *prevoff = off - 2; + else + *prevoff = (off < 4) ? 
0 : off - 4; + + return CHARSET_OK; +} + +/** + * Find next legal UTF-16 char in string + * + * \param s The string (assumed valid) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf16_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || off >= len || nextoff == NULL) + return CHARSET_BADPARM; + + if (len - off < 4) + *nextoff = len; + else if (ss[1] < 0xD800 || ss[1] > 0xDBFF) + *nextoff = off + 2; + else + *nextoff = (len - off < 6) ? len : off + 4; + + return CHARSET_OK; +} + +/** + * Find next legal UTF-16 char in string + * + * \param s The string (assumed to be of dubious validity) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf16_next_paranoid(const uint8_t *s, + uint32_t len, uint32_t off, uint32_t *nextoff) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || off >= len || nextoff == NULL) + return CHARSET_BADPARM; + + while (1) { + if (len - off < 4) { + return CHARSET_NEEDDATA; + } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) { + *nextoff = off + 2; + break; + } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) { + if (len - off < 6) + return CHARSET_NEEDDATA; + + if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) { + *nextoff = off + 4; + break; + } else { + ss++; + off += 2; + } + } + } + + return CHARSET_OK; +} + diff --git a/src/utils/utf16.h b/src/utils/utf16.h new file mode 100644 index 0000000..7b9e15f --- /dev/null +++ b/src/utils/utf16.h @@ -0,0 +1,38 @@ +/* + * This 
file is part of libdom. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/** \file + * UTF-16 manipulation functions (interface). + */ + +#ifndef dom_utils_utf16_h_ +#define dom_utils_utf16_h_ + +#include + +#include "utils/charset_errors.h" + +inline charset_error _dom_utf16_to_ucs4(const uint8_t *s, size_t len, + uint32_t *ucs4, size_t *clen); +inline charset_error _dom_utf16_from_ucs4(uint32_t ucs4, uint8_t *s, + size_t *len); + +inline charset_error _dom_utf16_length(const uint8_t *s, size_t max, + size_t *len); +inline charset_error _dom_utf16_char_byte_length(const uint8_t *s, + size_t *len); + +inline charset_error _dom_utf16_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff); +inline charset_error _dom_utf16_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff); + +inline charset_error _dom_utf16_next_paranoid(const uint8_t *s, + uint32_t len, uint32_t off, uint32_t *nextoff); + +#endif + diff --git a/src/utils/utf8.c b/src/utils/utf8.c new file mode 100644 index 0000000..b80f04e --- /dev/null +++ b/src/utils/utf8.c @@ -0,0 +1,368 @@ +/* + * This file is part of libdom. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/** \file + * UTF-8 manipulation functions (implementation). 
+ */ + +#include +#include +#include + +#include "utils/utf8.h" + +/** Number of continuation bytes for a given start byte */ +static const uint8_t numContinuations[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, +}; + +/** + * Convert a UTF-8 multibyte sequence into a single UCS4 character + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This function conforms to RFC2279, however. 
+ * + * \param s The sequence to process + * \param len Length of sequence + * \param ucs4 Pointer to location to receive UCS4 character (host endian) + * \param clen Pointer to location to receive byte length of UTF-8 sequence + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf8_to_ucs4(const uint8_t *s, size_t len, + uint32_t *ucs4, size_t *clen) +{ + if (s == NULL || ucs4 == NULL || clen == NULL) + return CHARSET_BADPARM; + + if (len == 0) + return CHARSET_NEEDDATA; + + if (*s < 0x80) { + *ucs4 = *s; + *clen = 1; + } else if ((*s & 0xE0) == 0xC0) { + if (len < 2) + return CHARSET_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80) + return CHARSET_INVALID; + else { + *ucs4 = ((*s & 0x1F) << 6) | (*(s+1) & 0x3F); + *clen = 2; + } + } else if ((*s & 0xF0) == 0xE0) { + if (len < 3) + return CHARSET_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80 || + (*(s+2) & 0xC0) != 0x80) + return CHARSET_INVALID; + else { + *ucs4 = ((*s & 0x0F) << 12) | + ((*(s+1) & 0x3F) << 6) | + (*(s+2) & 0x3F); + *clen = 3; + } + } else if ((*s & 0xF8) == 0xF0) { + if (len < 4) + return CHARSET_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80 || + (*(s+2) & 0xC0) != 0x80 || + (*(s+3) & 0xC0) != 0x80) + return CHARSET_INVALID; + else { + *ucs4 = ((*s & 0x07) << 18) | /* 3 payload bits in lead byte */ + ((*(s+1) & 0x3F) << 12) | + ((*(s+2) & 0x3F) << 6) | + (*(s+3) & 0x3F); + *clen = 4; + } + } else if ((*s & 0xFC) == 0xF8) { + if (len < 5) + return CHARSET_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80 || + (*(s+2) & 0xC0) != 0x80 || + (*(s+3) & 0xC0) != 0x80 || + (*(s+4) & 0xC0) != 0x80) + return CHARSET_INVALID; + else { + *ucs4 = ((*s & 0x03) << 24) | /* 2 payload bits in lead byte */ + ((*(s+1) & 0x3F) << 18) | + ((*(s+2) & 0x3F) << 12) | + ((*(s+3) & 0x3F) << 6) | + (*(s+4) & 0x3F); + *clen = 5; + } + } else if ((*s & 0xFE) == 0xFC) { + if (len < 6) + return CHARSET_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80 || + (*(s+2) & 0xC0) != 0x80 || + (*(s+3) & 0xC0) != 0x80 || + (*(s+4) & 0xC0) != 0x80 || + (*(s+5) & 0xC0) != 
0x80) + return CHARSET_INVALID; + else { + *ucs4 = ((uint32_t) (*s & 0x01) << 30) | /* 1 payload bit in lead byte */ + ((*(s+1) & 0x3F) << 24) | + ((*(s+2) & 0x3F) << 18) | + ((*(s+3) & 0x3F) << 12) | + ((*(s+4) & 0x3F) << 6) | + (*(s+5) & 0x3F); + *clen = 6; + } + } else { + return CHARSET_INVALID; + } + + return CHARSET_OK; +} + +/** + * Convert a single UCS4 character into a UTF-8 multibyte sequence + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This function conforms to RFC2279, however. + * + * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) + * \param s Pointer to 6 byte long output buffer + * \param len Pointer to location to receive length of multibyte sequence + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf8_from_ucs4(uint32_t ucs4, uint8_t *s, + size_t *len) +{ + uint32_t l = 0; + + if (s == NULL || len == NULL) + return CHARSET_BADPARM; + else if (ucs4 < 0x80) { + *s = (uint8_t) ucs4; + l = 1; + } else if (ucs4 < 0x800) { + *s = 0xC0 | ((ucs4 >> 6) & 0x1F); + *(s+1) = 0x80 | (ucs4 & 0x3F); + l = 2; + } else if (ucs4 < 0x10000) { + *s = 0xE0 | ((ucs4 >> 12) & 0xF); + *(s+1) = 0x80 | ((ucs4 >> 6) & 0x3F); + *(s+2) = 0x80 | (ucs4 & 0x3F); + l = 3; + } else if (ucs4 < 0x200000) { + *s = 0xF0 | ((ucs4 >> 18) & 0x7); + *(s+1) = 0x80 | ((ucs4 >> 12) & 0x3F); + *(s+2) = 0x80 | ((ucs4 >> 6) & 0x3F); + *(s+3) = 0x80 | (ucs4 & 0x3F); + l = 4; + } else if (ucs4 < 0x4000000) { + *s = 0xF8 | ((ucs4 >> 24) & 0x3); + *(s+1) = 0x80 | ((ucs4 >> 18) & 0x3F); + *(s+2) = 0x80 | ((ucs4 >> 12) & 0x3F); + *(s+3) = 0x80 | ((ucs4 >> 6) & 0x3F); + *(s+4) = 0x80 | (ucs4 & 0x3F); + l = 5; + } else if (ucs4 <= 0x7FFFFFFF) { + *s = 0xFC | ((ucs4 >> 30) & 0x1); + *(s+1) = 0x80 | ((ucs4 >> 24) & 0x3F); + *(s+2) = 0x80 | ((ucs4 >> 18) & 0x3F); + *(s+3) = 0x80 | ((ucs4 >> 12) & 0x3F); + *(s+4) = 0x80 | ((ucs4 >> 6) & 0x3F); + *(s+5) = 0x80 | (ucs4 & 0x3F); + l = 6; + } else { + return CHARSET_INVALID; + } + + 
*len = l; + + return CHARSET_OK; +} + +/** + * Calculate the length (in characters) of a bounded UTF-8 string + * + * \param s The string + * \param max Maximum length + * \param len Pointer to location to receive length of string + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf8_length(const uint8_t *s, size_t max, + size_t *len) +{ + const uint8_t *end = s + max; + int l = 0; + + if (s == NULL || len == NULL) + return CHARSET_BADPARM; + + while (s < end) { + if ((*s & 0x80) == 0x00) + s += 1; + else if ((*s & 0xE0) == 0xC0) + s += 2; + else if ((*s & 0xF0) == 0xE0) + s += 3; + else if ((*s & 0xF8) == 0xF0) + s += 4; + else if ((*s & 0xFC) == 0xF8) + s += 5; + else if ((*s & 0xFE) == 0xFC) + s += 6; + else + return CHARSET_INVALID; + l++; + } + + *len = l; + + return CHARSET_OK; +} + +/** + * Calculate the length (in bytes) of a UTF-8 character + * + * \param s Pointer to start of character + * \param len Pointer to location to receive length + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf8_char_byte_length(const uint8_t *s, + size_t *len) +{ + if (s == NULL || len == NULL) + return CHARSET_BADPARM; + + *len = numContinuations[s[0]] + 1 /* Start byte */; + + return CHARSET_OK; +} + +/** + * Find previous legal UTF-8 char in string + * + * \param s The string + * \param off Offset in the string to start at + * \param prevoff Pointer to location to receive offset of first byte of + * previous legal character + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf8_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff) +{ + if (s == NULL || prevoff == NULL) + return CHARSET_BADPARM; + + while (off != 0 && (s[--off] & 0xC0) == 0x80) + /* do nothing */; + + *prevoff = off; + + return CHARSET_OK; +} + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed valid) + * \param len Maximum offset in 
string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf8_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff) +{ + if (s == NULL || off >= len || nextoff == NULL) + return CHARSET_BADPARM; + + /* Skip current start byte (if present - may be mid-sequence) */ + if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) + off++; + + while (off < len && (s[off] & 0xC0) == 0x80) + off++; + + *nextoff = off; + + return CHARSET_OK; +} + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed to be of dubious validity) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return CHARSET_OK on success, appropriate error otherwise + */ +inline charset_error _dom_utf8_next_paranoid(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff) +{ + bool valid; + + if (s == NULL || off >= len || nextoff == NULL) + return CHARSET_BADPARM; + + /* Skip current start byte (if present - may be mid-sequence) */ + if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) + off++; + + while (1) { + /* Find next possible start byte */ + while (off < len && (s[off] & 0xC0) == 0x80) + off++; + + /* Ran off end of data */ + if (off == len || off + numContinuations[s[off]] >= len) + return CHARSET_NEEDDATA; + + /* Found if start byte is ascii, + * or next n bytes are valid continuations */ + valid = true; + + switch (numContinuations[s[off]]) { + case 5: + valid &= ((s[off + 5] & 0xC0) == 0x80); + case 4: + valid &= ((s[off + 4] & 0xC0) == 0x80); + case 3: + valid &= ((s[off + 3] & 0xC0) == 0x80); + case 2: + valid &= ((s[off + 2] & 0xC0) == 0x80); + case 1: + valid &= ((s[off + 1] & 0xC0) == 0x80); + case 0: + valid &= (s[off + 0] < 
0x80); + } + + if (valid) + break; + + /* Otherwise, skip this (invalid) start byte and try again */ + off++; + } + + *nextoff = off; + + return CHARSET_OK; +} + diff --git a/src/utils/utf8.h b/src/utils/utf8.h new file mode 100644 index 0000000..154dbb8 --- /dev/null +++ b/src/utils/utf8.h @@ -0,0 +1,38 @@ +/* + * This file is part of libdom. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/** \file + * UTF-8 manipulation functions (interface). + */ + +#ifndef dom_utils_utf8_h_ +#define dom_utils_utf8_h_ + +#include + +#include "utils/charset_errors.h" + +inline charset_error _dom_utf8_to_ucs4(const uint8_t *s, size_t len, + uint32_t *ucs4, size_t *clen); +inline charset_error _dom_utf8_from_ucs4(uint32_t ucs4, uint8_t *s, + size_t *len); + +inline charset_error _dom_utf8_length(const uint8_t *s, size_t max, + size_t *len); +inline charset_error _dom_utf8_char_byte_length(const uint8_t *s, + size_t *len); + +inline charset_error _dom_utf8_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff); +inline charset_error _dom_utf8_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff); + +inline charset_error _dom_utf8_next_paranoid(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff); + +#endif + -- cgit v1.2.3