From 3479055b4a609032a1775871cc685fd7dd33ab32 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Tue, 3 Mar 2009 18:08:01 +0000 Subject: Rationalise dom_string (some consideration is required as to what happens wrt interning -- lwc_strings should probably be used) Purge charset handling -- a) documents are always converted to utf-8 b) use parserutils for utf-8 handling Fix Hubbub binding to compile. svn path=/trunk/dom/; revision=6682 --- bindings/hubbub/parser.c | 64 ++-- bindings/xml/xmlbinding.c | 5 +- bindings/xml/xmlparser.c | 36 +-- include/dom/bootstrap/implpriv.h | 5 +- include/dom/core/document.h | 3 + include/dom/core/implementation.h | 1 - include/dom/core/string.h | 19 +- src/core/attr.c | 2 +- src/core/document.c | 113 ++----- src/core/document.h | 6 - src/core/implementation.c | 4 +- src/core/node.c | 21 +- src/core/string.c | 632 ++++++++------------------------------ src/utils/Makefile | 2 +- src/utils/namespace.c | 13 +- src/utils/utf16.c | 239 -------------- src/utils/utf16.h | 38 --- src/utils/utf8.c | 368 ---------------------- src/utils/utf8.h | 38 --- 19 files changed, 202 insertions(+), 1407 deletions(-) delete mode 100644 src/utils/utf16.c delete mode 100644 src/utils/utf16.h delete mode 100644 src/utils/utf8.c delete mode 100644 src/utils/utf8.h diff --git a/bindings/hubbub/parser.c b/bindings/hubbub/parser.c index 9473438..7b5e6ab 100644 --- a/bindings/hubbub/parser.c +++ b/bindings/hubbub/parser.c @@ -20,7 +20,6 @@ */ struct dom_hubbub_parser { hubbub_parser *parser; /**< Hubbub parser instance */ - const uint8_t *buffer; /**< Parser buffer pointer */ struct dom_document *doc; /**< DOM Document we're building */ @@ -35,9 +34,8 @@ struct dom_hubbub_parser { void *mctx; /**< Pointer to client data */ }; -static void __dom_hubbub_buffer_handler(const uint8_t *buffer, size_t len, +static hubbub_error __dom_hubbub_token_handler(const hubbub_token *token, void *pw); -static void __dom_hubbub_token_handler(const hubbub_token *token, void *pw); 
static bool __initialised; @@ -63,6 +61,8 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases, dom_exception err; hubbub_error e; + UNUSED(int_enc); + if (__initialised == false) { e = hubbub_initialise(aliases, (hubbub_alloc) alloc, pw); if (e != HUBBUB_OK) { @@ -80,23 +80,11 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases, return NULL; } - parser->parser = hubbub_parser_create(enc, int_enc, - (hubbub_alloc) alloc, pw); - if (parser->parser == NULL) { - alloc(parser, 0, pw); - msg(DOM_MSG_CRITICAL, mctx, "Failed to create hubbub parser"); - return NULL; - } - - params.buffer_handler.handler = __dom_hubbub_buffer_handler; - params.buffer_handler.pw = parser; - e = hubbub_parser_setopt(parser->parser, HUBBUB_PARSER_BUFFER_HANDLER, - &params); + e = hubbub_parser_create(enc, true, (hubbub_alloc) alloc, pw, + &parser->parser); if (e != HUBBUB_OK) { - hubbub_parser_destroy(parser->parser); alloc(parser, 0, pw); - msg(DOM_MSG_CRITICAL, mctx, - "Failed registering hubbub buffer handler"); + msg(DOM_MSG_CRITICAL, mctx, "Failed to create hubbub parser"); return NULL; } @@ -118,8 +106,7 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases, /* Get DOM implementation */ /* Create string representation of the features we want */ - err = dom_string_create_from_ptr_no_doc(alloc, pw, - DOM_STRING_UTF8, + err = dom_string_create(alloc, pw, (const uint8_t *) "HTML", SLEN("HTML"), &features); if (err != DOM_NO_ERR) { hubbub_parser_destroy(parser->parser); @@ -202,17 +189,7 @@ struct dom_document *dom_hubbub_parser_get_document(dom_hubbub_parser *parser) return (parser->complete ? 
parser->doc : NULL); } -void __dom_hubbub_buffer_handler(const uint8_t *buffer, size_t len, - void *pw) -{ - dom_hubbub_parser *parser = (dom_hubbub_parser *) pw; - - UNUSED(len); - - parser->buffer = buffer; -} - -void __dom_hubbub_token_handler(const hubbub_token *token, void *pw) +hubbub_error __dom_hubbub_token_handler(const hubbub_token *token, void *pw) { dom_hubbub_parser *parser = (dom_hubbub_parser *) pw; static const char *token_names[] = { @@ -221,55 +198,58 @@ void __dom_hubbub_token_handler(const hubbub_token *token, void *pw) }; size_t i; + UNUSED(parser); + printf("%s: ", token_names[token->type]); switch (token->type) { case HUBBUB_TOKEN_DOCTYPE: printf("'%.*s' (%svalid)\n", (int) token->data.doctype.name.len, - parser->buffer + - token->data.doctype.name.data_off, - token->data.doctype.correct ? "" : "in"); + token->data.doctype.name.ptr, + token->data.doctype.force_quirks ? "in" : ""); break; case HUBBUB_TOKEN_START_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - parser->buffer + token->data.tag.name.data_off, + token->data.tag.name.ptr, (token->data.tag.n_attributes > 0) ? "attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - parser->buffer + token->data.tag.attributes[i].name.data_off, + token->data.tag.attributes[i].name.ptr, (int) token->data.tag.attributes[i].value.len, - parser->buffer + token->data.tag.attributes[i].value.data_off); + token->data.tag.attributes[i].value.ptr); } break; case HUBBUB_TOKEN_END_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - parser->buffer + token->data.tag.name.data_off, + token->data.tag.name.ptr, (token->data.tag.n_attributes > 0) ? 
"attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - parser->buffer + token->data.tag.attributes[i].name.data_off, + token->data.tag.attributes[i].name.ptr, (int) token->data.tag.attributes[i].value.len, - parser->buffer + token->data.tag.attributes[i].value.data_off); + token->data.tag.attributes[i].value.ptr); } break; case HUBBUB_TOKEN_COMMENT: printf("'%.*s'\n", (int) token->data.comment.len, - parser->buffer + token->data.comment.data_off); + token->data.comment.ptr); break; case HUBBUB_TOKEN_CHARACTER: printf("'%.*s'\n", (int) token->data.character.len, - parser->buffer + token->data.character.data_off); + token->data.character.ptr); break; case HUBBUB_TOKEN_EOF: printf("\n"); break; } + + return HUBBUB_OK; } diff --git a/bindings/xml/xmlbinding.c b/bindings/xml/xmlbinding.c index 2bbfb7b..b03b7af 100644 --- a/bindings/xml/xmlbinding.c +++ b/bindings/xml/xmlbinding.c @@ -38,7 +38,6 @@ static dom_exception xml_dom_implementation_create_document( struct dom_string *qname, struct dom_document_type *doctype, struct dom_document **doc, - dom_string_charset charset, dom_alloc alloc, void *pw); static dom_exception xml_dom_implementation_get_feature( struct dom_implementation *impl, @@ -237,7 +236,6 @@ dom_exception xml_dom_implementation_create_document_type( * \param qname The qualified name of the document element * \param doctype The type of document to create * \param doc Pointer to location to receive result - * \param charset The charset to use for strings in the document * \param alloc Memory (de)allocation function * \param pw Pointer to client-specific private data * \return DOM_NO_ERR on success, @@ -274,14 +272,13 @@ dom_exception xml_dom_implementation_create_document( struct dom_string *qname, struct dom_document_type *doctype, struct dom_document **doc, - dom_string_charset charset, dom_alloc alloc, void *pw) { struct dom_document *d; dom_exception err; 
/* Create document object */ - err = dom_document_create(impl, charset, alloc, pw, &d); + err = dom_document_create(impl, alloc, pw, &d); if (err != DOM_NO_ERR) return err; diff --git a/bindings/xml/xmlparser.c b/bindings/xml/xmlparser.c index 743a826..9e3786f 100644 --- a/bindings/xml/xmlparser.c +++ b/bindings/xml/xmlparser.c @@ -181,8 +181,7 @@ dom_xml_parser *dom_xml_parser_create(const char *enc, const char *int_enc, parser->complete = false; /* Create key for user data registration */ - err = dom_string_create_from_ptr_no_doc((dom_alloc) alloc, pw, - DOM_STRING_UTF8, + err = dom_string_create((dom_alloc) alloc, pw, (const uint8_t *) "__xmlnode", SLEN("__xmlnode"), &parser->udkey); if (err != DOM_NO_ERR) { @@ -194,8 +193,7 @@ dom_xml_parser *dom_xml_parser_create(const char *enc, const char *int_enc, /* Get DOM implementation */ /* Create a string representation of the features we want */ - err = dom_string_create_from_ptr_no_doc((dom_alloc) alloc, pw, - DOM_STRING_UTF8, + err = dom_string_create((dom_alloc) alloc, pw, (const uint8_t *) "XML", SLEN("XML"), &features); if (err != DOM_NO_ERR) { dom_string_unref(parser->udkey); @@ -329,7 +327,6 @@ void xml_parser_start_document(void *ctx) /* qname */ NULL, /* doctype */ NULL, &doc, - DOM_STRING_UTF8, (dom_alloc) parser->alloc, parser->pw); if (err != DOM_NO_ERR) { @@ -650,9 +647,8 @@ void xml_parser_add_element_node(dom_xml_parser *parser, struct dom_string *tag_name; /* Create tag name DOM string */ - err = dom_string_create_from_const_ptr(parser->doc, - child->name, - strlen((const char *) child->name), + err = dom_document_create_string(parser->doc, + child->name, strlen((const char *) child->name), &tag_name); if (err != DOM_NO_ERR) { parser->msg(DOM_MSG_CRITICAL, parser->mctx, @@ -684,7 +680,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser, uint8_t qnamebuf[qnamelen + 1 /* '\0' */]; /* Create namespace DOM string */ - err = dom_string_create_from_const_ptr(parser->doc, + err = 
dom_document_create_string(parser->doc, child->ns->href, strlen((const char *) child->ns->href), &namespace); @@ -703,7 +699,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser, (const char *) child->name); /* Create qname DOM string */ - err = dom_string_create_from_ptr(parser->doc, + err = dom_document_create_string(parser->doc, qnamebuf, qnamelen, &qname); @@ -742,7 +738,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser, struct dom_string *name; /* Create attribute name DOM string */ - err = dom_string_create_from_const_ptr(parser->doc, + err = dom_document_create_string(parser->doc, a->name, strlen((const char *) a->name), &name); @@ -776,7 +772,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser, uint8_t qnamebuf[qnamelen + 1 /* '\0' */]; /* Create namespace DOM string */ - err = dom_string_create_from_const_ptr(parser->doc, + err = dom_document_create_string(parser->doc, a->ns->href, strlen((const char *) a->ns->href), &namespace); @@ -795,7 +791,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser, (const char *) a->name); /* Create qname DOM string */ - err = dom_string_create_from_ptr(parser->doc, + err = dom_document_create_string(parser->doc, qnamebuf, qnamelen, &qname); @@ -904,7 +900,7 @@ void xml_parser_add_text_node(dom_xml_parser *parser, struct dom_node *parent, dom_exception err; /* Create DOM string data for text node */ - err = dom_string_create_from_const_ptr(parser->doc, child->content, + err = dom_document_create_string(parser->doc, child->content, strlen((const char *) child->content), &data); if (err != DOM_NO_ERR) { parser->msg(DOM_MSG_CRITICAL, parser->mctx, @@ -965,7 +961,7 @@ void xml_parser_add_cdata_section(dom_xml_parser *parser, dom_exception err; /* Create DOM string data for cdata section */ - err = dom_string_create_from_const_ptr(parser->doc, child->content, + err = dom_document_create_string(parser->doc, child->content, strlen((const char *) child->content), &data); if (err != DOM_NO_ERR) { 
parser->msg(DOM_MSG_CRITICAL, parser->mctx, @@ -1027,7 +1023,7 @@ void xml_parser_add_entity_reference(dom_xml_parser *parser, dom_exception err; /* Create name of entity reference */ - err = dom_string_create_from_const_ptr(parser->doc, child->name, + err = dom_document_create_string(parser->doc, child->name, strlen((const char *) child->name), &name); if (err != DOM_NO_ERR) { parser->msg(DOM_MSG_CRITICAL, parser->mctx, @@ -1094,7 +1090,7 @@ void xml_parser_add_comment(dom_xml_parser *parser, struct dom_node *parent, dom_exception err; /* Create DOM string data for comment */ - err = dom_string_create_from_const_ptr(parser->doc, child->content, + err = dom_document_create_string(parser->doc, child->content, strlen((const char *) child->content), &data); if (err != DOM_NO_ERR) { parser->msg(DOM_MSG_CRITICAL, parser->mctx, @@ -1156,7 +1152,7 @@ void xml_parser_add_document_type(dom_xml_parser *parser, dom_exception err; /* Create qname for doctype */ - err = dom_string_create_from_const_ptr(parser->doc, dtd->name, + err = dom_document_create_string(parser->doc, dtd->name, strlen((const char *) dtd->name), &qname); if (err != DOM_NO_ERR) { parser->msg(DOM_MSG_CRITICAL, parser->mctx, @@ -1165,7 +1161,7 @@ void xml_parser_add_document_type(dom_xml_parser *parser, } /* Create public ID for doctype */ - err = dom_string_create_from_const_ptr(parser->doc, + err = dom_document_create_string(parser->doc, dtd->ExternalID, (dtd->ExternalID == NULL) ? 0 : strlen((const char *) dtd->ExternalID), @@ -1178,7 +1174,7 @@ void xml_parser_add_document_type(dom_xml_parser *parser, } /* Create system ID for doctype */ - err = dom_string_create_from_const_ptr(parser->doc, + err = dom_document_create_string(parser->doc, dtd->SystemID, (dtd->SystemID == NULL) ? 
0 : strlen((const char *) dtd->SystemID), diff --git a/include/dom/bootstrap/implpriv.h b/include/dom/bootstrap/implpriv.h index 97806a8..c99a9d2 100644 --- a/include/dom/bootstrap/implpriv.h +++ b/include/dom/bootstrap/implpriv.h @@ -94,7 +94,6 @@ struct dom_implementation { * \param qname The qualified name of the document element * \param doctype The type of document to create * \param doc Pointer to location to receive result - * \param charset The charset to use for strings in the document * \param alloc Memory (de)allocation function * \param pw Pointer to client-specific private data * \return DOM_NO_ERR on success, @@ -130,7 +129,6 @@ struct dom_implementation { struct dom_string *qname, struct dom_document_type *doctype, struct dom_document **doc, - dom_string_charset charset, dom_alloc alloc, void *pw); /** @@ -251,8 +249,7 @@ dom_exception dom_register_source(struct dom_implementation_source *source, /* Create a DOM document */ dom_exception dom_document_create(struct dom_implementation *impl, - dom_string_charset charset, dom_alloc alloc, void *pw, - struct dom_document **doc); + dom_alloc alloc, void *pw, struct dom_document **doc); /* Set a document's buffer */ void dom_document_set_buffer(struct dom_document *doc, uint8_t *buffer, diff --git a/include/dom/core/document.h b/include/dom/core/document.h index cce8e4b..6a5fd9f 100644 --- a/include/dom/core/document.h +++ b/include/dom/core/document.h @@ -9,6 +9,7 @@ #define dom_core_document_h_ #include +#include #include @@ -98,5 +99,7 @@ dom_exception dom_document_rename_node(struct dom_document *doc, struct dom_node *node, struct dom_string *namespace, struct dom_string *qname, struct dom_node **result); +dom_exception dom_document_create_string(struct dom_document *doc, + const uint8_t *data, size_t len, struct dom_string **result); #endif diff --git a/include/dom/core/implementation.h b/include/dom/core/implementation.h index a51493f..5e26432 100644 --- a/include/dom/core/implementation.h +++ 
b/include/dom/core/implementation.h @@ -37,7 +37,6 @@ dom_exception dom_implementation_create_document( struct dom_string *namespace, struct dom_string *qname, struct dom_document_type *doctype, struct dom_document **doc, - dom_string_charset charset, dom_alloc alloc, void *pw); dom_exception dom_implementation_get_feature( diff --git a/include/dom/core/string.h b/include/dom/core/string.h index e3dfa30..8da9dd7 100644 --- a/include/dom/core/string.h +++ b/include/dom/core/string.h @@ -14,33 +14,16 @@ #include #include -struct dom_document; struct dom_string; -typedef enum { - DOM_STRING_UTF8, - DOM_STRING_UTF16 -} dom_string_charset; - /* Claim a reference on a DOM string */ void dom_string_ref(struct dom_string *str); /* Release a reference on a DOM string */ void dom_string_unref(struct dom_string *str); -/* Create a DOM string from an offset into the document buffer */ -dom_exception dom_string_create_from_off(struct dom_document *doc, - uint32_t off, size_t len, struct dom_string **str); /* Create a DOM string from a string of characters */ -dom_exception dom_string_create_from_ptr(struct dom_document *doc, - const uint8_t *ptr, size_t len, struct dom_string **str); -/* Create a DOM string from a constant string of characters */ -dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, +dom_exception dom_string_create(dom_alloc alloc, void *pw, const uint8_t *ptr, size_t len, struct dom_string **str); -/* Create a DOM string from a string of characters that does not belong - * to a document */ -dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, - dom_string_charset charset, const uint8_t *ptr, size_t len, - struct dom_string **str); /* Case sensitively compare two DOM strings */ int dom_string_cmp(struct dom_string *s1, struct dom_string *s2); diff --git a/src/core/attr.c b/src/core/attr.c index a82f117..5a85ac0 100644 --- a/src/core/attr.c +++ b/src/core/attr.c @@ -180,7 +180,7 @@ dom_exception dom_attr_get_value(struct 
dom_attr *attr, struct dom_string *value, *temp; dom_exception err; - err = dom_string_create_from_const_ptr(a->owner, + err = dom_document_create_string(a->owner, (const uint8_t *) "", SLEN(""), &value); if (err != DOM_NO_ERR) { return err; diff --git a/src/core/document.c b/src/core/document.c index 3e06541..74283f9 100644 --- a/src/core/document.c +++ b/src/core/document.c @@ -56,8 +56,6 @@ struct dom_doc_nnm { struct dom_document { struct dom_node base; /**< Base node */ - dom_string_charset charset; /**< Charset of strings in document */ - struct dom_implementation *impl; /**< Owning implementation */ struct dom_doc_nl *nodelists; /**< List of active nodelists */ @@ -73,7 +71,6 @@ struct dom_document { /** Interned node name strings, indexed by node type */ /* Index 0 is unused */ static struct dom_string *__nodenames_utf8[DOM_NODE_TYPE_COUNT + 1]; -static struct dom_string *__nodenames_utf16[DOM_NODE_TYPE_COUNT + 1]; /** * Initialise the document module @@ -102,27 +99,6 @@ dom_exception _dom_document_initialise(dom_alloc alloc, void *pw) { "#document-fragment", 18 }, /* Document fragment */ { NULL, 0 } /* Notation */ }; - - /** \todo This assumes Little Endian */ - static struct { - const char *name; - size_t len; - } names_utf16[DOM_NODE_TYPE_COUNT + 1] = { - { NULL, 0 }, /* Unused */ - { NULL, 0 }, /* Element */ - { NULL, 0 }, /* Attr */ - { "#\0t\0e\0x\0t\0", 10 }, /* Text */ - { "#\0c\0d\0a\0t\0a\0-\0s\0e\0c\0t\0i\0o\0n\0", 28 }, /* CDATA section */ - { NULL, 0 }, /* Entity reference */ - { NULL, 0 }, /* Entity */ - { NULL, 0 }, /* Processing instruction */ - { "#\0c\0o\0m\0m\0e\0n\0t\0", 16 }, /* Comment */ - { "#\0d\0o\0c\0u\0m\0e\0n\0t\0", 18 }, /* Document */ - { NULL, 0 }, /* Document type */ - { "#\0d\0o\0c\0u\0m\0e\0n\0t\0-\0f\0r\0a\0g\0m\0e\0n\0t\0", 36 }, /* Document fragment */ - { NULL, 0 } /* Notation */ - }; - dom_exception err; /* Initialise interned node names */ @@ -130,13 +106,11 @@ dom_exception _dom_document_initialise(dom_alloc alloc, 
void *pw) if (names_utf8[i].name == NULL) { /* Nothing to intern; skip this entry */ __nodenames_utf8[i] = NULL; - __nodenames_utf16[i] = NULL; continue; } /* Make string */ - err = dom_string_create_from_ptr_no_doc(alloc, pw, - DOM_STRING_UTF8, + err = dom_string_create(alloc, pw, (const uint8_t *) names_utf8[i].name, names_utf8[i].len, &__nodenames_utf8[i]); if (err != DOM_NO_ERR) { @@ -144,29 +118,10 @@ dom_exception _dom_document_initialise(dom_alloc alloc, void *pw) for (int j = 0; j < i; j++) { if (__nodenames_utf8[j] != NULL) { dom_string_unref(__nodenames_utf8[j]); - dom_string_unref(__nodenames_utf16[j]); } } return err; } - - err = dom_string_create_from_ptr_no_doc(alloc, pw, - DOM_STRING_UTF16, - (const uint8_t *) names_utf16[i].name, - names_utf16[i].len, &__nodenames_utf16[i]); - if (err != DOM_NO_ERR) { - /* Failed, clean up strings we've created so far */ - for (int j = 0; j < i; j++) { - if (__nodenames_utf8[j] != NULL) { - dom_string_unref(__nodenames_utf8[j]); - dom_string_unref(__nodenames_utf16[j]); - } - } - - dom_string_unref(__nodenames_utf8[i]); - - return err; - } } return DOM_NO_ERR; @@ -182,7 +137,6 @@ dom_exception _dom_document_finalise(void) for (int i = 0; i <= DOM_NODE_TYPE_COUNT; i++) { if (__nodenames_utf8[i] != NULL) { dom_string_unref(__nodenames_utf8[i]); - dom_string_unref(__nodenames_utf16[i]); } } @@ -193,7 +147,6 @@ dom_exception _dom_document_finalise(void) * Create a Document * * \param impl The DOM implementation owning the document - * \param charset The charset used for strings in the document * \param alloc Memory (de)allocation function * \param pw Pointer to client-specific private data * \param doc Pointer to location to receive created document @@ -204,8 +157,7 @@ dom_exception _dom_document_finalise(void) * The returned document will already be referenced. 
*/ dom_exception dom_document_create(struct dom_implementation *impl, - dom_string_charset charset, dom_alloc alloc, void *pw, - struct dom_document **doc) + dom_alloc alloc, void *pw, struct dom_document **doc) { struct dom_document *d; dom_exception err; @@ -233,7 +185,6 @@ dom_exception dom_document_create(struct dom_implementation *impl, } /* Initialise remaining type-specific data */ - d->charset = charset; if (impl != NULL) dom_implementation_ref(impl); d->impl = impl; @@ -241,8 +192,7 @@ dom_exception dom_document_create(struct dom_implementation *impl, d->nodelists = NULL; d->maps = NULL; - d->nodenames = (charset == DOM_STRING_UTF8) ? __nodenames_utf8 - : __nodenames_utf16; + d->nodenames = __nodenames_utf8; *doc = d; @@ -1047,55 +997,30 @@ dom_exception dom_document_rename_node(struct dom_document *doc, return DOM_NOT_SUPPORTED_ERR; } -/* */ -/* ----------------------------------------------------------------------- */ -/* */ - /** - * Acquire a pointer to the base of the document buffer - * - * \param doc Document to retrieve pointer from - * \return Pointer to document buffer + * Create a DOM string, using a document's allocation context * - * The document buffer is _not_ reference counted (as it is an implicit part - * of the document). It is destroyed with the document, and thus after all - * users have been destroyed. - */ -const uint8_t *dom_document_get_base(struct dom_document *doc) -{ - UNUSED(doc); - - return NULL; -} - -/** - * Set the document buffer pointer + * \param doc The document + * \param data Pointer to string data + * \param len Length, in bytes, of string + * \param result Pointer to location to receive result + * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion * - * \param doc Document to set buffer pointer of - * \param buffer Pointer to buffer - * \param buffer_len Length of buffer, in bytes + * The returned string will already be referenced, so there is no need + * to explicitly reference it. 
* - * By calling this, ownership of the buffer is transferred to the document. - * It should be called once per document node. + * The string of characters passed in will be copied for use by the + * returned DOM string. */ -void dom_document_set_buffer(struct dom_document *doc, uint8_t *buffer, - size_t buffer_len) +dom_exception dom_document_create_string(struct dom_document *doc, + const uint8_t *data, size_t len, struct dom_string **result) { - UNUSED(doc); - UNUSED(buffer); - UNUSED(buffer_len); + return dom_string_create(doc->alloc, doc->pw, data, len, result); } -/** - * Retrieve the character set used to encode strings in the document - * - * \param doc The document to get the charset of - * \return The charset in use - */ -dom_string_charset dom_document_get_charset(struct dom_document *doc) -{ - return doc->charset; -} +/* */ +/* ----------------------------------------------------------------------- */ +/* */ /** * (De)allocate memory with a document's context diff --git a/src/core/document.h b/src/core/document.h index 6982b74..c5c13ac 100644 --- a/src/core/document.h +++ b/src/core/document.h @@ -27,12 +27,6 @@ dom_exception _dom_document_finalise(void); /* Destroy a document */ void dom_document_destroy(struct dom_document *doc); -/* Get base of document buffer */ -const uint8_t *dom_document_get_base(struct dom_document *doc); - -/* Get the document character set */ -dom_string_charset dom_document_get_charset(struct dom_document *doc); - /* (De)allocate memory */ void *dom_document_alloc(struct dom_document *doc, void *ptr, size_t size); diff --git a/src/core/implementation.c b/src/core/implementation.c index 9738b7c..e37b27d 100644 --- a/src/core/implementation.c +++ b/src/core/implementation.c @@ -94,7 +94,6 @@ dom_exception dom_implementation_create_document_type( * \param qname The qualified name of the document element * \param doctype The type of document to create * \param doc Pointer to location to receive result - * \param charset The 
charset to use for strings in the document * \param alloc Memory (de)allocation function * \param pw Pointer to client-specific private data * \return DOM_NO_ERR on success, @@ -127,11 +126,10 @@ dom_exception dom_implementation_create_document( struct dom_string *namespace, struct dom_string *qname, struct dom_document_type *doctype, struct dom_document **doc, - dom_string_charset charset, dom_alloc alloc, void *pw) { return impl->create_document(impl, namespace, qname, doctype, doc, - charset, alloc, pw); + alloc, pw); } /** diff --git a/src/core/node.c b/src/core/node.c index 2284e4f..0eebfb0 100644 --- a/src/core/node.c +++ b/src/core/node.c @@ -306,15 +306,8 @@ dom_exception dom_node_get_node_name(struct dom_node *node, struct dom_string *colon; dom_exception err; - /* ugh! */ - /** \todo Assumes little endian */ - err = dom_string_create_from_const_ptr(node->owner, - (const uint8_t *) ( - (dom_document_get_charset(node->owner) == - DOM_STRING_UTF8) ? ":" : ":\0"), - (dom_document_get_charset(node->owner) == - DOM_STRING_UTF8) ? 
1 : 2, - &colon); + err = dom_document_create_string(node->owner, + (const uint8_t *) ":", SLEN(":"), &colon); if (err != DOM_NO_ERR) { return err; } @@ -1639,7 +1632,7 @@ bool _dom_node_readonly(const struct dom_node *node) * \param previous Previous node in sibling list, or NULL if none * \param next Next node in sibling list, or NULL if none */ -inline void _dom_node_attach(struct dom_node *node, struct dom_node *parent, +void _dom_node_attach(struct dom_node *node, struct dom_node *parent, struct dom_node *previous, struct dom_node *next) { _dom_node_attach_range(node, node, parent, previous, next); @@ -1650,7 +1643,7 @@ inline void _dom_node_attach(struct dom_node *node, struct dom_node *parent, * * \param node The node to detach */ -inline void _dom_node_detach(struct dom_node *node) +void _dom_node_detach(struct dom_node *node) { _dom_node_detach_range(node, node); } @@ -1666,7 +1659,7 @@ inline void _dom_node_detach(struct dom_node *node) * * The range is assumed to be a linked list of sibling nodes. */ -inline void _dom_node_attach_range(struct dom_node *first, +void _dom_node_attach_range(struct dom_node *first, struct dom_node *last, struct dom_node *parent, struct dom_node *previous, @@ -1697,7 +1690,7 @@ inline void _dom_node_attach_range(struct dom_node *first, * * The range is assumed to be a linked list of sibling nodes. */ -inline void _dom_node_detach_range(struct dom_node *first, +void _dom_node_detach_range(struct dom_node *first, struct dom_node *last) { if (first->previous != NULL) @@ -1727,7 +1720,7 @@ inline void _dom_node_detach_range(struct dom_node *first, * we want to perform any special replacement-related behaviour * at a later date. 
*/ -inline void _dom_node_replace(struct dom_node *old, +void _dom_node_replace(struct dom_node *old, struct dom_node *replacement) { struct dom_node *first, *last; diff --git a/src/core/string.c b/src/core/string.c index 8ec44aa..2540e26 100644 --- a/src/core/string.c +++ b/src/core/string.c @@ -9,62 +9,37 @@ #include #include +#include + #include #include "core/document.h" #include "utils/utils.h" -#include "utils/utf8.h" -#include "utils/utf16.h" /** * A DOM string * - * DOM strings store either a pointer to allocated data, a pointer - * to constant data or an offset into a document buffer. - * - * They are reference counted so freeing is performed correctly. + * Strings are reference counted so destruction is performed correctly. */ struct dom_string { - enum { DOM_STRING_PTR, - DOM_STRING_CONST_PTR, - DOM_STRING_OFFSET, - DOM_STRING_PTR_NODOC - } type; /**< String type */ - - dom_string_charset charset; /**< Charset of string */ - - union { - uint8_t *ptr; - const uint8_t *cptr; - uint32_t offset; - } data; /**< Type-specific data */ + uint8_t *ptr; /**< Pointer to string data */ size_t len; /**< Byte length of string */ - union { - struct dom_document *doc; /**< Owning document */ - struct { - dom_alloc alloc; /**< Memory (de)allocation - * function */ - void *pw; /**< Client-specific data */ - } nodoc; - } ctx; /**< Allocation context */ + dom_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client-specific data */ uint32_t refcnt; /**< Reference count */ }; static struct dom_string empty_string = { - .type = DOM_STRING_CONST_PTR, - .charset = DOM_STRING_UTF8, - .data.ptr = NULL, + .ptr = NULL, .len = 0, - .ctx.doc = NULL, + .alloc = NULL, + .pw = NULL, .refcnt = 1 }; -static dom_exception __dom_string_get_data(struct dom_string *str, - const uint8_t **data, size_t *len); - /** * Claim a reference on a DOM string * @@ -86,155 +61,18 @@ void dom_string_ref(struct dom_string *str) void dom_string_unref(struct dom_string *str) { if 
(--str->refcnt == 0) { - if (str->type == DOM_STRING_PTR_NODOC) { - str->ctx.nodoc.alloc(str->data.ptr, 0, - str->ctx.nodoc.pw); - - str->ctx.nodoc.alloc(str, 0, str->ctx.nodoc.pw); - } else { - if (str->type == DOM_STRING_PTR) { - dom_document_alloc(str->ctx.doc, - str->data.ptr, 0); - } - - dom_document_alloc(str->ctx.doc, str, 0); + if (str->alloc != NULL) { + str->alloc(str->ptr, 0, str->pw); + str->alloc(str, 0, str->pw); } } } -/** - * Create a DOM string from an offset into the document buffer - * - * \param doc The document in which the string resides - * \param off Offset from start of document buffer - * \param len Length, in bytes, of string - * \param str Pointer to location to receive pointer to new string - * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion - * - * The returned string will already be referenced, so there is no need - * to explicitly reference it. - */ -dom_exception dom_string_create_from_off(struct dom_document *doc, - uint32_t off, size_t len, struct dom_string **str) -{ - struct dom_string *ret; - - ret = dom_document_alloc(doc, NULL, sizeof(struct dom_string)); - if (ret == NULL) - return DOM_NO_MEM_ERR; - - ret->type = DOM_STRING_OFFSET; - - ret->charset = dom_document_get_charset(doc); - - ret->data.offset = off; - - ret->len = len; - - ret->ctx.doc = doc; - - ret->refcnt = 1; - - *str = ret; - - return DOM_NO_ERR; -} - /** * Create a DOM string from a string of characters * - * \param doc The document in which the string resides - * \param ptr Pointer to string of characters - * \param len Length, in bytes, of string of characters - * \param str Pointer to location to receive pointer to new string - * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion - * - * The returned string will already be referenced, so there is no need - * to explicitly reference it. - * - * The string of characters passed in will be copied for use by the - * returned DOM string. 
- */ -dom_exception dom_string_create_from_ptr(struct dom_document *doc, - const uint8_t *ptr, size_t len, struct dom_string **str) -{ - struct dom_string *ret; - - ret = dom_document_alloc(doc, NULL, sizeof(struct dom_string)); - if (ret == NULL) - return DOM_NO_MEM_ERR; - - ret->data.ptr = dom_document_alloc(doc, NULL, len); - if (ret->data.ptr == NULL) { - dom_document_alloc(doc, ret, 0); - return DOM_NO_MEM_ERR; - } - - ret->type = DOM_STRING_PTR; - - ret->charset = dom_document_get_charset(doc); - - memcpy(ret->data.ptr, ptr, len); - - ret->len = len; - - ret->ctx.doc = doc; - - ret->refcnt = 1; - - *str = ret; - - return DOM_NO_ERR; -} - -/** - * Create a DOM string from a constant string of characters - * - * \param doc The document in which the string resides - * \param ptr Pointer to string of characters - * \param len Length, in bytes, of string of characters - * \param str Pointer to location to receive pointer to new string - * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion - * - * The returned string will already be referenced, so there is no need - * to explicitly reference it. - * - * The string of characters passed in will _not_ be copied for use by the - * returned DOM string. 
- */ -dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, - const uint8_t *ptr, size_t len, struct dom_string **str) -{ - struct dom_string *ret; - - ret = dom_document_alloc(doc, NULL, sizeof(struct dom_string)); - if (ret == NULL) - return DOM_NO_MEM_ERR; - - ret->type = DOM_STRING_CONST_PTR; - - ret->charset = dom_document_get_charset(doc); - - ret->data.cptr = ptr; - - ret->len = len; - - ret->ctx.doc = doc; - - ret->refcnt = 1; - - *str = ret; - - return DOM_NO_ERR; -} - -/** - * Create a DOM string from a string of characters that does not belong - * to a document - * * \param alloc Memory (de)allocation function * \param pw Pointer to client-specific private data - * \param charset The charset of the string * \param ptr Pointer to string of characters * \param len Length, in bytes, of string of characters * \param str Pointer to location to receive result @@ -243,12 +81,11 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, * The returned string will already be referenced, so there is no need * to explicitly reference it. * - * The string of characters passed in will be copied for use by the + * The string of characters passed in will be copied for use by the * returned DOM string. 
*/ -dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, - dom_string_charset charset, const uint8_t *ptr, size_t len, - struct dom_string **str) +dom_exception dom_string_create(dom_alloc alloc, void *pw, + const uint8_t *ptr, size_t len, struct dom_string **str) { struct dom_string *ret; @@ -256,22 +93,18 @@ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, if (ret == NULL) return DOM_NO_MEM_ERR; - ret->data.ptr = alloc(NULL, len, pw); - if (ret->data.ptr == NULL) { + ret->ptr = alloc(NULL, len, pw); + if (ret->ptr == NULL) { alloc(ret, 0, pw); return DOM_NO_MEM_ERR; } - ret->type = DOM_STRING_PTR_NODOC; - - ret->charset = charset; - - memcpy(ret->data.ptr, ptr, len); + memcpy(ret->ptr, ptr, len); ret->len = len; - ret->ctx.nodoc.alloc = alloc; - ret->ctx.nodoc.pw = pw; + ret->alloc = alloc; + ret->pw = pw; ret->refcnt = 1; @@ -291,48 +124,16 @@ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, */ int dom_string_cmp(struct dom_string *s1, struct dom_string *s2) { - const uint8_t *d1 = NULL; - const uint8_t *d2 = NULL; - size_t l1, l2; - dom_exception err; - - err = __dom_string_get_data(s1, &d1, &l1); - if (err != DOM_NO_ERR) - return 1; /* arbitrary */ + if (s1 == NULL) + s1 = &empty_string; - err = __dom_string_get_data(s2, &d2, &l2); - if (err != DOM_NO_ERR) - return 1; /* arbitrary */ + if (s2 == NULL) + s2 = &empty_string; - while (l1 > 0 && l2 > 0) { - uint32_t c1, c2; - size_t cl1, cl2; - charset_error err; - - err = (s1->charset == DOM_STRING_UTF8) - ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) - : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1); - if (err != CHARSET_OK) { - } + if (s1->len != s2->len) + return 1; - err = (s2->charset == DOM_STRING_UTF8) - ? 
_dom_utf8_to_ucs4(d2, l2, &c2, &cl2) - : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2); - if (err != CHARSET_OK) { - } - - if (c1 != c2) { - return (int)(c1 - c2); - } - - d1 += cl1; - d2 += cl2; - - l1 -= cl1; - l2 -= cl2; - } - - return (int)(l1 - l2); + return memcmp(s1->ptr, s2->ptr, s1->len); } /** @@ -349,31 +150,28 @@ int dom_string_icmp(struct dom_string *s1, struct dom_string *s2) const uint8_t *d1 = NULL; const uint8_t *d2 = NULL; size_t l1, l2; - dom_exception err; - err = __dom_string_get_data(s1, &d1, &l1); - if (err != DOM_NO_ERR) - return 1; /* arbitrary */ + if (s1 == NULL) + s1 = &empty_string; + if (s2 == NULL) + s2 = &empty_string; - err = __dom_string_get_data(s2, &d2, &l2); - if (err != DOM_NO_ERR) - return 1; /* arbitrary */ + d1 = s1->ptr; + d2 = s2->ptr; + l1 = s1->len; + l2 = s2->len; while (l1 > 0 && l2 > 0) { uint32_t c1, c2; size_t cl1, cl2; - charset_error err; + parserutils_error err; - err = (s1->charset == DOM_STRING_UTF8) - ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) - : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1); - if (err != CHARSET_OK) { + err = parserutils_charset_utf8_to_ucs4(d1, l1, &c1, &cl1); + if (err != PARSERUTILS_OK) { } - err = (s2->charset == DOM_STRING_UTF8) - ? 
_dom_utf8_to_ucs4(d2, l2, &c2, &cl2) - : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2); - if (err != CHARSET_OK) { + err = parserutils_charset_utf8_to_ucs4(d2, l2, &c2, &cl2); + if (err != PARSERUTILS_OK) { } /** \todo improved lower-casing algorithm */ @@ -403,20 +201,19 @@ uint32_t dom_string_index(struct dom_string *str, uint32_t chr) const uint8_t *s; size_t clen, slen; uint32_t c, index; - charset_error err; + parserutils_error err; - __dom_string_get_data(str, &s, &slen); + if (str == NULL) + str = &empty_string; + + s = str->ptr; + slen = str->len; index = 0; while (slen > 0) { - if (str->charset == DOM_STRING_UTF8) { - err = _dom_utf8_to_ucs4(s, slen, &c, &clen); - } else { - err = _dom_utf16_to_ucs4(s, slen, &c, &clen); - } - - if (err != CHARSET_OK) { + err = parserutils_charset_utf8_to_ucs4(s, slen, &c, &clen); + if (err != PARSERUTILS_OK) { return (uint32_t) -1; } @@ -444,28 +241,25 @@ uint32_t dom_string_rindex(struct dom_string *str, uint32_t chr) const uint8_t *s; size_t clen, slen; uint32_t c, index; - charset_error err; + parserutils_error err; + + if (str == NULL) + str = &empty_string; - __dom_string_get_data(str, &s, &slen); + s = str->ptr; + slen = str->len; index = dom_string_length(str); while (slen > 0) { - if (str->charset == DOM_STRING_UTF8) { - err = _dom_utf8_prev(s, slen, &clen); - if (err == CHARSET_OK) { - err = _dom_utf8_to_ucs4(s + clen, slen - clen, - &c, &clen); - } - } else { - err = _dom_utf16_prev(s, slen, &clen); - if (err == CHARSET_OK) { - err = _dom_utf16_to_ucs4(s + clen, slen - clen, - &c, &clen); - } + err = parserutils_charset_utf8_prev(s, slen, + (uint32_t *) &clen); + if (err == PARSERUTILS_OK) { + err = parserutils_charset_utf8_to_ucs4(s + clen, + slen - clen, &c, &clen); } - if (err != CHARSET_OK) { + if (err != PARSERUTILS_OK) { return (uint32_t) -1; } @@ -478,7 +272,6 @@ uint32_t dom_string_rindex(struct dom_string *str, uint32_t chr) } return (uint32_t) -1; - } /** @@ -489,20 +282,14 @@ uint32_t dom_string_rindex(struct 
dom_string *str, uint32_t chr) */ uint32_t dom_string_length(struct dom_string *str) { - const uint8_t *s; - size_t slen; - uint32_t clen; - charset_error err; - - __dom_string_get_data(str, &s, &slen); + size_t clen; + parserutils_error err; - if (str->charset == DOM_STRING_UTF8) { - err = _dom_utf8_length(s, slen, &clen); - } else { - err = _dom_utf16_length(s, slen, &clen); - } + if (str == NULL) + str = &empty_string; - if (err != CHARSET_OK) { + err = parserutils_charset_utf8_length(str->ptr, str->len, &clen); + if (err != PARSERUTILS_OK) { return 0; } @@ -527,60 +314,28 @@ dom_exception dom_string_concat(struct dom_string *s1, struct dom_string *s2, struct dom_string **result) { struct dom_string *concat; - const uint8_t *s; - size_t slen; - if (s1->type == DOM_STRING_PTR_NODOC) { - concat = s1->ctx.nodoc.alloc(NULL, - sizeof(struct dom_string), s1->ctx.nodoc.pw); - } else { - concat = dom_document_alloc(s1->ctx.doc, - NULL, sizeof(struct dom_string)); - } + concat = s1->alloc(NULL, sizeof(struct dom_string), s1->pw); if (concat == NULL) { return DOM_NO_MEM_ERR; } - /** \todo support attempted concatenation of mismatched charsets */ + concat->ptr = s1->alloc(NULL, s1->len + s2->len, s1->pw); + if (concat->ptr == NULL) { + s1->alloc(concat, 0, s1->pw); - if (s1->type == DOM_STRING_PTR_NODOC) { - concat->data.ptr = s1->ctx.nodoc.alloc(NULL, - s1->len + s2->len, s1->ctx.nodoc.pw); - } else { - concat->data.ptr = dom_document_alloc(s1->ctx.doc, - NULL, s1->len + s2->len); - } - if (concat->data.ptr == NULL) { - if (s1->type == DOM_STRING_PTR_NODOC) { - s1->ctx.nodoc.alloc(concat, 0, s1->ctx.nodoc.pw); - } else { - dom_document_alloc(s1->ctx.doc, concat, 0); - } return DOM_NO_MEM_ERR; } - concat->type = (s1->type == DOM_STRING_PTR_NODOC) - ? 
DOM_STRING_PTR_NODOC : DOM_STRING_PTR; - - concat->charset = s1->charset; - - __dom_string_get_data(s1, &s, &slen); + memcpy(concat->ptr, s1->ptr, s1->len); - memcpy(concat->data.ptr, s, slen); - - __dom_string_get_data(s2, &s, &slen); - - memcpy(concat->data.ptr + s1->len, s, slen); + memcpy(concat->ptr + s1->len, s2->ptr, s2->len); concat->len = s1->len + s2->len; - if (concat->type == DOM_STRING_PTR_NODOC) { - concat->ctx.nodoc.alloc = s1->ctx.nodoc.alloc; - concat->ctx.nodoc.pw = s1->ctx.nodoc.pw; - } else { - concat->ctx.doc = s1->ctx.doc; - } + concat->alloc = s1->alloc; + concat->pw = s1->pw; concat->refcnt = 1; @@ -607,12 +362,10 @@ dom_exception dom_string_concat(struct dom_string *s1, struct dom_string *s2, dom_exception dom_string_substr(struct dom_string *str, uint32_t i1, uint32_t i2, struct dom_string **result) { - const uint8_t *s; - size_t slen; + const uint8_t *s = str->ptr; + size_t slen = str->len; size_t b1, b2; - charset_error err; - - __dom_string_get_data(str, &s, &slen); + parserutils_error err; /* Initialise the byte index of the start to 0 */ b1 = 0; @@ -621,13 +374,9 @@ dom_exception dom_string_substr(struct dom_string *str, /* Calculate the byte index of the start */ while (i1 > 0) { - if (str->charset == DOM_STRING_UTF8) { - err = _dom_utf8_next(s, slen - b1, b1, &b1); - } else { - err = _dom_utf16_next(s, slen - b1, b1, &b1); - } - - if (err != CHARSET_OK) { + err = parserutils_charset_utf8_next(s, slen - b1, b1, + (uint32_t *) &b1); + if (err != PARSERUTILS_OK) { return DOM_NO_MEM_ERR; } @@ -639,13 +388,10 @@ dom_exception dom_string_substr(struct dom_string *str, /* Calculate the byte index of the end */ while (i2 > 0) { - if (str->charset == DOM_STRING_UTF8) { - err = _dom_utf8_next(s, slen - b2, b2, &b2); - } else { - err = _dom_utf16_next(s, slen - b2, b2, &b2); - } + err = parserutils_charset_utf8_next(s, slen - b2, b2, + (uint32_t *) &b2); - if (err != CHARSET_OK) { + if (err != PARSERUTILS_OK) { return DOM_NO_MEM_ERR; } @@ 
-653,14 +399,7 @@ dom_exception dom_string_substr(struct dom_string *str, } /* Create a string from the specified byte range */ - return (str->type == DOM_STRING_PTR_NODOC) - ? dom_string_create_from_ptr_no_doc( - str->ctx.nodoc.alloc, - str->ctx.nodoc.pw, - str->charset, - s + b1, b2 - b1, result) - : dom_string_create_from_ptr(str->ctx.doc, - s + b1, b2 - b1, result); + return dom_string_create(str->alloc, str->pw, s + b1, b2 - b1, result); } /** @@ -688,11 +427,12 @@ dom_exception dom_string_insert(struct dom_string *target, const uint8_t *t, *s; uint32_t tlen, slen, clen; uint32_t ins = 0; - charset_error err; - - __dom_string_get_data(target, &t, &tlen); + parserutils_error err; - __dom_string_get_data(source, &s, &slen); + t = target->ptr; + tlen = target->len; + s = source->ptr; + slen = source->len; clen = dom_string_length(target); @@ -706,13 +446,10 @@ dom_exception dom_string_insert(struct dom_string *target, ins = tlen; } else { while (offset > 0) { - if (target->charset == DOM_STRING_UTF8) { - err = _dom_utf8_next(t, tlen - ins, ins, &ins); - } else { - err = _dom_utf16_next(t, tlen - ins, ins, &ins); - } + err = parserutils_charset_utf8_next(t, tlen - ins, + ins, &ins); - if (err != CHARSET_OK) { + if (err != PARSERUTILS_OK) { return DOM_NO_MEM_ERR; } @@ -721,65 +458,36 @@ dom_exception dom_string_insert(struct dom_string *target, } /* Allocate result string */ - if (target->type == DOM_STRING_PTR_NODOC) { - res = target->ctx.nodoc.alloc(NULL, sizeof(struct dom_string), - target->ctx.nodoc.pw); - } else { - res = dom_document_alloc(target->ctx.doc, - NULL, sizeof(struct dom_string)); - } - + res = target->alloc(NULL, sizeof(struct dom_string), target->pw); if (res == NULL) { return DOM_NO_MEM_ERR; } - /** \todo support insertion of a string from a different charset */ - /* Allocate data buffer for result contents */ - if (target->type == DOM_STRING_PTR_NODOC) { - res->data.ptr = target->ctx.nodoc.alloc(NULL, - tlen + slen, target->ctx.nodoc.pw); - } 
else { - res->data.ptr = dom_document_alloc(target->ctx.doc, - NULL, tlen + slen); - } - if (res->data.ptr == NULL) { - if (target->type == DOM_STRING_PTR_NODOC) { - target->ctx.nodoc.alloc(res, 0, target->ctx.nodoc.pw); - } else { - dom_document_alloc(target->ctx.doc, res, 0); - } + res->ptr = target->alloc(NULL, tlen + slen, target->pw); + if (res->ptr == NULL) { + target->alloc(res, 0, target->pw); return DOM_NO_MEM_ERR; } - /* Populate result members */ - res->type = (target->type == DOM_STRING_PTR_NODOC) - ? DOM_STRING_PTR_NODOC : DOM_STRING_PTR; - - res->charset = target->charset; - /* Copy initial portion of target, if any, into result */ if (ins > 0) { - memcpy(res->data.ptr, t, ins); + memcpy(res->ptr, t, ins); } /* Copy inserted data into result */ - memcpy(res->data.ptr + ins, s, slen); + memcpy(res->ptr + ins, s, slen); /* Copy remainder of target, if any, into result */ if (tlen - ins > 0) { - memcpy(res->data.ptr + ins + slen, t + ins, tlen - ins); + memcpy(res->ptr + ins + slen, t + ins, tlen - ins); } res->len = tlen + slen; - if (res->type == DOM_STRING_PTR_NODOC) { - res->ctx.nodoc.alloc = target->ctx.nodoc.alloc; - res->ctx.nodoc.pw = target->ctx.nodoc.pw; - } else { - res->ctx.doc = target->ctx.doc; - } - + res->alloc = target->alloc; + res->pw = target->pw; + res->refcnt = 1; *result = res; @@ -811,11 +519,12 @@ dom_exception dom_string_replace(struct dom_string *target, const uint8_t *t, *s; uint32_t tlen, slen; uint32_t b1, b2; - charset_error err; - - __dom_string_get_data(target, &t, &tlen); + parserutils_error err; - __dom_string_get_data(source, &s, &slen); + t = target->ptr; + tlen = target->len; + s = source->ptr; + slen = source->len; /* Initialise the byte index of the start to 0 */ b1 = 0; @@ -824,13 +533,9 @@ dom_exception dom_string_replace(struct dom_string *target, /* Calculate the byte index of the start */ while (i1 > 0) { - if (target->charset == DOM_STRING_UTF8) { - err = _dom_utf8_next(s, slen - b1, b1, &b1); - } else { - 
err = _dom_utf16_next(s, slen - b1, b1, &b1); - } + err = parserutils_charset_utf8_next(s, slen - b1, b1, &b1); - if (err != CHARSET_OK) { + if (err != PARSERUTILS_OK) { return DOM_NO_MEM_ERR; } @@ -842,13 +547,9 @@ dom_exception dom_string_replace(struct dom_string *target, /* Calculate the byte index of the end */ while (i2 > 0) { - if (target->charset == DOM_STRING_UTF8) { - err = _dom_utf8_next(s, slen - b2, b2, &b2); - } else { - err = _dom_utf16_next(s, slen - b2, b2, &b2); - } + err = parserutils_charset_utf8_next(s, slen - b2, b2, &b2); - if (err != CHARSET_OK) { + if (err != PARSERUTILS_OK) { return DOM_NO_MEM_ERR; } @@ -856,66 +557,38 @@ dom_exception dom_string_replace(struct dom_string *target, } /* Allocate result string */ - if (target->type == DOM_STRING_PTR_NODOC) { - res = target->ctx.nodoc.alloc(NULL, sizeof(struct dom_string), - target->ctx.nodoc.pw); - } else { - res = dom_document_alloc(target->ctx.doc, - NULL, sizeof(struct dom_string)); - } + res = target->alloc(NULL, sizeof(struct dom_string), target->pw); if (res == NULL) { return DOM_NO_MEM_ERR; } - /** \todo support insertion of a string from a different charset */ - /* Allocate data buffer for result contents */ - if (target->type == DOM_STRING_PTR_NODOC) { - res->data.ptr = target->ctx.nodoc.alloc(NULL, - tlen + slen - (b2 - b1), target->ctx.nodoc.pw); - } else { - res->data.ptr = dom_document_alloc(target->ctx.doc, - NULL, tlen + slen - (b2 - b1)); - } - if (res->data.ptr == NULL) { - if (target->type == DOM_STRING_PTR_NODOC) { - target->ctx.nodoc.alloc(res, 0, target->ctx.nodoc.pw); - } else { - dom_document_alloc(target->ctx.doc, res, 0); - } + res->ptr = target->alloc(NULL, tlen + slen - (b2 - b1), target->pw); + if (res->ptr == NULL) { + target->alloc(res, 0, target->pw); return DOM_NO_MEM_ERR; } - /* Populate result members */ - res->type = (target->type == DOM_STRING_PTR_NODOC) - ? 
DOM_STRING_PTR_NODOC : DOM_STRING_PTR; - - res->charset = target->charset; - /* Copy initial portion of target, if any, into result */ if (b1 > 0) { - memcpy(res->data.ptr, t, b1); + memcpy(res->ptr, t, b1); } /* Copy replacement data into result */ if (slen > 0) { - memcpy(res->data.ptr + b1, s, slen); + memcpy(res->ptr + b1, s, slen); } /* Copy remainder of target, if any, into result */ if (tlen - b2 > 0) { - memcpy(res->data.ptr + b1 + slen, t + b2, tlen - b2); + memcpy(res->ptr + b1 + slen, t + b2, tlen - b2); } res->len = tlen + slen - (b2 - b1); - if (res->type == DOM_STRING_PTR_NODOC) { - res->ctx.nodoc.alloc = target->ctx.nodoc.alloc; - res->ctx.nodoc.pw = target->ctx.nodoc.pw; - } else { - res->ctx.doc = target->ctx.doc; - } + res->alloc = target->alloc; + res->pw = target->pw; res->refcnt = 1; @@ -940,19 +613,8 @@ dom_exception dom_string_replace(struct dom_string *target, dom_exception dom_string_dup(struct dom_string *str, struct dom_string **result) { - const uint8_t *s; - size_t slen; - - __dom_string_get_data(str, &s, &slen); - - return str->type == DOM_STRING_PTR_NODOC - ? 
dom_string_create_from_ptr_no_doc( - str->ctx.nodoc.alloc, - str->ctx.nodoc.pw, - str->charset, - s, slen, result) - : dom_string_create_from_ptr(str->ctx.doc, - s, slen, result); + return dom_string_create(str->alloc, str->pw, str->ptr, str->len, + result); } /** @@ -963,12 +625,10 @@ dom_exception dom_string_dup(struct dom_string *str, */ uint32_t dom_string_hash(struct dom_string *str) { - const uint8_t *s; - size_t slen; + const uint8_t *s = str->ptr; + size_t slen = str->len; uint32_t hash = 0x01000193; - __dom_string_get_data(str, &s, &slen); - while (slen > 0) { hash *= 0x01000193; hash ^= *s; @@ -980,47 +640,3 @@ uint32_t dom_string_hash(struct dom_string *str) return hash; } -/* */ -/*---------------------------------------------------------------------------*/ -/* */ - -/** - * Get a pointer to the string of characters within a DOM string - * - * \param str Pointer to DOM string to retrieve pointer from - * \param data Pointer to location to receive data - * \param len Pointer to location to receive byte length of data - * \return DOM_NO_ERR on success - * - * The caller must have previously claimed a reference on the DOM string. - * The returned pointer must not be freed. 
- */ -dom_exception __dom_string_get_data(struct dom_string *str, - const uint8_t **data, size_t *len) -{ - /* Assume that a NULL str pointer indicates the empty string */ - if (str == NULL) - str = &empty_string; - - switch (str->type) { - case DOM_STRING_PTR: - *data = str->data.ptr; - break; - case DOM_STRING_CONST_PTR: - *data = str->data.cptr; - break; - case DOM_STRING_OFFSET: - *data = dom_document_get_base(str->ctx.doc) + - str->data.offset; - break; - case DOM_STRING_PTR_NODOC: - *data = str->data.ptr; - break; - } - - *len = str->len; - - return DOM_NO_ERR; -} - - diff --git a/src/utils/Makefile b/src/utils/Makefile index ac87ded..29369ae 100644 --- a/src/utils/Makefile +++ b/src/utils/Makefile @@ -22,7 +22,7 @@ CFLAGS += -I$(CURDIR) # Objects -OBJS = namespace utf8 utf16 +OBJS = namespace .PHONY: clean debug distclean export release setup test diff --git a/src/utils/namespace.c b/src/utils/namespace.c index 9c0d214..8002b8e 100644 --- a/src/utils/namespace.c +++ b/src/utils/namespace.c @@ -32,14 +32,13 @@ dom_exception _dom_namespace_initialise(dom_alloc alloc, void *pw) { dom_exception err; - err = dom_string_create_from_ptr_no_doc(alloc, pw, - DOM_STRING_UTF8, (const uint8_t *) "xml", SLEN("xml"), &xml); + err = dom_string_create(alloc, pw, + (const uint8_t *) "xml", SLEN("xml"), &xml); if (err != DOM_NO_ERR) { return err; } - err = dom_string_create_from_ptr_no_doc(alloc, pw, - DOM_STRING_UTF8, + err = dom_string_create(alloc, pw, (const uint8_t *) "http://www.w3.org/XML/1998/namespace", SLEN("http://www.w3.org/XML/1998/namespace"), &xml_ns); @@ -48,8 +47,7 @@ dom_exception _dom_namespace_initialise(dom_alloc alloc, void *pw) return err; } - err = dom_string_create_from_ptr_no_doc(alloc, pw, - DOM_STRING_UTF8, + err = dom_string_create(alloc, pw, (const uint8_t *) "xmlns", SLEN("xmlns"), &xmlns); if (err != DOM_NO_ERR) { dom_string_unref(xml_ns); @@ -57,8 +55,7 @@ dom_exception _dom_namespace_initialise(dom_alloc alloc, void *pw) return err; } - err = 
dom_string_create_from_ptr_no_doc(alloc, pw, - DOM_STRING_UTF8, + err = dom_string_create(alloc, pw, (const uint8_t *) "http://www.w3.org/2000/xmlns", SLEN("http://www.w3.org/2000/xmlns"), &xmlns_ns); diff --git a/src/utils/utf16.c b/src/utils/utf16.c deleted file mode 100644 index 8917328..0000000 --- a/src/utils/utf16.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -/** \file - * UTF-16 manipulation functions (implementation). - */ - -#include -#include -#include - -#include "utils/utf16.h" - -/** - * Convert a UTF-16 sequence into a single UCS4 character - * - * \param s The sequence to process - * \param len Length of sequence - * \param ucs4 Pointer to location to receive UCS4 character (host endian) - * \param clen Pointer to location to receive byte length of UTF-16 sequence - * \return CHARSET_OK on success, appropriate error otherwise - */ -inline charset_error _dom_utf16_to_ucs4(const uint8_t *s, size_t len, - uint32_t *ucs4, size_t *clen) -{ - const uint16_t *ss = (const uint16_t *) (const void *) s; - - if (s == NULL || ucs4 == NULL || clen == NULL) - return CHARSET_BADPARM; - - if (len < 2) - return CHARSET_NEEDDATA; - - if (*ss < 0xD800 || *ss > 0xDFFF) { - *ucs4 = *ss; - *clen = 2; - } else if (0xD800 <= *ss && *ss <= 0xBFFF) { - if (len < 4) - return CHARSET_NEEDDATA; - - if (0xDC00 <= ss[1] && ss[1] <= 0xE000) { - *ucs4 = (((s[0] >> 6) & 0x1f) + 1) | - ((s[0] & 0x3f) | (s[1] & 0x3ff)); - *clen = 4; - } else { - return CHARSET_INVALID; - } - } - - return CHARSET_OK; -} - -/** - * Convert a single UCS4 character into a UTF-16 sequence - * - * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) - * \param s Pointer to 4 byte long output buffer - * \param len Pointer to location to receive length of multibyte sequence - * \return CHARSET_OK on success, appropriate error 
otherwise - */ -inline charset_error _dom_utf16_from_ucs4(uint32_t ucs4, uint8_t *s, - size_t *len) -{ - uint16_t *ss = (uint16_t *) (void *) s; - uint32_t l = 0; - - if (s == NULL || len == NULL) - return CHARSET_BADPARM; - else if (ucs4 < 0x10000) { - *ss = (uint16_t) ucs4; - l = 2; - } else if (ucs4 < 0x110000) { - ss[0] = 0xD800 | (((ucs4 >> 16) & 0x1f) - 1) | (ucs4 >> 10); - ss[1] = 0xDC00 | (ucs4 & 0x3ff); - l = 4; - } else { - return CHARSET_INVALID; - } - - *len = l; - - return CHARSET_OK; -} - -/** - * Calculate the length (in characters) of a bounded UTF-16 string - * - * \param s The string - * \param max Maximum length - * \param len Pointer to location to receive length of string - * \return CHARSET_OK on success, appropriate error otherwise - */ -inline charset_error _dom_utf16_length(const uint8_t *s, size_t max, - size_t *len) -{ - const uint16_t *ss = (const uint16_t *) (const void *) s; - const uint16_t *end = (const uint16_t *) (const void *) (s + max); - int l = 0; - - if (s == NULL || len == NULL) - return CHARSET_BADPARM; - - while (ss < end) { - if (*ss < 0xD800 || 0xDFFF < *ss) - ss++; - else - ss += 2; - - l++; - } - - *len = l; - - return CHARSET_OK; -} - -/** - * Calculate the length (in bytes) of a UTF-16 character - * - * \param s Pointer to start of character - * \param len Pointer to location to receive length - * \return CHARSET_OK on success, appropriate error otherwise - */ -inline charset_error _dom_utf16_char_byte_length(const uint8_t *s, - size_t *len) -{ - const uint16_t *ss = (const uint16_t *) (const void *) s; - - if (s == NULL || len == NULL) - return CHARSET_BADPARM; - - if (*ss < 0xD800 || 0xDFFF < *ss) - *len = 2; - else - *len = 4; - - return CHARSET_OK; -} - -/** - * Find previous legal UTF-16 char in string - * - * \param s The string - * \param off Offset in the string to start at - * \param prevoff Pointer to location to receive offset of first byte of - * previous legal character - * \return CHARSET_OK on success, 
appropriate error otherwise - */ -inline charset_error _dom_utf16_prev(const uint8_t *s, uint32_t off, - uint32_t *prevoff) -{ - const uint16_t *ss = (const uint16_t *) (const void *) s; - - if (s == NULL || prevoff == NULL) - return CHARSET_BADPARM; - - if (off < 2) - *prevoff = 0; - else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF) - *prevoff = off - 2; - else - *prevoff = (off < 4) ? 0 : off - 4; - - return CHARSET_OK; -} - -/** - * Find next legal UTF-16 char in string - * - * \param s The string (assumed valid) - * \param len Maximum offset in string - * \param off Offset in the string to start at - * \param nextoff Pointer to location to receive offset of first byte of - * next legal character - * \return CHARSET_OK on success, appropriate error otherwise - */ -inline charset_error _dom_utf16_next(const uint8_t *s, uint32_t len, - uint32_t off, uint32_t *nextoff) -{ - const uint16_t *ss = (const uint16_t *) (const void *) s; - - if (s == NULL || off >= len || nextoff == NULL) - return CHARSET_BADPARM; - - if (len - off < 4) - *nextoff = len; - else if (ss[1] < 0xD800 || ss[1] > 0xDBFF) - *nextoff = off + 2; - else - *nextoff = (len - off < 6) ? 
len : off + 4; - - return CHARSET_OK; -} - -/** - * Find next legal UTF-16 char in string - * - * \param s The string (assumed to be of dubious validity) - * \param len Maximum offset in string - * \param off Offset in the string to start at - * \param nextoff Pointer to location to receive offset of first byte of - * next legal character - * \return CHARSET_OK on success, appropriate error otherwise - */ -inline charset_error _dom_utf16_next_paranoid(const uint8_t *s, - uint32_t len, uint32_t off, uint32_t *nextoff) -{ - const uint16_t *ss = (const uint16_t *) (const void *) s; - - if (s == NULL || off >= len || nextoff == NULL) - return CHARSET_BADPARM; - - while (1) { - if (len - off < 4) { - return CHARSET_NEEDDATA; - } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) { - *nextoff = off + 2; - break; - } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) { - if (len - off < 6) - return CHARSET_NEEDDATA; - - if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) { - *nextoff = off + 4; - break; - } else { - ss++; - off += 2; - } - } - } - - return CHARSET_OK; -} - diff --git a/src/utils/utf16.h b/src/utils/utf16.h deleted file mode 100644 index 7b9e15f..0000000 --- a/src/utils/utf16.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * This file is part of libdom. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -/** \file - * UTF-16 manipulation functions (interface). 
- */ - -#ifndef dom_utils_utf16_h_ -#define dom_utils_utf16_h_ - -#include - -#include "utils/charset_errors.h" - -inline charset_error _dom_utf16_to_ucs4(const uint8_t *s, size_t len, - uint32_t *ucs4, size_t *clen); -inline charset_error _dom_utf16_from_ucs4(uint32_t ucs4, uint8_t *s, - size_t *len); - -inline charset_error _dom_utf16_length(const uint8_t *s, size_t max, - size_t *len); -inline charset_error _dom_utf16_char_byte_length(const uint8_t *s, - size_t *len); - -inline charset_error _dom_utf16_prev(const uint8_t *s, uint32_t off, - uint32_t *prevoff); -inline charset_error _dom_utf16_next(const uint8_t *s, uint32_t len, - uint32_t off, uint32_t *nextoff); - -inline charset_error _dom_utf16_next_paranoid(const uint8_t *s, - uint32_t len, uint32_t off, uint32_t *nextoff); - -#endif - diff --git a/src/utils/utf8.c b/src/utils/utf8.c deleted file mode 100644 index b80f04e..0000000 --- a/src/utils/utf8.c +++ /dev/null @@ -1,368 +0,0 @@ -/* - * This file is part of libdom. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -/** \file - * UTF-8 manipulation functions (implementation). 
- */ - -#include -#include -#include - -#include "utils/utf8.h" - -/** Number of continuation bytes for a given start byte */ -static const uint8_t numContinuations[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, -}; - -/** - * Convert a UTF-8 multibyte sequence into a single UCS4 character - * - * Encoding of UCS values outside the UTF-16 plane has been removed from - * RFC3629. This function conforms to RFC2279, however. 
- * - * \param s The sequence to process - * \param len Length of sequence - * \param ucs4 Pointer to location to receive UCS4 character (host endian) - * \param clen Pointer to location to receive byte length of UTF-8 sequence - * \return CHARSET_OK on success, appropriate error otherwise - */ -inline charset_error _dom_utf8_to_ucs4(const uint8_t *s, size_t len, - uint32_t *ucs4, size_t *clen) -{ - if (s == NULL || ucs4 == NULL || clen == NULL) - return CHARSET_BADPARM; - - if (len == 0) - return CHARSET_NEEDDATA; - - if (*s < 0x80) { - *ucs4 = *s; - *clen = 1; - } else if ((*s & 0xE0) == 0xC0) { - if (len < 2) - return CHARSET_NEEDDATA; - else if ((*(s+1) & 0xC0) != 0x80) - return CHARSET_INVALID; - else { - *ucs4 = ((*s & 0x1F) << 6) | (*(s+1) & 0x3F); - *clen = 2; - } - } else if ((*s & 0xF0) == 0xE0) { - if (len < 3) - return CHARSET_NEEDDATA; - else if ((*(s+1) & 0xC0) != 0x80 || - (*(s+2) & 0xC0) != 0x80) - return CHARSET_INVALID; - else { - *ucs4 = ((*s & 0x0F) << 12) | - ((*(s+1) & 0x3F) << 6) | - (*(s+2) & 0x3F); - *clen = 3; - } - } else if ((*s & 0xF8) == 0xF0) { - if (len < 4) - return CHARSET_NEEDDATA; - else if ((*(s+1) & 0xC0) != 0x80 || - (*(s+2) & 0xC0) != 0x80 || - (*(s+3) & 0xC0) != 0x80) - return CHARSET_INVALID; - else { - *ucs4 = ((*s & 0x0F) << 18) | - ((*(s+1) & 0x3F) << 12) | - ((*(s+2) & 0x3F) << 6) | - (*(s+3) & 0x3F); - *clen = 4; - } - } else if ((*s & 0xFC) == 0xF8) { - if (len < 5) - return CHARSET_NEEDDATA; - else if ((*(s+1) & 0xC0) != 0x80 || - (*(s+2) & 0xC0) != 0x80 || - (*(s+3) & 0xC0) != 0x80 || - (*(s+4) & 0xC0) != 0x80) - return CHARSET_INVALID; - else { - *ucs4 = ((*s & 0x0F) << 24) | - ((*(s+1) & 0x3F) << 18) | - ((*(s+2) & 0x3F) << 12) | - ((*(s+3) & 0x3F) << 6) | - (*(s+4) & 0x3F); - *clen = 5; - } - } else if ((*s & 0xFE) == 0xFC) { - if (len < 6) - return CHARSET_NEEDDATA; - else if ((*(s+1) & 0xC0) != 0x80 || - (*(s+2) & 0xC0) != 0x80 || - (*(s+3) & 0xC0) != 0x80 || - (*(s+4) & 0xC0) != 0x80 || - (*(s+5) & 0xC0) != 
0x80) - return CHARSET_INVALID; - else { - *ucs4 = ((*s & 0x0F) << 28) | - ((*(s+1) & 0x3F) << 24) | - ((*(s+2) & 0x3F) << 18) | - ((*(s+3) & 0x3F) << 12) | - ((*(s+4) & 0x3F) << 6) | - (*(s+5) & 0x3F); - *clen = 6; - } - } else { - return CHARSET_INVALID; - } - - return CHARSET_OK; -} - -/** - * Convert a single UCS4 character into a UTF-8 multibyte sequence - * - * Encoding of UCS values outside the UTF-16 plane has been removed from - * RFC3629. This function conforms to RFC2279, however. - * - * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) - * \param s Pointer to 6 byte long output buffer - * \param len Pointer to location to receive length of multibyte sequence - * \return CHARSET_OK on success, appropriate error otherwise - */ -inline charset_error _dom_utf8_from_ucs4(uint32_t ucs4, uint8_t *s, - size_t *len) -{ - uint32_t l = 0; - - if (s == NULL || len == NULL) - return CHARSET_BADPARM; - else if (ucs4 < 0x80) { - *s = (uint8_t) ucs4; - l = 1; - } else if (ucs4 < 0x800) { - *s = 0xC0 | ((ucs4 >> 6) & 0x1F); - *(s+1) = 0x80 | (ucs4 & 0x3F); - l = 2; - } else if (ucs4 < 0x10000) { - *s = 0xE0 | ((ucs4 >> 12) & 0xF); - *(s+1) = 0x80 | ((ucs4 >> 6) & 0x3F); - *(s+2) = 0x80 | (ucs4 & 0x3F); - l = 3; - } else if (ucs4 < 0x200000) { - *s = 0xF0 | ((ucs4 >> 18) & 0x7); - *(s+1) = 0x80 | ((ucs4 >> 12) & 0x3F); - *(s+2) = 0x80 | ((ucs4 >> 6) & 0x3F); - *(s+3) = 0x80 | (ucs4 & 0x3F); - l = 4; - } else if (ucs4 < 0x4000000) { - *s = 0xF8 | ((ucs4 >> 24) & 0x3); - *(s+1) = 0x80 | ((ucs4 >> 18) & 0x3F); - *(s+2) = 0x80 | ((ucs4 >> 12) & 0x3F); - *(s+3) = 0x80 | ((ucs4 >> 6) & 0x3F); - *(s+4) = 0x80 | (ucs4 & 0x3F); - l = 5; - } else if (ucs4 <= 0x7FFFFFFF) { - *s = 0xFC | ((ucs4 >> 30) & 0x1); - *(s+1) = 0x80 | ((ucs4 >> 24) & 0x3F); - *(s+2) = 0x80 | ((ucs4 >> 18) & 0x3F); - *(s+3) = 0x80 | ((ucs4 >> 12) & 0x3F); - *(s+4) = 0x80 | ((ucs4 >> 6) & 0x3F); - *(s+5) = 0x80 | (ucs4 & 0x3F); - l = 6; - } else { - return CHARSET_INVALID; - } - - 
*len = l; - - return CHARSET_OK; -} - -/** - * Calculate the length (in characters) of a bounded UTF-8 string - * - * \param s The string - * \param max Maximum length - * \param len Pointer to location to receive length of string - * \return CHARSET_OK on success, appropriate error otherwise - */ -inline charset_error _dom_utf8_length(const uint8_t *s, size_t max, - size_t *len) -{ - const uint8_t *end = s + max; - int l = 0; - - if (s == NULL || len == NULL) - return CHARSET_BADPARM; - - while (s < end) { - if ((*s & 0x80) == 0x00) - s += 1; - else if ((*s & 0xE0) == 0xC0) - s += 2; - else if ((*s & 0xF0) == 0xE0) - s += 3; - else if ((*s & 0xF8) == 0xF0) - s += 4; - else if ((*s & 0xFC) == 0xF8) - s += 5; - else if ((*s & 0xFE) == 0xFC) - s += 6; - else - return CHARSET_INVALID; - l++; - } - - *len = l; - - return CHARSET_OK; -} - -/** - * Calculate the length (in bytes) of a UTF-8 character - * - * \param s Pointer to start of character - * \param len Pointer to location to receive length - * \return CHARSET_OK on success, appropriate error otherwise - */ -inline charset_error _dom_utf8_char_byte_length(const uint8_t *s, - size_t *len) -{ - if (s == NULL || len == NULL) - return CHARSET_BADPARM; - - *len = numContinuations[s[0]] + 1 /* Start byte */; - - return CHARSET_OK; -} - -/** - * Find previous legal UTF-8 char in string - * - * \param s The string - * \param off Offset in the string to start at - * \param prevoff Pointer to location to receive offset of first byte of - * previous legal character - * \return CHARSET_OK on success, appropriate error otherwise - */ -inline charset_error _dom_utf8_prev(const uint8_t *s, uint32_t off, - uint32_t *prevoff) -{ - if (s == NULL || prevoff == NULL) - return CHARSET_BADPARM; - - while (off != 0 && (s[--off] & 0xC0) == 0x80) - /* do nothing */; - - *prevoff = off; - - return CHARSET_OK; -} - -/** - * Find next legal UTF-8 char in string - * - * \param s The string (assumed valid) - * \param len Maximum offset in 
string - * \param off Offset in the string to start at - * \param nextoff Pointer to location to receive offset of first byte of - * next legal character - * \return CHARSET_OK on success, appropriate error otherwise - */ -inline charset_error _dom_utf8_next(const uint8_t *s, uint32_t len, - uint32_t off, uint32_t *nextoff) -{ - if (s == NULL || off >= len || nextoff == NULL) - return CHARSET_BADPARM; - - /* Skip current start byte (if present - may be mid-sequence) */ - if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) - off++; - - while (off < len && (s[off] & 0xC0) == 0x80) - off++; - - *nextoff = off; - - return CHARSET_OK; -} - -/** - * Find next legal UTF-8 char in string - * - * \param s The string (assumed to be of dubious validity) - * \param len Maximum offset in string - * \param off Offset in the string to start at - * \param nextoff Pointer to location to receive offset of first byte of - * next legal character - * \return CHARSET_OK on success, appropriate error otherwise - */ -inline charset_error _dom_utf8_next_paranoid(const uint8_t *s, uint32_t len, - uint32_t off, uint32_t *nextoff) -{ - bool valid; - - if (s == NULL || off >= len || nextoff == NULL) - return CHARSET_BADPARM; - - /* Skip current start byte (if present - may be mid-sequence) */ - if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) - off++; - - while (1) { - /* Find next possible start byte */ - while (off < len && (s[off] & 0xC0) == 0x80) - off++; - - /* Ran off end of data */ - if (off == len || off + numContinuations[s[off]] >= len) - return CHARSET_NEEDDATA; - - /* Found if start byte is ascii, - * or next n bytes are valid continuations */ - valid = true; - - switch (numContinuations[s[off]]) { - case 5: - valid &= ((s[off + 5] & 0xC0) == 0x80); - case 4: - valid &= ((s[off + 4] & 0xC0) == 0x80); - case 3: - valid &= ((s[off + 3] & 0xC0) == 0x80); - case 2: - valid &= ((s[off + 2] & 0xC0) == 0x80); - case 1: - valid &= ((s[off + 1] & 0xC0) == 0x80); - case 0: - valid &= (s[off + 0] < 
0x80); - } - - if (valid) - break; - - /* Otherwise, skip this (invalid) start byte and try again */ - off++; - } - - *nextoff = off; - - return CHARSET_OK; -} - diff --git a/src/utils/utf8.h b/src/utils/utf8.h deleted file mode 100644 index 154dbb8..0000000 --- a/src/utils/utf8.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * This file is part of libdom. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -/** \file - * UTF-8 manipulation functions (interface). - */ - -#ifndef dom_utils_utf8_h_ -#define dom_utils_utf8_h_ - -#include - -#include "utils/charset_errors.h" - -inline charset_error _dom_utf8_to_ucs4(const uint8_t *s, size_t len, - uint32_t *ucs4, size_t *clen); -inline charset_error _dom_utf8_from_ucs4(uint32_t ucs4, uint8_t *s, - size_t *len); - -inline charset_error _dom_utf8_length(const uint8_t *s, size_t max, - size_t *len); -inline charset_error _dom_utf8_char_byte_length(const uint8_t *s, - size_t *len); - -inline charset_error _dom_utf8_prev(const uint8_t *s, uint32_t off, - uint32_t *prevoff); -inline charset_error _dom_utf8_next(const uint8_t *s, uint32_t len, - uint32_t off, uint32_t *nextoff); - -inline charset_error _dom_utf8_next_paranoid(const uint8_t *s, uint32_t len, - uint32_t off, uint32_t *nextoff); - -#endif - -- cgit v1.2.3