summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: John Mark Bell <jmb@netsurf-browser.org> 2009-03-03 18:08:01 +0000
committer: John Mark Bell <jmb@netsurf-browser.org> 2009-03-03 18:08:01 +0000
commit: 702d96e703473dbe4481a42c472b4aae423a51d1 (patch)
tree: 9dc767860ebea940f1d936d14d69073b4e289c92
parent: eeb651eadb47228ad41c21b80d75afc17c2924f8 (diff)
download: libdom-702d96e703473dbe4481a42c472b4aae423a51d1.tar.gz
          libdom-702d96e703473dbe4481a42c472b4aae423a51d1.tar.bz2
Rationalise dom_string (some consideration is required as to what happens wrt interning -- lwc_strings should probably be used)

Purge charset handling --
a) documents are always converted to utf-8
b) use parserutils for utf-8 handling

Fix Hubbub binding to compile.

svn path=/trunk/dom/; revision=6682
-rw-r--r-- bindings/hubbub/parser.c | 64
-rw-r--r-- bindings/xml/xmlbinding.c | 5
-rw-r--r-- bindings/xml/xmlparser.c | 36
-rw-r--r-- include/dom/bootstrap/implpriv.h | 5
-rw-r--r-- include/dom/core/document.h | 3
-rw-r--r-- include/dom/core/implementation.h | 1
-rw-r--r-- include/dom/core/string.h | 19
-rw-r--r-- src/core/attr.c | 2
-rw-r--r-- src/core/document.c | 113
-rw-r--r-- src/core/document.h | 6
-rw-r--r-- src/core/implementation.c | 4
-rw-r--r-- src/core/node.c | 21
-rw-r--r-- src/core/string.c | 632
-rw-r--r-- src/utils/Makefile | 2
-rw-r--r-- src/utils/namespace.c | 13
-rw-r--r-- src/utils/utf16.c | 239
-rw-r--r-- src/utils/utf16.h | 38
-rw-r--r-- src/utils/utf8.c | 368
-rw-r--r-- src/utils/utf8.h | 38
19 files changed, 202 insertions, 1407 deletions
diff --git a/bindings/hubbub/parser.c b/bindings/hubbub/parser.c
index 9473438..7b5e6ab 100644
--- a/bindings/hubbub/parser.c
+++ b/bindings/hubbub/parser.c
@@ -20,7 +20,6 @@
*/
struct dom_hubbub_parser {
hubbub_parser *parser; /**< Hubbub parser instance */
- const uint8_t *buffer; /**< Parser buffer pointer */
struct dom_document *doc; /**< DOM Document we're building */
@@ -35,9 +34,8 @@ struct dom_hubbub_parser {
void *mctx; /**< Pointer to client data */
};
-static void __dom_hubbub_buffer_handler(const uint8_t *buffer, size_t len,
+static hubbub_error __dom_hubbub_token_handler(const hubbub_token *token,
void *pw);
-static void __dom_hubbub_token_handler(const hubbub_token *token, void *pw);
static bool __initialised;
@@ -63,6 +61,8 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases,
dom_exception err;
hubbub_error e;
+ UNUSED(int_enc);
+
if (__initialised == false) {
e = hubbub_initialise(aliases, (hubbub_alloc) alloc, pw);
if (e != HUBBUB_OK) {
@@ -80,23 +80,11 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases,
return NULL;
}
- parser->parser = hubbub_parser_create(enc, int_enc,
- (hubbub_alloc) alloc, pw);
- if (parser->parser == NULL) {
- alloc(parser, 0, pw);
- msg(DOM_MSG_CRITICAL, mctx, "Failed to create hubbub parser");
- return NULL;
- }
-
- params.buffer_handler.handler = __dom_hubbub_buffer_handler;
- params.buffer_handler.pw = parser;
- e = hubbub_parser_setopt(parser->parser, HUBBUB_PARSER_BUFFER_HANDLER,
- &params);
+ e = hubbub_parser_create(enc, true, (hubbub_alloc) alloc, pw,
+ &parser->parser);
if (e != HUBBUB_OK) {
- hubbub_parser_destroy(parser->parser);
alloc(parser, 0, pw);
- msg(DOM_MSG_CRITICAL, mctx,
- "Failed registering hubbub buffer handler");
+ msg(DOM_MSG_CRITICAL, mctx, "Failed to create hubbub parser");
return NULL;
}
@@ -118,8 +106,7 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases,
/* Get DOM implementation */
/* Create string representation of the features we want */
- err = dom_string_create_from_ptr_no_doc(alloc, pw,
- DOM_STRING_UTF8,
+ err = dom_string_create(alloc, pw,
(const uint8_t *) "HTML", SLEN("HTML"), &features);
if (err != DOM_NO_ERR) {
hubbub_parser_destroy(parser->parser);
@@ -202,17 +189,7 @@ struct dom_document *dom_hubbub_parser_get_document(dom_hubbub_parser *parser)
return (parser->complete ? parser->doc : NULL);
}
-void __dom_hubbub_buffer_handler(const uint8_t *buffer, size_t len,
- void *pw)
-{
- dom_hubbub_parser *parser = (dom_hubbub_parser *) pw;
-
- UNUSED(len);
-
- parser->buffer = buffer;
-}
-
-void __dom_hubbub_token_handler(const hubbub_token *token, void *pw)
+hubbub_error __dom_hubbub_token_handler(const hubbub_token *token, void *pw)
{
dom_hubbub_parser *parser = (dom_hubbub_parser *) pw;
static const char *token_names[] = {
@@ -221,55 +198,58 @@ void __dom_hubbub_token_handler(const hubbub_token *token, void *pw)
};
size_t i;
+ UNUSED(parser);
+
printf("%s: ", token_names[token->type]);
switch (token->type) {
case HUBBUB_TOKEN_DOCTYPE:
printf("'%.*s' (%svalid)\n",
(int) token->data.doctype.name.len,
- parser->buffer +
- token->data.doctype.name.data_off,
- token->data.doctype.correct ? "" : "in");
+ token->data.doctype.name.ptr,
+ token->data.doctype.force_quirks ? "in" : "");
break;
case HUBBUB_TOKEN_START_TAG:
printf("'%.*s' %s\n",
(int) token->data.tag.name.len,
- parser->buffer + token->data.tag.name.data_off,
+ token->data.tag.name.ptr,
(token->data.tag.n_attributes > 0) ?
"attributes:" : "");
for (i = 0; i < token->data.tag.n_attributes; i++) {
printf("\t'%.*s' = '%.*s'\n",
(int) token->data.tag.attributes[i].name.len,
- parser->buffer + token->data.tag.attributes[i].name.data_off,
+ token->data.tag.attributes[i].name.ptr,
(int) token->data.tag.attributes[i].value.len,
- parser->buffer + token->data.tag.attributes[i].value.data_off);
+ token->data.tag.attributes[i].value.ptr);
}
break;
case HUBBUB_TOKEN_END_TAG:
printf("'%.*s' %s\n",
(int) token->data.tag.name.len,
- parser->buffer + token->data.tag.name.data_off,
+ token->data.tag.name.ptr,
(token->data.tag.n_attributes > 0) ?
"attributes:" : "");
for (i = 0; i < token->data.tag.n_attributes; i++) {
printf("\t'%.*s' = '%.*s'\n",
(int) token->data.tag.attributes[i].name.len,
- parser->buffer + token->data.tag.attributes[i].name.data_off,
+ token->data.tag.attributes[i].name.ptr,
(int) token->data.tag.attributes[i].value.len,
- parser->buffer + token->data.tag.attributes[i].value.data_off);
+ token->data.tag.attributes[i].value.ptr);
}
break;
case HUBBUB_TOKEN_COMMENT:
printf("'%.*s'\n", (int) token->data.comment.len,
- parser->buffer + token->data.comment.data_off);
+ token->data.comment.ptr);
break;
case HUBBUB_TOKEN_CHARACTER:
printf("'%.*s'\n", (int) token->data.character.len,
- parser->buffer + token->data.character.data_off);
+ token->data.character.ptr);
break;
case HUBBUB_TOKEN_EOF:
printf("\n");
break;
}
+
+ return HUBBUB_OK;
}
diff --git a/bindings/xml/xmlbinding.c b/bindings/xml/xmlbinding.c
index 2bbfb7b..b03b7af 100644
--- a/bindings/xml/xmlbinding.c
+++ b/bindings/xml/xmlbinding.c
@@ -38,7 +38,6 @@ static dom_exception xml_dom_implementation_create_document(
struct dom_string *qname,
struct dom_document_type *doctype,
struct dom_document **doc,
- dom_string_charset charset,
dom_alloc alloc, void *pw);
static dom_exception xml_dom_implementation_get_feature(
struct dom_implementation *impl,
@@ -237,7 +236,6 @@ dom_exception xml_dom_implementation_create_document_type(
* \param qname The qualified name of the document element
* \param doctype The type of document to create
* \param doc Pointer to location to receive result
- * \param charset The charset to use for strings in the document
* \param alloc Memory (de)allocation function
* \param pw Pointer to client-specific private data
* \return DOM_NO_ERR on success,
@@ -274,14 +272,13 @@ dom_exception xml_dom_implementation_create_document(
struct dom_string *qname,
struct dom_document_type *doctype,
struct dom_document **doc,
- dom_string_charset charset,
dom_alloc alloc, void *pw)
{
struct dom_document *d;
dom_exception err;
/* Create document object */
- err = dom_document_create(impl, charset, alloc, pw, &d);
+ err = dom_document_create(impl, alloc, pw, &d);
if (err != DOM_NO_ERR)
return err;
diff --git a/bindings/xml/xmlparser.c b/bindings/xml/xmlparser.c
index 743a826..9e3786f 100644
--- a/bindings/xml/xmlparser.c
+++ b/bindings/xml/xmlparser.c
@@ -181,8 +181,7 @@ dom_xml_parser *dom_xml_parser_create(const char *enc, const char *int_enc,
parser->complete = false;
/* Create key for user data registration */
- err = dom_string_create_from_ptr_no_doc((dom_alloc) alloc, pw,
- DOM_STRING_UTF8,
+ err = dom_string_create((dom_alloc) alloc, pw,
(const uint8_t *) "__xmlnode", SLEN("__xmlnode"),
&parser->udkey);
if (err != DOM_NO_ERR) {
@@ -194,8 +193,7 @@ dom_xml_parser *dom_xml_parser_create(const char *enc, const char *int_enc,
/* Get DOM implementation */
/* Create a string representation of the features we want */
- err = dom_string_create_from_ptr_no_doc((dom_alloc) alloc, pw,
- DOM_STRING_UTF8,
+ err = dom_string_create((dom_alloc) alloc, pw,
(const uint8_t *) "XML", SLEN("XML"), &features);
if (err != DOM_NO_ERR) {
dom_string_unref(parser->udkey);
@@ -329,7 +327,6 @@ void xml_parser_start_document(void *ctx)
/* qname */ NULL,
/* doctype */ NULL,
&doc,
- DOM_STRING_UTF8,
(dom_alloc) parser->alloc,
parser->pw);
if (err != DOM_NO_ERR) {
@@ -650,9 +647,8 @@ void xml_parser_add_element_node(dom_xml_parser *parser,
struct dom_string *tag_name;
/* Create tag name DOM string */
- err = dom_string_create_from_const_ptr(parser->doc,
- child->name,
- strlen((const char *) child->name),
+ err = dom_document_create_string(parser->doc,
+ child->name, strlen((const char *) child->name),
&tag_name);
if (err != DOM_NO_ERR) {
parser->msg(DOM_MSG_CRITICAL, parser->mctx,
@@ -684,7 +680,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser,
uint8_t qnamebuf[qnamelen + 1 /* '\0' */];
/* Create namespace DOM string */
- err = dom_string_create_from_const_ptr(parser->doc,
+ err = dom_document_create_string(parser->doc,
child->ns->href,
strlen((const char *) child->ns->href),
&namespace);
@@ -703,7 +699,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser,
(const char *) child->name);
/* Create qname DOM string */
- err = dom_string_create_from_ptr(parser->doc,
+ err = dom_document_create_string(parser->doc,
qnamebuf,
qnamelen,
&qname);
@@ -742,7 +738,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser,
struct dom_string *name;
/* Create attribute name DOM string */
- err = dom_string_create_from_const_ptr(parser->doc,
+ err = dom_document_create_string(parser->doc,
a->name,
strlen((const char *) a->name),
&name);
@@ -776,7 +772,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser,
uint8_t qnamebuf[qnamelen + 1 /* '\0' */];
/* Create namespace DOM string */
- err = dom_string_create_from_const_ptr(parser->doc,
+ err = dom_document_create_string(parser->doc,
a->ns->href,
strlen((const char *) a->ns->href),
&namespace);
@@ -795,7 +791,7 @@ void xml_parser_add_element_node(dom_xml_parser *parser,
(const char *) a->name);
/* Create qname DOM string */
- err = dom_string_create_from_ptr(parser->doc,
+ err = dom_document_create_string(parser->doc,
qnamebuf,
qnamelen,
&qname);
@@ -904,7 +900,7 @@ void xml_parser_add_text_node(dom_xml_parser *parser, struct dom_node *parent,
dom_exception err;
/* Create DOM string data for text node */
- err = dom_string_create_from_const_ptr(parser->doc, child->content,
+ err = dom_document_create_string(parser->doc, child->content,
strlen((const char *) child->content), &data);
if (err != DOM_NO_ERR) {
parser->msg(DOM_MSG_CRITICAL, parser->mctx,
@@ -965,7 +961,7 @@ void xml_parser_add_cdata_section(dom_xml_parser *parser,
dom_exception err;
/* Create DOM string data for cdata section */
- err = dom_string_create_from_const_ptr(parser->doc, child->content,
+ err = dom_document_create_string(parser->doc, child->content,
strlen((const char *) child->content), &data);
if (err != DOM_NO_ERR) {
parser->msg(DOM_MSG_CRITICAL, parser->mctx,
@@ -1027,7 +1023,7 @@ void xml_parser_add_entity_reference(dom_xml_parser *parser,
dom_exception err;
/* Create name of entity reference */
- err = dom_string_create_from_const_ptr(parser->doc, child->name,
+ err = dom_document_create_string(parser->doc, child->name,
strlen((const char *) child->name), &name);
if (err != DOM_NO_ERR) {
parser->msg(DOM_MSG_CRITICAL, parser->mctx,
@@ -1094,7 +1090,7 @@ void xml_parser_add_comment(dom_xml_parser *parser, struct dom_node *parent,
dom_exception err;
/* Create DOM string data for comment */
- err = dom_string_create_from_const_ptr(parser->doc, child->content,
+ err = dom_document_create_string(parser->doc, child->content,
strlen((const char *) child->content), &data);
if (err != DOM_NO_ERR) {
parser->msg(DOM_MSG_CRITICAL, parser->mctx,
@@ -1156,7 +1152,7 @@ void xml_parser_add_document_type(dom_xml_parser *parser,
dom_exception err;
/* Create qname for doctype */
- err = dom_string_create_from_const_ptr(parser->doc, dtd->name,
+ err = dom_document_create_string(parser->doc, dtd->name,
strlen((const char *) dtd->name), &qname);
if (err != DOM_NO_ERR) {
parser->msg(DOM_MSG_CRITICAL, parser->mctx,
@@ -1165,7 +1161,7 @@ void xml_parser_add_document_type(dom_xml_parser *parser,
}
/* Create public ID for doctype */
- err = dom_string_create_from_const_ptr(parser->doc,
+ err = dom_document_create_string(parser->doc,
dtd->ExternalID,
(dtd->ExternalID == NULL) ? 0
: strlen((const char *) dtd->ExternalID),
@@ -1178,7 +1174,7 @@ void xml_parser_add_document_type(dom_xml_parser *parser,
}
/* Create system ID for doctype */
- err = dom_string_create_from_const_ptr(parser->doc,
+ err = dom_document_create_string(parser->doc,
dtd->SystemID,
(dtd->SystemID == NULL) ? 0
: strlen((const char *) dtd->SystemID),
diff --git a/include/dom/bootstrap/implpriv.h b/include/dom/bootstrap/implpriv.h
index 97806a8..c99a9d2 100644
--- a/include/dom/bootstrap/implpriv.h
+++ b/include/dom/bootstrap/implpriv.h
@@ -94,7 +94,6 @@ struct dom_implementation {
* \param qname The qualified name of the document element
* \param doctype The type of document to create
* \param doc Pointer to location to receive result
- * \param charset The charset to use for strings in the document
* \param alloc Memory (de)allocation function
* \param pw Pointer to client-specific private data
* \return DOM_NO_ERR on success,
@@ -130,7 +129,6 @@ struct dom_implementation {
struct dom_string *qname,
struct dom_document_type *doctype,
struct dom_document **doc,
- dom_string_charset charset,
dom_alloc alloc, void *pw);
/**
@@ -251,8 +249,7 @@ dom_exception dom_register_source(struct dom_implementation_source *source,
/* Create a DOM document */
dom_exception dom_document_create(struct dom_implementation *impl,
- dom_string_charset charset, dom_alloc alloc, void *pw,
- struct dom_document **doc);
+ dom_alloc alloc, void *pw, struct dom_document **doc);
/* Set a document's buffer */
void dom_document_set_buffer(struct dom_document *doc, uint8_t *buffer,
diff --git a/include/dom/core/document.h b/include/dom/core/document.h
index cce8e4b..6a5fd9f 100644
--- a/include/dom/core/document.h
+++ b/include/dom/core/document.h
@@ -9,6 +9,7 @@
#define dom_core_document_h_
#include <stdbool.h>
+#include <stdint.h>
#include <dom/core/exceptions.h>
@@ -98,5 +99,7 @@ dom_exception dom_document_rename_node(struct dom_document *doc,
struct dom_node *node,
struct dom_string *namespace, struct dom_string *qname,
struct dom_node **result);
+dom_exception dom_document_create_string(struct dom_document *doc,
+ const uint8_t *data, size_t len, struct dom_string **result);
#endif
diff --git a/include/dom/core/implementation.h b/include/dom/core/implementation.h
index a51493f..5e26432 100644
--- a/include/dom/core/implementation.h
+++ b/include/dom/core/implementation.h
@@ -37,7 +37,6 @@ dom_exception dom_implementation_create_document(
struct dom_string *namespace, struct dom_string *qname,
struct dom_document_type *doctype,
struct dom_document **doc,
- dom_string_charset charset,
dom_alloc alloc, void *pw);
dom_exception dom_implementation_get_feature(
diff --git a/include/dom/core/string.h b/include/dom/core/string.h
index e3dfa30..8da9dd7 100644
--- a/include/dom/core/string.h
+++ b/include/dom/core/string.h
@@ -14,33 +14,16 @@
#include <dom/functypes.h>
#include <dom/core/exceptions.h>
-struct dom_document;
struct dom_string;
-typedef enum {
- DOM_STRING_UTF8,
- DOM_STRING_UTF16
-} dom_string_charset;
-
/* Claim a reference on a DOM string */
void dom_string_ref(struct dom_string *str);
/* Release a reference on a DOM string */
void dom_string_unref(struct dom_string *str);
-/* Create a DOM string from an offset into the document buffer */
-dom_exception dom_string_create_from_off(struct dom_document *doc,
- uint32_t off, size_t len, struct dom_string **str);
/* Create a DOM string from a string of characters */
-dom_exception dom_string_create_from_ptr(struct dom_document *doc,
- const uint8_t *ptr, size_t len, struct dom_string **str);
-/* Create a DOM string from a constant string of characters */
-dom_exception dom_string_create_from_const_ptr(struct dom_document *doc,
+dom_exception dom_string_create(dom_alloc alloc, void *pw,
const uint8_t *ptr, size_t len, struct dom_string **str);
-/* Create a DOM string from a string of characters that does not belong
- * to a document */
-dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw,
- dom_string_charset charset, const uint8_t *ptr, size_t len,
- struct dom_string **str);
/* Case sensitively compare two DOM strings */
int dom_string_cmp(struct dom_string *s1, struct dom_string *s2);
diff --git a/src/core/attr.c b/src/core/attr.c
index a82f117..5a85ac0 100644
--- a/src/core/attr.c
+++ b/src/core/attr.c
@@ -180,7 +180,7 @@ dom_exception dom_attr_get_value(struct dom_attr *attr,
struct dom_string *value, *temp;
dom_exception err;
- err = dom_string_create_from_const_ptr(a->owner,
+ err = dom_document_create_string(a->owner,
(const uint8_t *) "", SLEN(""), &value);
if (err != DOM_NO_ERR) {
return err;
diff --git a/src/core/document.c b/src/core/document.c
index 3e06541..74283f9 100644
--- a/src/core/document.c
+++ b/src/core/document.c
@@ -56,8 +56,6 @@ struct dom_doc_nnm {
struct dom_document {
struct dom_node base; /**< Base node */
- dom_string_charset charset; /**< Charset of strings in document */
-
struct dom_implementation *impl; /**< Owning implementation */
struct dom_doc_nl *nodelists; /**< List of active nodelists */
@@ -73,7 +71,6 @@ struct dom_document {
/** Interned node name strings, indexed by node type */
/* Index 0 is unused */
static struct dom_string *__nodenames_utf8[DOM_NODE_TYPE_COUNT + 1];
-static struct dom_string *__nodenames_utf16[DOM_NODE_TYPE_COUNT + 1];
/**
* Initialise the document module
@@ -102,27 +99,6 @@ dom_exception _dom_document_initialise(dom_alloc alloc, void *pw)
{ "#document-fragment", 18 }, /* Document fragment */
{ NULL, 0 } /* Notation */
};
-
- /** \todo This assumes Little Endian */
- static struct {
- const char *name;
- size_t len;
- } names_utf16[DOM_NODE_TYPE_COUNT + 1] = {
- { NULL, 0 }, /* Unused */
- { NULL, 0 }, /* Element */
- { NULL, 0 }, /* Attr */
- { "#\0t\0e\0x\0t\0", 10 }, /* Text */
- { "#\0c\0d\0a\0t\0a\0-\0s\0e\0c\0t\0i\0o\0n\0", 28 }, /* CDATA section */
- { NULL, 0 }, /* Entity reference */
- { NULL, 0 }, /* Entity */
- { NULL, 0 }, /* Processing instruction */
- { "#\0c\0o\0m\0m\0e\0n\0t\0", 16 }, /* Comment */
- { "#\0d\0o\0c\0u\0m\0e\0n\0t\0", 18 }, /* Document */
- { NULL, 0 }, /* Document type */
- { "#\0d\0o\0c\0u\0m\0e\0n\0t\0-\0f\0r\0a\0g\0m\0e\0n\0t\0", 36 }, /* Document fragment */
- { NULL, 0 } /* Notation */
- };
-
dom_exception err;
/* Initialise interned node names */
@@ -130,13 +106,11 @@ dom_exception _dom_document_initialise(dom_alloc alloc, void *pw)
if (names_utf8[i].name == NULL) {
/* Nothing to intern; skip this entry */
__nodenames_utf8[i] = NULL;
- __nodenames_utf16[i] = NULL;
continue;
}
/* Make string */
- err = dom_string_create_from_ptr_no_doc(alloc, pw,
- DOM_STRING_UTF8,
+ err = dom_string_create(alloc, pw,
(const uint8_t *) names_utf8[i].name,
names_utf8[i].len, &__nodenames_utf8[i]);
if (err != DOM_NO_ERR) {
@@ -144,29 +118,10 @@ dom_exception _dom_document_initialise(dom_alloc alloc, void *pw)
for (int j = 0; j < i; j++) {
if (__nodenames_utf8[j] != NULL) {
dom_string_unref(__nodenames_utf8[j]);
- dom_string_unref(__nodenames_utf16[j]);
}
}
return err;
}
-
- err = dom_string_create_from_ptr_no_doc(alloc, pw,
- DOM_STRING_UTF16,
- (const uint8_t *) names_utf16[i].name,
- names_utf16[i].len, &__nodenames_utf16[i]);
- if (err != DOM_NO_ERR) {
- /* Failed, clean up strings we've created so far */
- for (int j = 0; j < i; j++) {
- if (__nodenames_utf8[j] != NULL) {
- dom_string_unref(__nodenames_utf8[j]);
- dom_string_unref(__nodenames_utf16[j]);
- }
- }
-
- dom_string_unref(__nodenames_utf8[i]);
-
- return err;
- }
}
return DOM_NO_ERR;
@@ -182,7 +137,6 @@ dom_exception _dom_document_finalise(void)
for (int i = 0; i <= DOM_NODE_TYPE_COUNT; i++) {
if (__nodenames_utf8[i] != NULL) {
dom_string_unref(__nodenames_utf8[i]);
- dom_string_unref(__nodenames_utf16[i]);
}
}
@@ -193,7 +147,6 @@ dom_exception _dom_document_finalise(void)
* Create a Document
*
* \param impl The DOM implementation owning the document
- * \param charset The charset used for strings in the document
* \param alloc Memory (de)allocation function
* \param pw Pointer to client-specific private data
* \param doc Pointer to location to receive created document
@@ -204,8 +157,7 @@ dom_exception _dom_document_finalise(void)
* The returned document will already be referenced.
*/
dom_exception dom_document_create(struct dom_implementation *impl,
- dom_string_charset charset, dom_alloc alloc, void *pw,
- struct dom_document **doc)
+ dom_alloc alloc, void *pw, struct dom_document **doc)
{
struct dom_document *d;
dom_exception err;
@@ -233,7 +185,6 @@ dom_exception dom_document_create(struct dom_implementation *impl,
}
/* Initialise remaining type-specific data */
- d->charset = charset;
if (impl != NULL)
dom_implementation_ref(impl);
d->impl = impl;
@@ -241,8 +192,7 @@ dom_exception dom_document_create(struct dom_implementation *impl,
d->nodelists = NULL;
d->maps = NULL;
- d->nodenames = (charset == DOM_STRING_UTF8) ? __nodenames_utf8
- : __nodenames_utf16;
+ d->nodenames = __nodenames_utf8;
*doc = d;
@@ -1047,55 +997,30 @@ dom_exception dom_document_rename_node(struct dom_document *doc,
return DOM_NOT_SUPPORTED_ERR;
}
-/* */
-/* ----------------------------------------------------------------------- */
-/* */
-
/**
- * Acquire a pointer to the base of the document buffer
- *
- * \param doc Document to retrieve pointer from
- * \return Pointer to document buffer
+ * Create a DOM string, using a document's allocation context
*
- * The document buffer is _not_ reference counted (as it is an implicit part
- * of the document). It is destroyed with the document, and thus after all
- * users have been destroyed.
- */
-const uint8_t *dom_document_get_base(struct dom_document *doc)
-{
- UNUSED(doc);
-
- return NULL;
-}
-
-/**
- * Set the document buffer pointer
+ * \param doc The document
+ * \param data Pointer to string data
+ * \param len Length, in bytes, of string
+ * \param result Pointer to location to receive result
+ * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
*
- * \param doc Document to set buffer pointer of
- * \param buffer Pointer to buffer
- * \param buffer_len Length of buffer, in bytes
+ * The returned string will already be referenced, so there is no need
+ * to explicitly reference it.
*
- * By calling this, ownership of the buffer is transferred to the document.
- * It should be called once per document node.
+ * The string of characters passed in will be copied for use by the
+ * returned DOM string.
*/
-void dom_document_set_buffer(struct dom_document *doc, uint8_t *buffer,
- size_t buffer_len)
+dom_exception dom_document_create_string(struct dom_document *doc,
+ const uint8_t *data, size_t len, struct dom_string **result)
{
- UNUSED(doc);
- UNUSED(buffer);
- UNUSED(buffer_len);
+ return dom_string_create(doc->alloc, doc->pw, data, len, result);
}
-/**
- * Retrieve the character set used to encode strings in the document
- *
- * \param doc The document to get the charset of
- * \return The charset in use
- */
-dom_string_charset dom_document_get_charset(struct dom_document *doc)
-{
- return doc->charset;
-}
+/* */
+/* ----------------------------------------------------------------------- */
+/* */
/**
* (De)allocate memory with a document's context
diff --git a/src/core/document.h b/src/core/document.h
index 6982b74..c5c13ac 100644
--- a/src/core/document.h
+++ b/src/core/document.h
@@ -27,12 +27,6 @@ dom_exception _dom_document_finalise(void);
/* Destroy a document */
void dom_document_destroy(struct dom_document *doc);
-/* Get base of document buffer */
-const uint8_t *dom_document_get_base(struct dom_document *doc);
-
-/* Get the document character set */
-dom_string_charset dom_document_get_charset(struct dom_document *doc);
-
/* (De)allocate memory */
void *dom_document_alloc(struct dom_document *doc, void *ptr, size_t size);
diff --git a/src/core/implementation.c b/src/core/implementation.c
index 9738b7c..e37b27d 100644
--- a/src/core/implementation.c
+++ b/src/core/implementation.c
@@ -94,7 +94,6 @@ dom_exception dom_implementation_create_document_type(
* \param qname The qualified name of the document element
* \param doctype The type of document to create
* \param doc Pointer to location to receive result
- * \param charset The charset to use for strings in the document
* \param alloc Memory (de)allocation function
* \param pw Pointer to client-specific private data
* \return DOM_NO_ERR on success,
@@ -127,11 +126,10 @@ dom_exception dom_implementation_create_document(
struct dom_string *namespace, struct dom_string *qname,
struct dom_document_type *doctype,
struct dom_document **doc,
- dom_string_charset charset,
dom_alloc alloc, void *pw)
{
return impl->create_document(impl, namespace, qname, doctype, doc,
- charset, alloc, pw);
+ alloc, pw);
}
/**
diff --git a/src/core/node.c b/src/core/node.c
index 2284e4f..0eebfb0 100644
--- a/src/core/node.c
+++ b/src/core/node.c
@@ -306,15 +306,8 @@ dom_exception dom_node_get_node_name(struct dom_node *node,
struct dom_string *colon;
dom_exception err;
- /* ugh! */
- /** \todo Assumes little endian */
- err = dom_string_create_from_const_ptr(node->owner,
- (const uint8_t *) (
- (dom_document_get_charset(node->owner) ==
- DOM_STRING_UTF8) ? ":" : ":\0"),
- (dom_document_get_charset(node->owner) ==
- DOM_STRING_UTF8) ? 1 : 2,
- &colon);
+ err = dom_document_create_string(node->owner,
+ (const uint8_t *) ":", SLEN(":"), &colon);
if (err != DOM_NO_ERR) {
return err;
}
@@ -1639,7 +1632,7 @@ bool _dom_node_readonly(const struct dom_node *node)
* \param previous Previous node in sibling list, or NULL if none
* \param next Next node in sibling list, or NULL if none
*/
-inline void _dom_node_attach(struct dom_node *node, struct dom_node *parent,
+void _dom_node_attach(struct dom_node *node, struct dom_node *parent,
struct dom_node *previous, struct dom_node *next)
{
_dom_node_attach_range(node, node, parent, previous, next);
@@ -1650,7 +1643,7 @@ inline void _dom_node_attach(struct dom_node *node, struct dom_node *parent,
*
* \param node The node to detach
*/
-inline void _dom_node_detach(struct dom_node *node)
+void _dom_node_detach(struct dom_node *node)
{
_dom_node_detach_range(node, node);
}
@@ -1666,7 +1659,7 @@ inline void _dom_node_detach(struct dom_node *node)
*
* The range is assumed to be a linked list of sibling nodes.
*/
-inline void _dom_node_attach_range(struct dom_node *first,
+void _dom_node_attach_range(struct dom_node *first,
struct dom_node *last,
struct dom_node *parent,
struct dom_node *previous,
@@ -1697,7 +1690,7 @@ inline void _dom_node_attach_range(struct dom_node *first,
*
* The range is assumed to be a linked list of sibling nodes.
*/
-inline void _dom_node_detach_range(struct dom_node *first,
+void _dom_node_detach_range(struct dom_node *first,
struct dom_node *last)
{
if (first->previous != NULL)
@@ -1727,7 +1720,7 @@ inline void _dom_node_detach_range(struct dom_node *first,
* we want to perform any special replacement-related behaviour
* at a later date.
*/
-inline void _dom_node_replace(struct dom_node *old,
+void _dom_node_replace(struct dom_node *old,
struct dom_node *replacement)
{
struct dom_node *first, *last;
diff --git a/src/core/string.c b/src/core/string.c
index 8ec44aa..2540e26 100644
--- a/src/core/string.c
+++ b/src/core/string.c
@@ -9,62 +9,37 @@
#include <inttypes.h>
#include <string.h>
+#include <parserutils/charset/utf8.h>
+
#include <dom/core/string.h>
#include "core/document.h"
#include "utils/utils.h"
-#include "utils/utf8.h"
-#include "utils/utf16.h"
/**
* A DOM string
*
- * DOM strings store either a pointer to allocated data, a pointer
- * to constant data or an offset into a document buffer.
- *
- * They are reference counted so freeing is performed correctly.
+ * Strings are reference counted so destruction is performed correctly.
*/
struct dom_string {
- enum { DOM_STRING_PTR,
- DOM_STRING_CONST_PTR,
- DOM_STRING_OFFSET,
- DOM_STRING_PTR_NODOC
- } type; /**< String type */
-
- dom_string_charset charset; /**< Charset of string */
-
- union {
- uint8_t *ptr;
- const uint8_t *cptr;
- uint32_t offset;
- } data; /**< Type-specific data */
+ uint8_t *ptr; /**< Pointer to string data */
size_t len; /**< Byte length of string */
- union {
- struct dom_document *doc; /**< Owning document */
- struct {
- dom_alloc alloc; /**< Memory (de)allocation
- * function */
- void *pw; /**< Client-specific data */
- } nodoc;
- } ctx; /**< Allocation context */
+ dom_alloc alloc; /**< Memory (de)allocation function */
+ void *pw; /**< Client-specific data */
uint32_t refcnt; /**< Reference count */
};
static struct dom_string empty_string = {
- .type = DOM_STRING_CONST_PTR,
- .charset = DOM_STRING_UTF8,
- .data.ptr = NULL,
+ .ptr = NULL,
.len = 0,
- .ctx.doc = NULL,
+ .alloc = NULL,
+ .pw = NULL,
.refcnt = 1
};
-static dom_exception __dom_string_get_data(struct dom_string *str,
- const uint8_t **data, size_t *len);
-
/**
* Claim a reference on a DOM string
*
@@ -86,155 +61,18 @@ void dom_string_ref(struct dom_string *str)
void dom_string_unref(struct dom_string *str)
{
if (--str->refcnt == 0) {
- if (str->type == DOM_STRING_PTR_NODOC) {
- str->ctx.nodoc.alloc(str->data.ptr, 0,
- str->ctx.nodoc.pw);
-
- str->ctx.nodoc.alloc(str, 0, str->ctx.nodoc.pw);
- } else {
- if (str->type == DOM_STRING_PTR) {
- dom_document_alloc(str->ctx.doc,
- str->data.ptr, 0);
- }
-
- dom_document_alloc(str->ctx.doc, str, 0);
+ if (str->alloc != NULL) {
+ str->alloc(str->ptr, 0, str->pw);
+ str->alloc(str, 0, str->pw);
}
}
}
/**
- * Create a DOM string from an offset into the document buffer
- *
- * \param doc The document in which the string resides
- * \param off Offset from start of document buffer
- * \param len Length, in bytes, of string
- * \param str Pointer to location to receive pointer to new string
- * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
- *
- * The returned string will already be referenced, so there is no need
- * to explicitly reference it.
- */
-dom_exception dom_string_create_from_off(struct dom_document *doc,
- uint32_t off, size_t len, struct dom_string **str)
-{
- struct dom_string *ret;
-
- ret = dom_document_alloc(doc, NULL, sizeof(struct dom_string));
- if (ret == NULL)
- return DOM_NO_MEM_ERR;
-
- ret->type = DOM_STRING_OFFSET;
-
- ret->charset = dom_document_get_charset(doc);
-
- ret->data.offset = off;
-
- ret->len = len;
-
- ret->ctx.doc = doc;
-
- ret->refcnt = 1;
-
- *str = ret;
-
- return DOM_NO_ERR;
-}
-
-/**
* Create a DOM string from a string of characters
*
- * \param doc The document in which the string resides
- * \param ptr Pointer to string of characters
- * \param len Length, in bytes, of string of characters
- * \param str Pointer to location to receive pointer to new string
- * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
- *
- * The returned string will already be referenced, so there is no need
- * to explicitly reference it.
- *
- * The string of characters passed in will be copied for use by the
- * returned DOM string.
- */
-dom_exception dom_string_create_from_ptr(struct dom_document *doc,
- const uint8_t *ptr, size_t len, struct dom_string **str)
-{
- struct dom_string *ret;
-
- ret = dom_document_alloc(doc, NULL, sizeof(struct dom_string));
- if (ret == NULL)
- return DOM_NO_MEM_ERR;
-
- ret->data.ptr = dom_document_alloc(doc, NULL, len);
- if (ret->data.ptr == NULL) {
- dom_document_alloc(doc, ret, 0);
- return DOM_NO_MEM_ERR;
- }
-
- ret->type = DOM_STRING_PTR;
-
- ret->charset = dom_document_get_charset(doc);
-
- memcpy(ret->data.ptr, ptr, len);
-
- ret->len = len;
-
- ret->ctx.doc = doc;
-
- ret->refcnt = 1;
-
- *str = ret;
-
- return DOM_NO_ERR;
-}
-
-/**
- * Create a DOM string from a constant string of characters
- *
- * \param doc The document in which the string resides
- * \param ptr Pointer to string of characters
- * \param len Length, in bytes, of string of characters
- * \param str Pointer to location to receive pointer to new string
- * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion
- *
- * The returned string will already be referenced, so there is no need
- * to explicitly reference it.
- *
- * The string of characters passed in will _not_ be copied for use by the
- * returned DOM string.
- */
-dom_exception dom_string_create_from_const_ptr(struct dom_document *doc,
- const uint8_t *ptr, size_t len, struct dom_string **str)
-{
- struct dom_string *ret;
-
- ret = dom_document_alloc(doc, NULL, sizeof(struct dom_string));
- if (ret == NULL)
- return DOM_NO_MEM_ERR;
-
- ret->type = DOM_STRING_CONST_PTR;
-
- ret->charset = dom_document_get_charset(doc);
-
- ret->data.cptr = ptr;
-
- ret->len = len;
-
- ret->ctx.doc = doc;
-
- ret->refcnt = 1;
-
- *str = ret;
-
- return DOM_NO_ERR;
-}
-
-/**
- * Create a DOM string from a string of characters that does not belong
- * to a document
- *
* \param alloc Memory (de)allocation function
* \param pw Pointer to client-specific private data
- * \param charset The charset of the string
* \param ptr Pointer to string of characters
* \param len Length, in bytes, of string of characters
* \param str Pointer to location to receive result
@@ -243,12 +81,11 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc,
* The returned string will already be referenced, so there is no need
* to explicitly reference it.
*
- * The string of characters passed in will be copied for use by the
+ * The string of characters passed in will be copied for use by the
* returned DOM string.
*/
-dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw,
- dom_string_charset charset, const uint8_t *ptr, size_t len,
- struct dom_string **str)
+dom_exception dom_string_create(dom_alloc alloc, void *pw,
+ const uint8_t *ptr, size_t len, struct dom_string **str)
{
struct dom_string *ret;
@@ -256,22 +93,18 @@ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw,
if (ret == NULL)
return DOM_NO_MEM_ERR;
- ret->data.ptr = alloc(NULL, len, pw);
- if (ret->data.ptr == NULL) {
+ ret->ptr = alloc(NULL, len, pw);
+ if (ret->ptr == NULL) {
alloc(ret, 0, pw);
return DOM_NO_MEM_ERR;
}
- ret->type = DOM_STRING_PTR_NODOC;
-
- ret->charset = charset;
-
- memcpy(ret->data.ptr, ptr, len);
+ memcpy(ret->ptr, ptr, len);
ret->len = len;
- ret->ctx.nodoc.alloc = alloc;
- ret->ctx.nodoc.pw = pw;
+ ret->alloc = alloc;
+ ret->pw = pw;
ret->refcnt = 1;
@@ -291,48 +124,16 @@ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw,
*/
int dom_string_cmp(struct dom_string *s1, struct dom_string *s2)
{
- const uint8_t *d1 = NULL;
- const uint8_t *d2 = NULL;
- size_t l1, l2;
- dom_exception err;
-
- err = __dom_string_get_data(s1, &d1, &l1);
- if (err != DOM_NO_ERR)
- return 1; /* arbitrary */
+ if (s1 == NULL)
+ s1 = &empty_string;
- err = __dom_string_get_data(s2, &d2, &l2);
- if (err != DOM_NO_ERR)
- return 1; /* arbitrary */
+ if (s2 == NULL)
+ s2 = &empty_string;
- while (l1 > 0 && l2 > 0) {
- uint32_t c1, c2;
- size_t cl1, cl2;
- charset_error err;
-
- err = (s1->charset == DOM_STRING_UTF8)
- ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1)
- : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1);
- if (err != CHARSET_OK) {
- }
+ if (s1->len != s2->len)
+ return 1;
- err = (s2->charset == DOM_STRING_UTF8)
- ? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2)
- : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2);
- if (err != CHARSET_OK) {
- }
-
- if (c1 != c2) {
- return (int)(c1 - c2);
- }
-
- d1 += cl1;
- d2 += cl2;
-
- l1 -= cl1;
- l2 -= cl2;
- }
-
- return (int)(l1 - l2);
+ return memcmp(s1->ptr, s2->ptr, s1->len);
}
/**
@@ -349,31 +150,28 @@ int dom_string_icmp(struct dom_string *s1, struct dom_string *s2)
const uint8_t *d1 = NULL;
const uint8_t *d2 = NULL;
size_t l1, l2;
- dom_exception err;
- err = __dom_string_get_data(s1, &d1, &l1);
- if (err != DOM_NO_ERR)
- return 1; /* arbitrary */
+ if (s1 == NULL)
+ s1 = &empty_string;
+ if (s2 == NULL)
+ s2 = &empty_string;
- err = __dom_string_get_data(s2, &d2, &l2);
- if (err != DOM_NO_ERR)
- return 1; /* arbitrary */
+ d1 = s1->ptr;
+ d2 = s2->ptr;
+ l1 = s1->len;
+ l2 = s2->len;
while (l1 > 0 && l2 > 0) {
uint32_t c1, c2;
size_t cl1, cl2;
- charset_error err;
+ parserutils_error err;
- err = (s1->charset == DOM_STRING_UTF8)
- ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1)
- : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1);
- if (err != CHARSET_OK) {
+ err = parserutils_charset_utf8_to_ucs4(d1, l1, &c1, &cl1);
+ if (err != PARSERUTILS_OK) {
}
- err = (s2->charset == DOM_STRING_UTF8)
- ? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2)
- : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2);
- if (err != CHARSET_OK) {
+ err = parserutils_charset_utf8_to_ucs4(d2, l2, &c2, &cl2);
+ if (err != PARSERUTILS_OK) {
}
/** \todo improved lower-casing algorithm */
@@ -403,20 +201,19 @@ uint32_t dom_string_index(struct dom_string *str, uint32_t chr)
const uint8_t *s;
size_t clen, slen;
uint32_t c, index;
- charset_error err;
+ parserutils_error err;
- __dom_string_get_data(str, &s, &slen);
+ if (str == NULL)
+ str = &empty_string;
+
+ s = str->ptr;
+ slen = str->len;
index = 0;
while (slen > 0) {
- if (str->charset == DOM_STRING_UTF8) {
- err = _dom_utf8_to_ucs4(s, slen, &c, &clen);
- } else {
- err = _dom_utf16_to_ucs4(s, slen, &c, &clen);
- }
-
- if (err != CHARSET_OK) {
+ err = parserutils_charset_utf8_to_ucs4(s, slen, &c, &clen);
+ if (err != PARSERUTILS_OK) {
return (uint32_t) -1;
}
@@ -444,28 +241,25 @@ uint32_t dom_string_rindex(struct dom_string *str, uint32_t chr)
const uint8_t *s;
size_t clen, slen;
uint32_t c, index;
- charset_error err;
+ parserutils_error err;
+
+ if (str == NULL)
+ str = &empty_string;
- __dom_string_get_data(str, &s, &slen);
+ s = str->ptr;
+ slen = str->len;
index = dom_string_length(str);
while (slen > 0) {
- if (str->charset == DOM_STRING_UTF8) {
- err = _dom_utf8_prev(s, slen, &clen);
- if (err == CHARSET_OK) {
- err = _dom_utf8_to_ucs4(s + clen, slen - clen,
- &c, &clen);
- }
- } else {
- err = _dom_utf16_prev(s, slen, &clen);
- if (err == CHARSET_OK) {
- err = _dom_utf16_to_ucs4(s + clen, slen - clen,
- &c, &clen);
- }
+ err = parserutils_charset_utf8_prev(s, slen,
+ (uint32_t *) &clen);
+ if (err == PARSERUTILS_OK) {
+ err = parserutils_charset_utf8_to_ucs4(s + clen,
+ slen - clen, &c, &clen);
}
- if (err != CHARSET_OK) {
+ if (err != PARSERUTILS_OK) {
return (uint32_t) -1;
}
@@ -478,7 +272,6 @@ uint32_t dom_string_rindex(struct dom_string *str, uint32_t chr)
}
return (uint32_t) -1;
-
}
/**
@@ -489,20 +282,14 @@ uint32_t dom_string_rindex(struct dom_string *str, uint32_t chr)
*/
uint32_t dom_string_length(struct dom_string *str)
{
- const uint8_t *s;
- size_t slen;
- uint32_t clen;
- charset_error err;
-
- __dom_string_get_data(str, &s, &slen);
+ size_t clen;
+ parserutils_error err;
- if (str->charset == DOM_STRING_UTF8) {
- err = _dom_utf8_length(s, slen, &clen);
- } else {
- err = _dom_utf16_length(s, slen, &clen);
- }
+ if (str == NULL)
+ str = &empty_string;
- if (err != CHARSET_OK) {
+ err = parserutils_charset_utf8_length(str->ptr, str->len, &clen);
+ if (err != PARSERUTILS_OK) {
return 0;
}
@@ -527,60 +314,28 @@ dom_exception dom_string_concat(struct dom_string *s1, struct dom_string *s2,
struct dom_string **result)
{
struct dom_string *concat;
- const uint8_t *s;
- size_t slen;
- if (s1->type == DOM_STRING_PTR_NODOC) {
- concat = s1->ctx.nodoc.alloc(NULL,
- sizeof(struct dom_string), s1->ctx.nodoc.pw);
- } else {
- concat = dom_document_alloc(s1->ctx.doc,
- NULL, sizeof(struct dom_string));
- }
+ concat = s1->alloc(NULL, sizeof(struct dom_string), s1->pw);
if (concat == NULL) {
return DOM_NO_MEM_ERR;
}
- /** \todo support attempted concatenation of mismatched charsets */
+ concat->ptr = s1->alloc(NULL, s1->len + s2->len, s1->pw);
+ if (concat->ptr == NULL) {
+ s1->alloc(concat, 0, s1->pw);
- if (s1->type == DOM_STRING_PTR_NODOC) {
- concat->data.ptr = s1->ctx.nodoc.alloc(NULL,
- s1->len + s2->len, s1->ctx.nodoc.pw);
- } else {
- concat->data.ptr = dom_document_alloc(s1->ctx.doc,
- NULL, s1->len + s2->len);
- }
- if (concat->data.ptr == NULL) {
- if (s1->type == DOM_STRING_PTR_NODOC) {
- s1->ctx.nodoc.alloc(concat, 0, s1->ctx.nodoc.pw);
- } else {
- dom_document_alloc(s1->ctx.doc, concat, 0);
- }
return DOM_NO_MEM_ERR;
}
- concat->type = (s1->type == DOM_STRING_PTR_NODOC)
- ? DOM_STRING_PTR_NODOC : DOM_STRING_PTR;
-
- concat->charset = s1->charset;
-
- __dom_string_get_data(s1, &s, &slen);
+ memcpy(concat->ptr, s1->ptr, s1->len);
- memcpy(concat->data.ptr, s, slen);
-
- __dom_string_get_data(s2, &s, &slen);
-
- memcpy(concat->data.ptr + s1->len, s, slen);
+ memcpy(concat->ptr + s1->len, s2->ptr, s2->len);
concat->len = s1->len + s2->len;
- if (concat->type == DOM_STRING_PTR_NODOC) {
- concat->ctx.nodoc.alloc = s1->ctx.nodoc.alloc;
- concat->ctx.nodoc.pw = s1->ctx.nodoc.pw;
- } else {
- concat->ctx.doc = s1->ctx.doc;
- }
+ concat->alloc = s1->alloc;
+ concat->pw = s1->pw;
concat->refcnt = 1;
@@ -607,12 +362,10 @@ dom_exception dom_string_concat(struct dom_string *s1, struct dom_string *s2,
dom_exception dom_string_substr(struct dom_string *str,
uint32_t i1, uint32_t i2, struct dom_string **result)
{
- const uint8_t *s;
- size_t slen;
+ const uint8_t *s = str->ptr;
+ size_t slen = str->len;
size_t b1, b2;
- charset_error err;
-
- __dom_string_get_data(str, &s, &slen);
+ parserutils_error err;
/* Initialise the byte index of the start to 0 */
b1 = 0;
@@ -621,13 +374,9 @@ dom_exception dom_string_substr(struct dom_string *str,
/* Calculate the byte index of the start */
while (i1 > 0) {
- if (str->charset == DOM_STRING_UTF8) {
- err = _dom_utf8_next(s, slen - b1, b1, &b1);
- } else {
- err = _dom_utf16_next(s, slen - b1, b1, &b1);
- }
-
- if (err != CHARSET_OK) {
+ err = parserutils_charset_utf8_next(s, slen - b1, b1,
+ (uint32_t *) &b1);
+ if (err != PARSERUTILS_OK) {
return DOM_NO_MEM_ERR;
}
@@ -639,13 +388,10 @@ dom_exception dom_string_substr(struct dom_string *str,
/* Calculate the byte index of the end */
while (i2 > 0) {
- if (str->charset == DOM_STRING_UTF8) {
- err = _dom_utf8_next(s, slen - b2, b2, &b2);
- } else {
- err = _dom_utf16_next(s, slen - b2, b2, &b2);
- }
+ err = parserutils_charset_utf8_next(s, slen - b2, b2,
+ (uint32_t *) &b2);
- if (err != CHARSET_OK) {
+ if (err != PARSERUTILS_OK) {
return DOM_NO_MEM_ERR;
}
@@ -653,14 +399,7 @@ dom_exception dom_string_substr(struct dom_string *str,
}
/* Create a string from the specified byte range */
- return (str->type == DOM_STRING_PTR_NODOC)
- ? dom_string_create_from_ptr_no_doc(
- str->ctx.nodoc.alloc,
- str->ctx.nodoc.pw,
- str->charset,
- s + b1, b2 - b1, result)
- : dom_string_create_from_ptr(str->ctx.doc,
- s + b1, b2 - b1, result);
+ return dom_string_create(str->alloc, str->pw, s + b1, b2 - b1, result);
}
/**
@@ -688,11 +427,12 @@ dom_exception dom_string_insert(struct dom_string *target,
const uint8_t *t, *s;
uint32_t tlen, slen, clen;
uint32_t ins = 0;
- charset_error err;
-
- __dom_string_get_data(target, &t, &tlen);
+ parserutils_error err;
- __dom_string_get_data(source, &s, &slen);
+ t = target->ptr;
+ tlen = target->len;
+ s = source->ptr;
+ slen = source->len;
clen = dom_string_length(target);
@@ -706,13 +446,10 @@ dom_exception dom_string_insert(struct dom_string *target,
ins = tlen;
} else {
while (offset > 0) {
- if (target->charset == DOM_STRING_UTF8) {
- err = _dom_utf8_next(t, tlen - ins, ins, &ins);
- } else {
- err = _dom_utf16_next(t, tlen - ins, ins, &ins);
- }
+ err = parserutils_charset_utf8_next(t, tlen - ins,
+ ins, &ins);
- if (err != CHARSET_OK) {
+ if (err != PARSERUTILS_OK) {
return DOM_NO_MEM_ERR;
}
@@ -721,65 +458,36 @@ dom_exception dom_string_insert(struct dom_string *target,
}
/* Allocate result string */
- if (target->type == DOM_STRING_PTR_NODOC) {
- res = target->ctx.nodoc.alloc(NULL, sizeof(struct dom_string),
- target->ctx.nodoc.pw);
- } else {
- res = dom_document_alloc(target->ctx.doc,
- NULL, sizeof(struct dom_string));
- }
-
+ res = target->alloc(NULL, sizeof(struct dom_string), target->pw);
if (res == NULL) {
return DOM_NO_MEM_ERR;
}
- /** \todo support insertion of a string from a different charset */
-
/* Allocate data buffer for result contents */
- if (target->type == DOM_STRING_PTR_NODOC) {
- res->data.ptr = target->ctx.nodoc.alloc(NULL,
- tlen + slen, target->ctx.nodoc.pw);
- } else {
- res->data.ptr = dom_document_alloc(target->ctx.doc,
- NULL, tlen + slen);
- }
- if (res->data.ptr == NULL) {
- if (target->type == DOM_STRING_PTR_NODOC) {
- target->ctx.nodoc.alloc(res, 0, target->ctx.nodoc.pw);
- } else {
- dom_document_alloc(target->ctx.doc, res, 0);
- }
+ res->ptr = target->alloc(NULL, tlen + slen, target->pw);
+ if (res->ptr == NULL) {
+ target->alloc(res, 0, target->pw);
return DOM_NO_MEM_ERR;
}
- /* Populate result members */
- res->type = (target->type == DOM_STRING_PTR_NODOC)
- ? DOM_STRING_PTR_NODOC : DOM_STRING_PTR;
-
- res->charset = target->charset;
-
/* Copy initial portion of target, if any, into result */
if (ins > 0) {
- memcpy(res->data.ptr, t, ins);
+ memcpy(res->ptr, t, ins);
}
/* Copy inserted data into result */
- memcpy(res->data.ptr + ins, s, slen);
+ memcpy(res->ptr + ins, s, slen);
/* Copy remainder of target, if any, into result */
if (tlen - ins > 0) {
- memcpy(res->data.ptr + ins + slen, t + ins, tlen - ins);
+ memcpy(res->ptr + ins + slen, t + ins, tlen - ins);
}
res->len = tlen + slen;
- if (res->type == DOM_STRING_PTR_NODOC) {
- res->ctx.nodoc.alloc = target->ctx.nodoc.alloc;
- res->ctx.nodoc.pw = target->ctx.nodoc.pw;
- } else {
- res->ctx.doc = target->ctx.doc;
- }
-
+ res->alloc = target->alloc;
+ res->pw = target->pw;
+
res->refcnt = 1;
*result = res;
@@ -811,11 +519,12 @@ dom_exception dom_string_replace(struct dom_string *target,
const uint8_t *t, *s;
uint32_t tlen, slen;
uint32_t b1, b2;
- charset_error err;
-
- __dom_string_get_data(target, &t, &tlen);
+ parserutils_error err;
- __dom_string_get_data(source, &s, &slen);
+ t = target->ptr;
+ tlen = target->len;
+ s = source->ptr;
+ slen = source->len;
/* Initialise the byte index of the start to 0 */
b1 = 0;
@@ -824,13 +533,9 @@ dom_exception dom_string_replace(struct dom_string *target,
/* Calculate the byte index of the start */
while (i1 > 0) {
- if (target->charset == DOM_STRING_UTF8) {
- err = _dom_utf8_next(s, slen - b1, b1, &b1);
- } else {
- err = _dom_utf16_next(s, slen - b1, b1, &b1);
- }
+ err = parserutils_charset_utf8_next(s, slen - b1, b1, &b1);
- if (err != CHARSET_OK) {
+ if (err != PARSERUTILS_OK) {
return DOM_NO_MEM_ERR;
}
@@ -842,13 +547,9 @@ dom_exception dom_string_replace(struct dom_string *target,
/* Calculate the byte index of the end */
while (i2 > 0) {
- if (target->charset == DOM_STRING_UTF8) {
- err = _dom_utf8_next(s, slen - b2, b2, &b2);
- } else {
- err = _dom_utf16_next(s, slen - b2, b2, &b2);
- }
+ err = parserutils_charset_utf8_next(s, slen - b2, b2, &b2);
- if (err != CHARSET_OK) {
+ if (err != PARSERUTILS_OK) {
return DOM_NO_MEM_ERR;
}
@@ -856,66 +557,38 @@ dom_exception dom_string_replace(struct dom_string *target,
}
/* Allocate result string */
- if (target->type == DOM_STRING_PTR_NODOC) {
- res = target->ctx.nodoc.alloc(NULL, sizeof(struct dom_string),
- target->ctx.nodoc.pw);
- } else {
- res = dom_document_alloc(target->ctx.doc,
- NULL, sizeof(struct dom_string));
- }
+ res = target->alloc(NULL, sizeof(struct dom_string), target->pw);
if (res == NULL) {
return DOM_NO_MEM_ERR;
}
- /** \todo support insertion of a string from a different charset */
-
/* Allocate data buffer for result contents */
- if (target->type == DOM_STRING_PTR_NODOC) {
- res->data.ptr = target->ctx.nodoc.alloc(NULL,
- tlen + slen - (b2 - b1), target->ctx.nodoc.pw);
- } else {
- res->data.ptr = dom_document_alloc(target->ctx.doc,
- NULL, tlen + slen - (b2 - b1));
- }
- if (res->data.ptr == NULL) {
- if (target->type == DOM_STRING_PTR_NODOC) {
- target->ctx.nodoc.alloc(res, 0, target->ctx.nodoc.pw);
- } else {
- dom_document_alloc(target->ctx.doc, res, 0);
- }
+ res->ptr = target->alloc(NULL, tlen + slen - (b2 - b1), target->pw);
+ if (res->ptr == NULL) {
+ target->alloc(res, 0, target->pw);
return DOM_NO_MEM_ERR;
}
- /* Populate result members */
- res->type = (target->type == DOM_STRING_PTR_NODOC)
- ? DOM_STRING_PTR_NODOC : DOM_STRING_PTR;
-
- res->charset = target->charset;
-
/* Copy initial portion of target, if any, into result */
if (b1 > 0) {
- memcpy(res->data.ptr, t, b1);
+ memcpy(res->ptr, t, b1);
}
/* Copy replacement data into result */
if (slen > 0) {
- memcpy(res->data.ptr + b1, s, slen);
+ memcpy(res->ptr + b1, s, slen);
}
/* Copy remainder of target, if any, into result */
if (tlen - b2 > 0) {
- memcpy(res->data.ptr + b1 + slen, t + b2, tlen - b2);
+ memcpy(res->ptr + b1 + slen, t + b2, tlen - b2);
}
res->len = tlen + slen - (b2 - b1);
- if (res->type == DOM_STRING_PTR_NODOC) {
- res->ctx.nodoc.alloc = target->ctx.nodoc.alloc;
- res->ctx.nodoc.pw = target->ctx.nodoc.pw;
- } else {
- res->ctx.doc = target->ctx.doc;
- }
+ res->alloc = target->alloc;
+ res->pw = target->pw;
res->refcnt = 1;
@@ -940,19 +613,8 @@ dom_exception dom_string_replace(struct dom_string *target,
dom_exception dom_string_dup(struct dom_string *str,
struct dom_string **result)
{
- const uint8_t *s;
- size_t slen;
-
- __dom_string_get_data(str, &s, &slen);
-
- return str->type == DOM_STRING_PTR_NODOC
- ? dom_string_create_from_ptr_no_doc(
- str->ctx.nodoc.alloc,
- str->ctx.nodoc.pw,
- str->charset,
- s, slen, result)
- : dom_string_create_from_ptr(str->ctx.doc,
- s, slen, result);
+ return dom_string_create(str->alloc, str->pw, str->ptr, str->len,
+ result);
}
/**
@@ -963,12 +625,10 @@ dom_exception dom_string_dup(struct dom_string *str,
*/
uint32_t dom_string_hash(struct dom_string *str)
{
- const uint8_t *s;
- size_t slen;
+ const uint8_t *s = str->ptr;
+ size_t slen = str->len;
uint32_t hash = 0x01000193;
- __dom_string_get_data(str, &s, &slen);
-
while (slen > 0) {
hash *= 0x01000193;
hash ^= *s;
@@ -980,47 +640,3 @@ uint32_t dom_string_hash(struct dom_string *str)
return hash;
}
-/* */
-/*---------------------------------------------------------------------------*/
-/* */
-
-/**
- * Get a pointer to the string of characters within a DOM string
- *
- * \param str Pointer to DOM string to retrieve pointer from
- * \param data Pointer to location to receive data
- * \param len Pointer to location to receive byte length of data
- * \return DOM_NO_ERR on success
- *
- * The caller must have previously claimed a reference on the DOM string.
- * The returned pointer must not be freed.
- */
-dom_exception __dom_string_get_data(struct dom_string *str,
- const uint8_t **data, size_t *len)
-{
- /* Assume that a NULL str pointer indicates the empty string */
- if (str == NULL)
- str = &empty_string;
-
- switch (str->type) {
- case DOM_STRING_PTR:
- *data = str->data.ptr;
- break;
- case DOM_STRING_CONST_PTR:
- *data = str->data.cptr;
- break;
- case DOM_STRING_OFFSET:
- *data = dom_document_get_base(str->ctx.doc) +
- str->data.offset;
- break;
- case DOM_STRING_PTR_NODOC:
- *data = str->data.ptr;
- break;
- }
-
- *len = str->len;
-
- return DOM_NO_ERR;
-}
-
-
diff --git a/src/utils/Makefile b/src/utils/Makefile
index ac87ded..29369ae 100644
--- a/src/utils/Makefile
+++ b/src/utils/Makefile
@@ -22,7 +22,7 @@
CFLAGS += -I$(CURDIR)
# Objects
-OBJS = namespace utf8 utf16
+OBJS = namespace
.PHONY: clean debug distclean export release setup test
diff --git a/src/utils/namespace.c b/src/utils/namespace.c
index 9c0d214..8002b8e 100644
--- a/src/utils/namespace.c
+++ b/src/utils/namespace.c
@@ -32,14 +32,13 @@ dom_exception _dom_namespace_initialise(dom_alloc alloc, void *pw)
{
dom_exception err;
- err = dom_string_create_from_ptr_no_doc(alloc, pw,
- DOM_STRING_UTF8, (const uint8_t *) "xml", SLEN("xml"), &xml);
+ err = dom_string_create(alloc, pw,
+ (const uint8_t *) "xml", SLEN("xml"), &xml);
if (err != DOM_NO_ERR) {
return err;
}
- err = dom_string_create_from_ptr_no_doc(alloc, pw,
- DOM_STRING_UTF8,
+ err = dom_string_create(alloc, pw,
(const uint8_t *) "http://www.w3.org/XML/1998/namespace",
SLEN("http://www.w3.org/XML/1998/namespace"),
&xml_ns);
@@ -48,8 +47,7 @@ dom_exception _dom_namespace_initialise(dom_alloc alloc, void *pw)
return err;
}
- err = dom_string_create_from_ptr_no_doc(alloc, pw,
- DOM_STRING_UTF8,
+ err = dom_string_create(alloc, pw,
(const uint8_t *) "xmlns", SLEN("xmlns"), &xmlns);
if (err != DOM_NO_ERR) {
dom_string_unref(xml_ns);
@@ -57,8 +55,7 @@ dom_exception _dom_namespace_initialise(dom_alloc alloc, void *pw)
return err;
}
- err = dom_string_create_from_ptr_no_doc(alloc, pw,
- DOM_STRING_UTF8,
+ err = dom_string_create(alloc, pw,
(const uint8_t *) "http://www.w3.org/2000/xmlns",
SLEN("http://www.w3.org/2000/xmlns"),
&xmlns_ns);
diff --git a/src/utils/utf16.c b/src/utils/utf16.c
deleted file mode 100644
index 8917328..0000000
--- a/src/utils/utf16.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-/** \file
- * UTF-16 manipulation functions (implementation).
- */
-
-#include <stdbool.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "utils/utf16.h"
-
-/**
- * Convert a UTF-16 sequence into a single UCS4 character
- *
- * \param s The sequence to process
- * \param len Length of sequence
- * \param ucs4 Pointer to location to receive UCS4 character (host endian)
- * \param clen Pointer to location to receive byte length of UTF-16 sequence
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_to_ucs4(const uint8_t *s, size_t len,
- uint32_t *ucs4, size_t *clen)
-{
- const uint16_t *ss = (const uint16_t *) (const void *) s;
-
- if (s == NULL || ucs4 == NULL || clen == NULL)
- return CHARSET_BADPARM;
-
- if (len < 2)
- return CHARSET_NEEDDATA;
-
- if (*ss < 0xD800 || *ss > 0xDFFF) {
- *ucs4 = *ss;
- *clen = 2;
- } else if (0xD800 <= *ss && *ss <= 0xBFFF) {
- if (len < 4)
- return CHARSET_NEEDDATA;
-
- if (0xDC00 <= ss[1] && ss[1] <= 0xE000) {
- *ucs4 = (((s[0] >> 6) & 0x1f) + 1) |
- ((s[0] & 0x3f) | (s[1] & 0x3ff));
- *clen = 4;
- } else {
- return CHARSET_INVALID;
- }
- }
-
- return CHARSET_OK;
-}
-
-/**
- * Convert a single UCS4 character into a UTF-16 sequence
- *
- * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
- * \param s Pointer to 4 byte long output buffer
- * \param len Pointer to location to receive length of multibyte sequence
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
- size_t *len)
-{
- uint16_t *ss = (uint16_t *) (void *) s;
- uint32_t l = 0;
-
- if (s == NULL || len == NULL)
- return CHARSET_BADPARM;
- else if (ucs4 < 0x10000) {
- *ss = (uint16_t) ucs4;
- l = 2;
- } else if (ucs4 < 0x110000) {
- ss[0] = 0xD800 | (((ucs4 >> 16) & 0x1f) - 1) | (ucs4 >> 10);
- ss[1] = 0xDC00 | (ucs4 & 0x3ff);
- l = 4;
- } else {
- return CHARSET_INVALID;
- }
-
- *len = l;
-
- return CHARSET_OK;
-}
-
-/**
- * Calculate the length (in characters) of a bounded UTF-16 string
- *
- * \param s The string
- * \param max Maximum length
- * \param len Pointer to location to receive length of string
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_length(const uint8_t *s, size_t max,
- size_t *len)
-{
- const uint16_t *ss = (const uint16_t *) (const void *) s;
- const uint16_t *end = (const uint16_t *) (const void *) (s + max);
- int l = 0;
-
- if (s == NULL || len == NULL)
- return CHARSET_BADPARM;
-
- while (ss < end) {
- if (*ss < 0xD800 || 0xDFFF < *ss)
- ss++;
- else
- ss += 2;
-
- l++;
- }
-
- *len = l;
-
- return CHARSET_OK;
-}
-
-/**
- * Calculate the length (in bytes) of a UTF-16 character
- *
- * \param s Pointer to start of character
- * \param len Pointer to location to receive length
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_char_byte_length(const uint8_t *s,
- size_t *len)
-{
- const uint16_t *ss = (const uint16_t *) (const void *) s;
-
- if (s == NULL || len == NULL)
- return CHARSET_BADPARM;
-
- if (*ss < 0xD800 || 0xDFFF < *ss)
- *len = 2;
- else
- *len = 4;
-
- return CHARSET_OK;
-}
-
-/**
- * Find previous legal UTF-16 char in string
- *
- * \param s The string
- * \param off Offset in the string to start at
- * \param prevoff Pointer to location to receive offset of first byte of
- * previous legal character
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_prev(const uint8_t *s, uint32_t off,
- uint32_t *prevoff)
-{
- const uint16_t *ss = (const uint16_t *) (const void *) s;
-
- if (s == NULL || prevoff == NULL)
- return CHARSET_BADPARM;
-
- if (off < 2)
- *prevoff = 0;
- else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF)
- *prevoff = off - 2;
- else
- *prevoff = (off < 4) ? 0 : off - 4;
-
- return CHARSET_OK;
-}
-
-/**
- * Find next legal UTF-16 char in string
- *
- * \param s The string (assumed valid)
- * \param len Maximum offset in string
- * \param off Offset in the string to start at
- * \param nextoff Pointer to location to receive offset of first byte of
- * next legal character
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_next(const uint8_t *s, uint32_t len,
- uint32_t off, uint32_t *nextoff)
-{
- const uint16_t *ss = (const uint16_t *) (const void *) s;
-
- if (s == NULL || off >= len || nextoff == NULL)
- return CHARSET_BADPARM;
-
- if (len - off < 4)
- *nextoff = len;
- else if (ss[1] < 0xD800 || ss[1] > 0xDBFF)
- *nextoff = off + 2;
- else
- *nextoff = (len - off < 6) ? len : off + 4;
-
- return CHARSET_OK;
-}
-
-/**
- * Find next legal UTF-16 char in string
- *
- * \param s The string (assumed to be of dubious validity)
- * \param len Maximum offset in string
- * \param off Offset in the string to start at
- * \param nextoff Pointer to location to receive offset of first byte of
- * next legal character
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf16_next_paranoid(const uint8_t *s,
- uint32_t len, uint32_t off, uint32_t *nextoff)
-{
- const uint16_t *ss = (const uint16_t *) (const void *) s;
-
- if (s == NULL || off >= len || nextoff == NULL)
- return CHARSET_BADPARM;
-
- while (1) {
- if (len - off < 4) {
- return CHARSET_NEEDDATA;
- } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) {
- *nextoff = off + 2;
- break;
- } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) {
- if (len - off < 6)
- return CHARSET_NEEDDATA;
-
- if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) {
- *nextoff = off + 4;
- break;
- } else {
- ss++;
- off += 2;
- }
- }
- }
-
- return CHARSET_OK;
-}
-
diff --git a/src/utils/utf16.h b/src/utils/utf16.h
deleted file mode 100644
index 7b9e15f..0000000
--- a/src/utils/utf16.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * This file is part of libdom.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-/** \file
- * UTF-16 manipulation functions (interface).
- */
-
-#ifndef dom_utils_utf16_h_
-#define dom_utils_utf16_h_
-
-#include <inttypes.h>
-
-#include "utils/charset_errors.h"
-
-inline charset_error _dom_utf16_to_ucs4(const uint8_t *s, size_t len,
- uint32_t *ucs4, size_t *clen);
-inline charset_error _dom_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
- size_t *len);
-
-inline charset_error _dom_utf16_length(const uint8_t *s, size_t max,
- size_t *len);
-inline charset_error _dom_utf16_char_byte_length(const uint8_t *s,
- size_t *len);
-
-inline charset_error _dom_utf16_prev(const uint8_t *s, uint32_t off,
- uint32_t *prevoff);
-inline charset_error _dom_utf16_next(const uint8_t *s, uint32_t len,
- uint32_t off, uint32_t *nextoff);
-
-inline charset_error _dom_utf16_next_paranoid(const uint8_t *s,
- uint32_t len, uint32_t off, uint32_t *nextoff);
-
-#endif
-
diff --git a/src/utils/utf8.c b/src/utils/utf8.c
deleted file mode 100644
index b80f04e..0000000
--- a/src/utils/utf8.c
+++ /dev/null
@@ -1,368 +0,0 @@
-/*
- * This file is part of libdom.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-/** \file
- * UTF-8 manipulation functions (implementation).
- */
-
-#include <stdbool.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "utils/utf8.h"
-
-/** Number of continuation bytes for a given start byte */
-static const uint8_t numContinuations[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
-};
-
-/**
- * Convert a UTF-8 multibyte sequence into a single UCS4 character
- *
- * Encoding of UCS values outside the UTF-16 plane has been removed from
- * RFC3629. This function conforms to RFC2279, however.
- *
- * \param s The sequence to process
- * \param len Length of sequence
- * \param ucs4 Pointer to location to receive UCS4 character (host endian)
- * \param clen Pointer to location to receive byte length of UTF-8 sequence
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_to_ucs4(const uint8_t *s, size_t len,
- uint32_t *ucs4, size_t *clen)
-{
- if (s == NULL || ucs4 == NULL || clen == NULL)
- return CHARSET_BADPARM;
-
- if (len == 0)
- return CHARSET_NEEDDATA;
-
- if (*s < 0x80) {
- *ucs4 = *s;
- *clen = 1;
- } else if ((*s & 0xE0) == 0xC0) {
- if (len < 2)
- return CHARSET_NEEDDATA;
- else if ((*(s+1) & 0xC0) != 0x80)
- return CHARSET_INVALID;
- else {
- *ucs4 = ((*s & 0x1F) << 6) | (*(s+1) & 0x3F);
- *clen = 2;
- }
- } else if ((*s & 0xF0) == 0xE0) {
- if (len < 3)
- return CHARSET_NEEDDATA;
- else if ((*(s+1) & 0xC0) != 0x80 ||
- (*(s+2) & 0xC0) != 0x80)
- return CHARSET_INVALID;
- else {
- *ucs4 = ((*s & 0x0F) << 12) |
- ((*(s+1) & 0x3F) << 6) |
- (*(s+2) & 0x3F);
- *clen = 3;
- }
- } else if ((*s & 0xF8) == 0xF0) {
- if (len < 4)
- return CHARSET_NEEDDATA;
- else if ((*(s+1) & 0xC0) != 0x80 ||
- (*(s+2) & 0xC0) != 0x80 ||
- (*(s+3) & 0xC0) != 0x80)
- return CHARSET_INVALID;
- else {
- *ucs4 = ((*s & 0x0F) << 18) |
- ((*(s+1) & 0x3F) << 12) |
- ((*(s+2) & 0x3F) << 6) |
- (*(s+3) & 0x3F);
- *clen = 4;
- }
- } else if ((*s & 0xFC) == 0xF8) {
- if (len < 5)
- return CHARSET_NEEDDATA;
- else if ((*(s+1) & 0xC0) != 0x80 ||
- (*(s+2) & 0xC0) != 0x80 ||
- (*(s+3) & 0xC0) != 0x80 ||
- (*(s+4) & 0xC0) != 0x80)
- return CHARSET_INVALID;
- else {
- *ucs4 = ((*s & 0x0F) << 24) |
- ((*(s+1) & 0x3F) << 18) |
- ((*(s+2) & 0x3F) << 12) |
- ((*(s+3) & 0x3F) << 6) |
- (*(s+4) & 0x3F);
- *clen = 5;
- }
- } else if ((*s & 0xFE) == 0xFC) {
- if (len < 6)
- return CHARSET_NEEDDATA;
- else if ((*(s+1) & 0xC0) != 0x80 ||
- (*(s+2) & 0xC0) != 0x80 ||
- (*(s+3) & 0xC0) != 0x80 ||
- (*(s+4) & 0xC0) != 0x80 ||
- (*(s+5) & 0xC0) != 0x80)
- return CHARSET_INVALID;
- else {
- *ucs4 = ((*s & 0x0F) << 28) |
- ((*(s+1) & 0x3F) << 24) |
- ((*(s+2) & 0x3F) << 18) |
- ((*(s+3) & 0x3F) << 12) |
- ((*(s+4) & 0x3F) << 6) |
- (*(s+5) & 0x3F);
- *clen = 6;
- }
- } else {
- return CHARSET_INVALID;
- }
-
- return CHARSET_OK;
-}
-
-/**
- * Convert a single UCS4 character into a UTF-8 multibyte sequence
- *
- * Encoding of UCS values outside the UTF-16 plane has been removed from
- * RFC3629. This function conforms to RFC2279, however.
- *
- * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
- * \param s Pointer to 6 byte long output buffer
- * \param len Pointer to location to receive length of multibyte sequence
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
- size_t *len)
-{
- uint32_t l = 0;
-
- if (s == NULL || len == NULL)
- return CHARSET_BADPARM;
- else if (ucs4 < 0x80) {
- *s = (uint8_t) ucs4;
- l = 1;
- } else if (ucs4 < 0x800) {
- *s = 0xC0 | ((ucs4 >> 6) & 0x1F);
- *(s+1) = 0x80 | (ucs4 & 0x3F);
- l = 2;
- } else if (ucs4 < 0x10000) {
- *s = 0xE0 | ((ucs4 >> 12) & 0xF);
- *(s+1) = 0x80 | ((ucs4 >> 6) & 0x3F);
- *(s+2) = 0x80 | (ucs4 & 0x3F);
- l = 3;
- } else if (ucs4 < 0x200000) {
- *s = 0xF0 | ((ucs4 >> 18) & 0x7);
- *(s+1) = 0x80 | ((ucs4 >> 12) & 0x3F);
- *(s+2) = 0x80 | ((ucs4 >> 6) & 0x3F);
- *(s+3) = 0x80 | (ucs4 & 0x3F);
- l = 4;
- } else if (ucs4 < 0x4000000) {
- *s = 0xF8 | ((ucs4 >> 24) & 0x3);
- *(s+1) = 0x80 | ((ucs4 >> 18) & 0x3F);
- *(s+2) = 0x80 | ((ucs4 >> 12) & 0x3F);
- *(s+3) = 0x80 | ((ucs4 >> 6) & 0x3F);
- *(s+4) = 0x80 | (ucs4 & 0x3F);
- l = 5;
- } else if (ucs4 <= 0x7FFFFFFF) {
- *s = 0xFC | ((ucs4 >> 30) & 0x1);
- *(s+1) = 0x80 | ((ucs4 >> 24) & 0x3F);
- *(s+2) = 0x80 | ((ucs4 >> 18) & 0x3F);
- *(s+3) = 0x80 | ((ucs4 >> 12) & 0x3F);
- *(s+4) = 0x80 | ((ucs4 >> 6) & 0x3F);
- *(s+5) = 0x80 | (ucs4 & 0x3F);
- l = 6;
- } else {
- return CHARSET_INVALID;
- }
-
- *len = l;
-
- return CHARSET_OK;
-}
-
-/**
- * Calculate the length (in characters) of a bounded UTF-8 string
- *
- * \param s The string
- * \param max Maximum length
- * \param len Pointer to location to receive length of string
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_length(const uint8_t *s, size_t max,
- size_t *len)
-{
- const uint8_t *end = s + max;
- int l = 0;
-
- if (s == NULL || len == NULL)
- return CHARSET_BADPARM;
-
- while (s < end) {
- if ((*s & 0x80) == 0x00)
- s += 1;
- else if ((*s & 0xE0) == 0xC0)
- s += 2;
- else if ((*s & 0xF0) == 0xE0)
- s += 3;
- else if ((*s & 0xF8) == 0xF0)
- s += 4;
- else if ((*s & 0xFC) == 0xF8)
- s += 5;
- else if ((*s & 0xFE) == 0xFC)
- s += 6;
- else
- return CHARSET_INVALID;
- l++;
- }
-
- *len = l;
-
- return CHARSET_OK;
-}
-
-/**
- * Calculate the length (in bytes) of a UTF-8 character
- *
- * \param s Pointer to start of character
- * \param len Pointer to location to receive length
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_char_byte_length(const uint8_t *s,
- size_t *len)
-{
- if (s == NULL || len == NULL)
- return CHARSET_BADPARM;
-
- *len = numContinuations[s[0]] + 1 /* Start byte */;
-
- return CHARSET_OK;
-}
-
-/**
- * Find previous legal UTF-8 char in string
- *
- * \param s The string
- * \param off Offset in the string to start at
- * \param prevoff Pointer to location to receive offset of first byte of
- * previous legal character
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_prev(const uint8_t *s, uint32_t off,
- uint32_t *prevoff)
-{
- if (s == NULL || prevoff == NULL)
- return CHARSET_BADPARM;
-
- while (off != 0 && (s[--off] & 0xC0) == 0x80)
- /* do nothing */;
-
- *prevoff = off;
-
- return CHARSET_OK;
-}
-
-/**
- * Find next legal UTF-8 char in string
- *
- * \param s The string (assumed valid)
- * \param len Maximum offset in string
- * \param off Offset in the string to start at
- * \param nextoff Pointer to location to receive offset of first byte of
- * next legal character
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_next(const uint8_t *s, uint32_t len,
- uint32_t off, uint32_t *nextoff)
-{
- if (s == NULL || off >= len || nextoff == NULL)
- return CHARSET_BADPARM;
-
- /* Skip current start byte (if present - may be mid-sequence) */
- if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0)
- off++;
-
- while (off < len && (s[off] & 0xC0) == 0x80)
- off++;
-
- *nextoff = off;
-
- return CHARSET_OK;
-}
-
-/**
- * Find next legal UTF-8 char in string
- *
- * \param s The string (assumed to be of dubious validity)
- * \param len Maximum offset in string
- * \param off Offset in the string to start at
- * \param nextoff Pointer to location to receive offset of first byte of
- * next legal character
- * \return CHARSET_OK on success, appropriate error otherwise
- */
-inline charset_error _dom_utf8_next_paranoid(const uint8_t *s, uint32_t len,
- uint32_t off, uint32_t *nextoff)
-{
- bool valid;
-
- if (s == NULL || off >= len || nextoff == NULL)
- return CHARSET_BADPARM;
-
- /* Skip current start byte (if present - may be mid-sequence) */
- if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0)
- off++;
-
- while (1) {
- /* Find next possible start byte */
- while (off < len && (s[off] & 0xC0) == 0x80)
- off++;
-
- /* Ran off end of data */
- if (off == len || off + numContinuations[s[off]] >= len)
- return CHARSET_NEEDDATA;
-
- /* Found if start byte is ascii,
- * or next n bytes are valid continuations */
- valid = true;
-
- switch (numContinuations[s[off]]) {
- case 5:
- valid &= ((s[off + 5] & 0xC0) == 0x80);
- case 4:
- valid &= ((s[off + 4] & 0xC0) == 0x80);
- case 3:
- valid &= ((s[off + 3] & 0xC0) == 0x80);
- case 2:
- valid &= ((s[off + 2] & 0xC0) == 0x80);
- case 1:
- valid &= ((s[off + 1] & 0xC0) == 0x80);
- case 0:
- valid &= (s[off + 0] < 0x80);
- }
-
- if (valid)
- break;
-
- /* Otherwise, skip this (invalid) start byte and try again */
- off++;
- }
-
- *nextoff = off;
-
- return CHARSET_OK;
-}
-
diff --git a/src/utils/utf8.h b/src/utils/utf8.h
deleted file mode 100644
index 154dbb8..0000000
--- a/src/utils/utf8.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * This file is part of libdom.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-/** \file
- * UTF-8 manipulation functions (interface).
- */
-
-#ifndef dom_utils_utf8_h_
-#define dom_utils_utf8_h_
-
-#include <inttypes.h>
-
-#include "utils/charset_errors.h"
-
-inline charset_error _dom_utf8_to_ucs4(const uint8_t *s, size_t len,
- uint32_t *ucs4, size_t *clen);
-inline charset_error _dom_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
- size_t *len);
-
-inline charset_error _dom_utf8_length(const uint8_t *s, size_t max,
- size_t *len);
-inline charset_error _dom_utf8_char_byte_length(const uint8_t *s,
- size_t *len);
-
-inline charset_error _dom_utf8_prev(const uint8_t *s, uint32_t off,
- uint32_t *prevoff);
-inline charset_error _dom_utf8_next(const uint8_t *s, uint32_t len,
- uint32_t off, uint32_t *nextoff);
-
-inline charset_error _dom_utf8_next_paranoid(const uint8_t *s, uint32_t len,
- uint32_t off, uint32_t *nextoff);
-
-#endif
-