/* * This file is part of Hubbub. * Licensed under the MIT License, * http://www.opensource.org/licenses/mit-license.php * * Copyright 2008 Andrew Sidwell * Copyright 2008 John-Mark Bell */ #define _GNU_SOURCE /* for strndup */ #include #include #include #include #include #include #include #define UNUSED(x) ((x)=(x)) /** * Source of encoding information */ typedef enum encoding_source { ENCODING_SOURCE_HEADER, ENCODING_SOURCE_DETECTED, ENCODING_SOURCE_META } encoding_source; /** * Our context */ typedef struct context { hubbub_parser *parser; /**< Underlying parser */ htmlDocPtr document; /**< Document we're building */ const char *encoding; /**< The charset of the input */ encoding_source enc_source; /**< The encoding source */ #define NUM_NAMESPACES (6) xmlNsPtr namespaces[NUM_NAMESPACES]; /**< XML namespaces */ #undef NUM_NAMESPACES hubbub_tree_handler tree_handler; /**< Hubbub tree callbacks */ } context; /** * Mapping of namespace prefixes to URIs, indexed by hubbub_ns. */ static struct { const char *prefix; const char *url; } namespaces[] = { { NULL, NULL }, { NULL, "http://www.w3.org/1999/xhtml" }, { "math", "http://www.w3.org/1998/Math/MathML" }, { "svg", "http://www.w3.org/2000/svg" }, { "xlink", "http://www.w3.org/1999/xlink" }, /** \todo Oh dear. LibXML2 refuses to create any namespace with a * prefix of "xml". That sucks, royally. */ { "xml", "http://www.w3.org/XML/1998/namespace" }, { "xmlns", "http://www.w3.org/2000/xmlns/" } }; static inline char *c_string_from_hubbub_string(context *ctx, const hubbub_string *str); static void create_namespaces(context *ctx, xmlNode *root); static int create_comment(void *ctx, const hubbub_string *data, void **result); static int create_doctype(void *ctx, const hubbub_doctype *doctype, void **result); static int create_element(void *ctx, const hubbub_tag *tag, void **result); static int create_text(void *ctx, const hubbub_string *data, void **result); static int ref_node(void *ctx, void *node); static int unref_node(void *ctx, void *node); static int append_child(void *ctx, void *parent, void *child, void **result); static int insert_before(void *ctx, void *parent, void *child, void *ref_child, void **result); static int remove_child(void *ctx, void *parent, void *child, void **result); static int clone_node(void *ctx, void *node, bool deep, void **result); static int reparent_children(void *ctx, void *node, void *new_parent); static int get_parent(void *ctx, void *node, bool element_only, void **result); static int has_children(void *ctx, void *node, bool *result); static int form_associate(void *ctx, void *form, void *node); static int add_attributes(void *ctx, void *node, const hubbub_attribute *attributes, uint32_t n_attributes); static int set_quirks_mode(void *ctx, hubbub_quirks_mode mode); static int change_encoding(void *ctx, const char *charset); /* Prototype tree handler struct */ static hubbub_tree_handler tree_handler = { create_comment, create_doctype, create_element, create_text, ref_node, unref_node, append_child, insert_before, remove_child, clone_node, reparent_children, get_parent, has_children, form_associate, add_attributes, set_quirks_mode, change_encoding, NULL }; /** * Memory allocation callback. * * \param ptr Pointer to block to reallocate, or NULL for a new allocation * \param len Required length, in bytes. If zero, then free the block * \param pw Pointer to our private data * \return Pointer to resized block */ static void *myrealloc(void *ptr, size_t len, void *pw) { /* In this implementation, we just call realloc. * If we have more complex allocation requirements (e.g. multiple * allocation arenas, then we could use pw to point to the arena to use) */ UNUSED(pw); return realloc(ptr, len); } /**************** TODO: Sort this out already *********************************/ int main(int argc, char **argv) { } binding_error binding_create_tree(void *arena, const char *charset, void **ctx) { context *c; hubbub_parser_optparams params; uint32_t i; hubbub_error error; c = malloc(sizeof(context)); if (c == NULL) return BINDING_NOMEM; c->parser = NULL; c->encoding = charset; c->encoding_source = ENCODING_SOURCE_HEADER; c->document = NULL; c->owns_doc = true; error = hubbub_parser_create(charset, true, myrealloc, arena, &c->parser); if (error != HUBBUB_OK) { free(c); if (error == HUBBUB_BADENCODING) return BINDING_BADENCODING; else return BINDING_NOMEM; /* Assume OOM */ } c->document = htmlNewDocNoDtD(NULL, NULL); if (c->document == NULL) { hubbub_parser_destroy(c->parser); free(c); return BINDING_NOMEM; } c->document->_private = (void *) 0; for (i = 0; i < sizeof(c->namespaces) / sizeof(c->namespaces[0]); i++) { c->namespaces[i] = NULL; } c->tree_handler = tree_handler; c->tree_handler.ctx = (void *) c; params.tree_handler = &c->tree_handler; hubbub_parser_setopt(c->parser, HUBBUB_PARSER_TREE_HANDLER, ¶ms); ref_node(c, c->document); params.document_node = c->document; hubbub_parser_setopt(c->parser, HUBBUB_PARSER_DOCUMENT_NODE, ¶ms); *ctx = (void *) c; return BINDING_OK; } binding_error binding_destroy_tree(void *ctx) { context *c = (context *) ctx; if (ctx == NULL) return BINDING_OK; if (c->parser != NULL) hubbub_parser_destroy(c->parser); if (c->owns_doc) xmlFreeDoc(c->document); c->parser = NULL; c->encoding = NULL; c->document = NULL; free(c); return BINDING_OK; } binding_error binding_parse_chunk(void *ctx, const uint8_t *data, size_t len) { context *c = (context *) ctx; hubbub_error err; err = hubbub_parser_parse_chunk(c->parser, (uint8_t *) data, len); if (err == HUBBUB_ENCODINGCHANGE) return BINDING_ENCODINGCHANGE; return BINDING_OK; } binding_error binding_parse_completed(void *ctx) { context *c = (context *) ctx; hubbub_error error; error = hubbub_parser_completed(c->parser); /** \todo error handling */ return BINDING_OK; } /****************************************************************************** * Helper functions for tree building * ******************************************************************************/ /** * Convert a hubbub string to a C string * * \param ctx Our context * \param str The string to convert * \return Pointer to C string, must be freed * * This is a simple utility routine, as libXML expects data to be C strings. * If we were implementing our own tree, we might store hubbub-style strings * instead (with the associated memory saving) */ char *c_string_from_hubbub_string(context *ctx, const hubbub_string *str) { return strndup((const char *) str->ptr, (int) str->len); } /** * Initialise a context's XML namespaces * * \param ctx Our context * \param root The root node of the XML tree * * Again, this is specific to the needs of libXML. */ void create_namespaces(context *ctx, xmlNode *root) { uint32_t i; /* Index 0 is the NULL namespace, so skip over it */ for (i = 1; i < sizeof(namespaces) / sizeof(namespaces[0]); i++) { ctx->namespaces[i - 1] = xmlNewNs(root, BAD_CAST namespaces[i].url, BAD_CAST namespaces[i].prefix); /* Expect "xml" to fail here */ if (ctx->namespaces[i - 1] == NULL) { LOG(("Failed creating namespace %s\n", namespaces[i].prefix)); } } } /****************************************************************************** * Tree callbacks for hubbub * ******************************************************************************/ /** * Create a comment node * * \param ctx Our context * \param data The comment body * \param result Location to receive manufactured node * \return 0 on success, 1 on memory exhaustion * * Postcondition: if successful, result's reference count must be 1. */ int create_comment(void *ctx, const hubbub_string *data, void **result) { context *c = (context *) ctx; char *content; xmlNodePtr n; content = c_string_from_hubbub_string(c, data); if (content == NULL) return 1; n = xmlNewDocComment(c->document, BAD_CAST content); if (n == NULL) { free(content); return 1; } /* We use the _private field of libXML's xmlNode struct for the * reference count. */ n->_private = (void *) (uintptr_t) 1; free(content); *result = (void *) n; return 0; } /** * Create a doctype node * * \param ctx Our context * \param doctype Data for doctype node (public ID and system ID) * \param result Location to receive manufactured node * \return 0 on success, 1 on memory exhaustion * * Postcondition: if successful, result's reference count must be 1. */ int create_doctype(void *ctx, const hubbub_doctype *doctype, void **result) { context *c = (context *) ctx; char *name, *public = NULL, *system = NULL; xmlDtdPtr n; name = c_string_from_hubbub_string(c, &doctype->name); if (name == NULL) return 1; /* May not have public ID */ if (!doctype->public_missing) { public = c_string_from_hubbub_string(c, &doctype->public_id); if (public == NULL) { free(name); return 1; } } /* May not have system ID */ if (!doctype->system_missing) { system = c_string_from_hubbub_string(c, &doctype->system_id); if (system == NULL) { free(public); free(name); return 1; } } n = xmlNewDtd(c->document, BAD_CAST name, BAD_CAST (public ? public : ""), BAD_CAST (system ? system : "")); if (n == NULL) { free(system); free(public); free(name); return 1; } /* Again, reference count must be 1 */ n->_private = (void *) (uintptr_t) 1; *result = (void *) n; free(system); free(public); free(name); return 0; } /** * Create an element node * * \param ctx Our context * \param tag Data for node * \param result Location to receive manufactured node * \return 0 on success, 1 on memory exhaustion. * * Postcondition: if successful, result's reference count must be 1. */ int create_element(void *ctx, const hubbub_tag *tag, void **result) { context *c = (context *) ctx; char *name; xmlNodePtr n; name = c_string_from_hubbub_string(c, &tag->name); if (name == NULL) return 1; if (c->namespaces[0] != NULL) { n = xmlNewDocNode(c->document, c->namespaces[tag->ns - 1], BAD_CAST name, NULL); } else { n = xmlNewDocNode(c->document, NULL, BAD_CAST name, NULL); /* We're creating the root node of the document. Therefore, * create the namespaces and set this node's namespace */ if (n != NULL && c->namespaces[0] == NULL) { create_namespaces(c, (void *) n); xmlSetNs(n, c->namespaces[tag->ns - 1]); } } if (n == NULL) { free(name); return 1; } /* Reference count must be 1 */ n->_private = (void *) (uintptr_t) 1; /* Attempt to add attributes to node */ if (tag->n_attributes > 0 && add_attributes(ctx, (void *) n, tag->attributes, tag->n_attributes) != 0) { xmlFreeNode(n); free(name); return 1; } *result = (void *) n; free(name); return 0; } /** * Create a text node * * \param ctx Our context * \param data Node data * \param result Location to receive manufactured node * \return 0 on success, 1 on memory exhaustion. * * Postcondition: if successfult, result's reference count must be 1. */ int create_text(void *ctx, const hubbub_string *data, void **result) { context *c = (context *) ctx; xmlNodePtr n; n = xmlNewDocTextLen(c->document, BAD_CAST data->ptr, (int) data->len); if (n == NULL) { return 1; } /* Reference count must be 1 */ n->_private = (void *) (uintptr_t) 1; *result = (void *) n; return 0; } /** * Increase a node's reference count * * \param ctx Our context * \param node The node to reference * \return 0 on success, 1 on failure */ int ref_node(void *ctx, void *node) { context *c = (context *) ctx; if (node == c->document) { xmlDoc *n = (xmlDoc *) node; uintptr_t count = (uintptr_t) n->_private; n->_private = (void *) ++count; } else { xmlNode *n = (xmlNode *) node; uintptr_t count = (uintptr_t) n->_private; n->_private = (void *) ++count; } return 0; } /** * Decrease a node's reference count * * \param ctx Our context * \param node The node to unreference * \return 0 on success, 1 on failure * * Postcondition: If the node's reference count becomes zero, and it has no * parent, and it is not the document node, then it is destroyed. */ int unref_node(void *ctx, void *node) { context *c = (context *) ctx; if (node == c->document) { xmlDoc *n = (xmlDoc *) node; uintptr_t count = (uintptr_t) n->_private; /* Trap any attempt to unref a non-referenced node */ assert(count != 0 && "Node has refcount of zero"); /* Never destroy document node */ n->_private = (void *) --count; } else { xmlNode *n = (xmlNode *) node; uintptr_t count = (uintptr_t) n->_private; /* Trap any attempt to unref a non-referenced node */ assert(count != 0 && "Node has refcount of zero"); n->_private = (void *) --count; /* Destroy node, if it has no parent */ if (count == 0 && n->parent == NULL) { xmlFreeNode(n); } } return 0; } /** * Append a node to the end of another's child list * * \param ctx Our context * \param parent The node to append to * \param child The node to append * \param result Location to receive appended node * \return 0 on success, 1 on memory exhaustion * * Postcondition: if successful, result's reference count is increased by 1 * * Important: *result may not == child (e.g. if text nodes got coalesced) */ int append_child(void *ctx, void *parent, void *child, void **result) { xmlNode *chld = (xmlNode *) child; xmlNode *p = (xmlNode *) parent; if (chld->type == XML_TEXT_NODE && p->last != NULL && p->last->type == XML_TEXT_NODE) { /* Need to clone the child, as libxml will free it if it * merges the content with a pre-existing text node. */ chld = xmlCopyNode(chld, 0); if (chld == NULL) return 1; *result = xmlAddChild(p, chld); assert(*result != (void *) chld); } else { *result = xmlAddChild(p, chld); } if (*result == NULL) return 1; ref_node(ctx, *result); return 0; } /** * Insert a node into another's child list * * \param ctx Our context * \param parent The node to insert into * \param child The node to insert * \param ref_child The node to insert before * \param result Location to receive inserted node * \return 0 on success, 1 on memory exhaustion * * Postcondition: if successful, result's reference count is increased by 1 * * Important: *result may not == child (e.g. if text nodes got coalesced) */ int insert_before(void *ctx, void *parent, void *child, void *ref_child, void **result) { xmlNode *chld = (xmlNode *) child; xmlNode *ref = (xmlNode *) ref_child; if (chld->type == XML_TEXT_NODE && ref->prev != NULL && ref->prev->type == XML_TEXT_NODE) { /* Clone text node, as it'll be freed by libxml */ chld = xmlCopyNode(chld, 0); if (chld == NULL) return 1; *result = xmlAddNextSibling(ref->prev, chld); assert(*result != (void *) chld); } else { *result = xmlAddPrevSibling(ref, chld); } if (*result == NULL) return 1; ref_node(ctx, *result); return 0; } /** * Remove a node from another's child list * * \param ctx Our context * \param parent The node to remove from * \param child The node to remove * \param result Location to receive removed node * \return 0 on success, 1 on memory exhaustion * * Postcondition: if successful, result's reference count is increased by 1 */ int remove_child(void *ctx, void *parent, void *child, void **result) { xmlNode *chld = (xmlNode *) child; xmlUnlinkNode(chld); *result = child; ref_node(ctx, *result); return 0; } /** * Clone a node * * \param ctx Our context * \param node The node to clone * \param deep True to clone entire subtree, false to clone only the node * \param result Location to receive clone * \return 0 on success, 1 on memory exhaustion * * Postcondition: if successful, result's reference count must be 1. */ int clone_node(void *ctx, void *node, bool deep, void **result) { xmlNode *n = (xmlNode *) node; *result = xmlCopyNode(n, deep ? 1 : 2); if (*result == NULL) return 1; ((xmlNode *)(*result))->_private = (void *) (uintptr_t) 1; return 0; } /** * Move all the children of one node to another * * \param ctx Our context * \param node The initial parent node * \param new_parent The new parent node * \return 0 on success, 1 on memory exhaustion */ int reparent_children(void *ctx, void *node, void *new_parent) { xmlNode *n = (xmlNode *) node; xmlNode *p = (xmlNode *) new_parent; xmlNode *child; for (child = n->children; child != NULL; ) { xmlNode *next = child->next; xmlUnlinkNode(child); if (xmlAddChild(p, child) == NULL) return 1; child = next; } return 0; } /** * Retrieve the parent of a node * * \param ctx Our context * \param node Node to retrieve the parent of * \param element_only True if the parent must be an element, false otherwise * \param result Location to receive parent node * \return 0 on success, 1 on failure * * Postcondition: if there is a parent, then result's reference count must be * increased. */ int get_parent(void *ctx, void *node, bool element_only, void **result) { xmlNode *n = (xmlNode *) node; *result = (void *) n->parent; if (*result != NULL && element_only && ((xmlNode *) *result)->type != XML_ELEMENT_NODE) { *result = NULL; } if (*result != NULL) ref_node(ctx, *result); return 0; } /** * Determine if a node has children * * \param ctx Our context * \param node The node to inspect * \param result Location to receive result * \return 0 on success, 1 on failure */ int has_children(void *ctx, void *node, bool *result) { xmlNode *n = (xmlNode *) node; *result = n->children != NULL; return 0; } /** * Associate a node with a form * * \param ctx Our context * \param form The form to associate with * \param node The node to associate * \return 0 on success, 1 on failure */ int form_associate(void *ctx, void *form, void *node) { /* In this implementation, we do nothing here. * * If we wish to process forms afterwards, then we would want to use * this entry point to associate inputs with form elements. This is * useful because forms may be misnested in the source data and thus * it is not necessarily sufficient to search the resultant DOM to * perform the association. */ return 0; } /** * Add attributes to a node * * \param ctx Our context * \param node The node to add to * \param attributes Array of attributes to add * \param n_attributes Number of entries in array * \return 0 on success, 1 on memory exhaustion */ int add_attributes(void *ctx, void *node, const hubbub_attribute *attributes, uint32_t n_attributes) { context *c = (context *) ctx; xmlNode *n = (xmlNode *) node; uint32_t attr; for (attr = 0; attr < n_attributes; attr++) { xmlAttr *prop; char *name, *value; name = c_string_from_hubbub_string(c, &attributes[attr].name); if (name == NULL) return 1; value = c_string_from_hubbub_string(c, &attributes[attr].value); if (value == NULL) { free(name); return 1; } if (attributes[attr].ns != HUBBUB_NS_NULL && c->namespaces[0] != NULL) { prop = xmlNewNsProp(n, c->namespaces[attributes[attr].ns - 1], BAD_CAST name, BAD_CAST value); } else { prop = xmlNewProp(n, BAD_CAST name, BAD_CAST value); } if (prop == NULL) { free(value); free(name); return 1; } free(value); free(name); } return 0; } /** * Notification of the quirks mode of a document * * \param ctx Our context * \param mode The quirks mode * \return 0 on success, 1 on failure */ int set_quirks_mode(void *ctx, hubbub_quirks_mode mode) { /* In this implementation, we do nothing. * * The quirks mode is really only of any use when applying CSS * to the resulting DOM tree. */ return 0; } /** * Notification that a potential encoding change is required * * \param ctx Our context * \param charset The new charset for the source data * \return 0 to ignore the change and continue using the current input handler, * 1 to stop processing immediately and return control to the client. */ int change_encoding(void *ctx, const char *charset) { context *c = (context *) ctx; uint32_t source; const char *name; /* If we have an encoding here, it means we are *certain* */ if (c->encoding != NULL) { return 0; } /* Find the confidence otherwise (can only be from a BOM) */ name = hubbub_parser_read_charset(c->parser, &source); if (source == HUBBUB_CHARSET_CONFIDENT) { c->enc_source = ENCODING_SOURCE_DETECTED; c->encoding = (char *) charset; return 0; } /* So here we have something of confidence tentative... */ /* http://www.whatwg.org/specs/web-apps/current-work/#change */ /* 2. "If the new encoding is identical or equivalent to the encoding * that is already being used to interpret the input stream, then set * the confidence to confident and abort these steps." */ /* Whatever happens, the encoding should be set here; either for * reprocessing with a different charset, or for confirming that the * charset is in fact correct */ c->encoding = charset; c->enc_source = ENCODING_SOURCE_META; /* Equal encodings will have the same string pointers */ return (charset == name) ? 0 : 1; }