/* * This file is part of libdom. * Licensed under the MIT License, * http://www.opensource.org/licenses/mit-license.php * Copyright 2007 John-Mark Bell * Copyright 2009 Bo Yang * Copyright 2012 Daniel Silverstone */ #include #include #include #include #include #include #include "parser.h" #include "utils.h" #include "core/document.h" #include "core/string.h" #include "core/node.h" #include "html/html_document.h" #include "html/html_button_element.h" #include "html/html_input_element.h" #include "html/html_text_area_element.h" #include /** * libdom Hubbub parser context */ struct dom_hubbub_parser { hubbub_parser *parser; /**< Hubbub parser instance */ hubbub_tree_handler tree_handler; /**< Hubbub parser tree handler */ struct dom_document *doc; /**< DOM Document we're building */ dom_hubbub_encoding_source encoding_source; /**< The document's encoding source */ const char *encoding; /**< The document's encoding */ bool complete; /**< Indicate stream completion */ dom_msg msg; /**< Informational messaging function */ dom_script script; /**< Script callback function */ void *mctx; /**< Pointer to client data */ }; /* Forward declaration to break reference loop */ static hubbub_error add_attributes(void *parser, void *node, const hubbub_attribute *attributes, uint32_t n_attributes); /*--------------------- The callbacks definitions --------------------*/ static hubbub_error create_comment(void *parser, const hubbub_string *data, void **result) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_exception err; dom_string *str; struct dom_comment *comment; *result = NULL; err = dom_string_create(data->ptr, data->len, &str); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't create comment node text"); return HUBBUB_UNKNOWN; } err = dom_document_create_comment(dom_parser->doc, str, &comment); if (err != DOM_NO_ERR) { dom_string_unref(str); dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't create comment node with text '%.*s'", data->len, data->ptr); return HUBBUB_UNKNOWN; } *result = comment; dom_string_unref(str); return HUBBUB_OK; } static char *parser_strndup(const char *s, size_t n) { size_t len; char *s2; for (len = 0; len != n && s[len] != '\0'; len++) continue; s2 = malloc(len + 1); if (s2 == NULL) return NULL; memcpy(s2, s, len); s2[len] = '\0'; return s2; } static hubbub_error create_doctype(void *parser, const hubbub_doctype *doctype, void **result) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_exception err; char *qname, *public_id = NULL, *system_id = NULL; struct dom_document_type *dtype; *result = NULL; qname = parser_strndup((const char *) doctype->name.ptr, (size_t) doctype->name.len); if (qname == NULL) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't create doctype name"); goto fail; } if (doctype->public_missing == false) { public_id = parser_strndup( (const char *) doctype->public_id.ptr, (size_t) doctype->public_id.len); } else { public_id = strdup(""); } if (public_id == NULL) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't create doctype public id"); goto clean1; } if (doctype->system_missing == false) { system_id = parser_strndup( (const char *) doctype->system_id.ptr, (size_t) doctype->system_id.len); } else { system_id = strdup(""); } if (system_id == NULL) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't create doctype system id"); goto clean2; } err = dom_implementation_create_document_type(qname, public_id, system_id, &dtype); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't create the document type"); goto clean3; } *result = dtype; clean3: free(system_id); clean2: free(public_id); clean1: free(qname); fail: if (*result == NULL) return HUBBUB_UNKNOWN; else return HUBBUB_OK; } static hubbub_error create_element(void *parser, const hubbub_tag *tag, void **result) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_exception err; dom_string *name; struct dom_element *element = NULL; hubbub_error herr; *result = NULL; err = dom_string_create_interned(tag->name.ptr, tag->name.len, &name); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't create element name"); goto fail; } if (tag->ns == HUBBUB_NS_NULL) { err = dom_document_create_element(dom_parser->doc, name, &element); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't create the DOM element"); goto clean1; } } else { err = dom_document_create_element_ns(dom_parser->doc, dom_namespaces[tag->ns], name, &element); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't create the DOM element"); goto clean1; } } if (element != NULL && tag->n_attributes > 0) { herr = add_attributes(parser, element, tag->attributes, tag->n_attributes); if (herr != HUBBUB_OK) goto clean1; } *result = element; clean1: dom_string_unref(name); fail: if (*result == NULL) return HUBBUB_UNKNOWN; else return HUBBUB_OK; } static hubbub_error create_text(void *parser, const hubbub_string *data, void **result) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_exception err; dom_string *str; struct dom_text *text = NULL; *result = NULL; err = dom_string_create(data->ptr, data->len, &str); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't create text '%.*s'", data->len, data->ptr); goto fail; } err = dom_document_create_text_node(dom_parser->doc, str, &text); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't create the DOM text node"); goto clean1; } *result = text; clean1: dom_string_unref(str); fail: if (*result == NULL) return HUBBUB_UNKNOWN; else return HUBBUB_OK; } static hubbub_error ref_node(void *parser, void *node) { struct dom_node *dnode = (struct dom_node *) node; UNUSED(parser); dom_node_ref(dnode); return HUBBUB_OK; } static hubbub_error unref_node(void *parser, void *node) { struct dom_node *dnode = (struct dom_node *) node; UNUSED(parser); dom_node_unref(dnode); return HUBBUB_OK; } static hubbub_error append_child(void *parser, void *parent, void *child, void **result) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_exception err; err = dom_node_append_child((struct dom_node *) parent, (struct dom_node *) child, (struct dom_node **) result); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't append child '%p' for parent '%p'", child, parent); return HUBBUB_UNKNOWN; } return HUBBUB_OK; } static hubbub_error insert_before(void *parser, void *parent, void *child, void *ref_child, void **result) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_exception err; err = dom_node_insert_before((struct dom_node *) parent, (struct dom_node *) child, (struct dom_node *) ref_child, (struct dom_node **) result); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't insert node '%p' before node '%p'", child, ref_child); return HUBBUB_UNKNOWN; } return HUBBUB_OK; } static hubbub_error remove_child(void *parser, void *parent, void *child, void **result) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_exception err; err = dom_node_remove_child((struct dom_node *) parent, (struct dom_node *) child, (struct dom_node **) result); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't remove child '%p'", child); return HUBBUB_UNKNOWN; } return HUBBUB_OK; } static hubbub_error clone_node(void *parser, void *node, bool deep, void **result) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_exception err; err = dom_node_clone_node((struct dom_node *) node, deep, (struct dom_node **) result); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't clone node '%p'", node); return HUBBUB_UNKNOWN; } return HUBBUB_OK; } static hubbub_error reparent_children(void *parser, void *node, void *new_parent) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_exception err; struct dom_node *child, *result; while(true) { err = dom_node_get_first_child((struct dom_node *) node, &child); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Error in dom_note_get_first_child"); return HUBBUB_UNKNOWN; } if (child == NULL) break; err = dom_node_remove_child(node, (struct dom_node *) child, &result); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Error in dom_node_remove_child"); goto fail; } dom_node_unref(result); err = dom_node_append_child((struct dom_node *) new_parent, (struct dom_node *) child, &result); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Error in dom_node_append_child"); goto fail; } dom_node_unref(result); dom_node_unref(child); } return HUBBUB_OK; fail: dom_node_unref(child); return HUBBUB_UNKNOWN; } static hubbub_error get_parent(void *parser, void *node, bool element_only, void **result) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_exception err; struct dom_node *parent; dom_node_type type = DOM_NODE_TYPE_COUNT; err = dom_node_get_parent_node((struct dom_node *) node, &parent); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Error in dom_node_get_parent"); return HUBBUB_UNKNOWN; } if (element_only == false) { *result = parent; return HUBBUB_OK; } err = dom_node_get_node_type(parent, &type); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Error in dom_node_get_type"); goto fail; } if (type == DOM_ELEMENT_NODE) { *result = parent; return HUBBUB_OK; } else { *result = NULL; dom_node_unref(parent); return HUBBUB_OK; } return HUBBUB_OK; fail: dom_node_unref(parent); return HUBBUB_UNKNOWN; } static hubbub_error has_children(void *parser, void *node, bool *result) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_exception err; err = dom_node_has_child_nodes((struct dom_node *) node, result); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Error in dom_node_has_child_nodes"); return HUBBUB_UNKNOWN; } return HUBBUB_OK; } static hubbub_error form_associate(void *parser, void *form, void *node) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_html_form_element *form_ele = form; dom_node_internal *ele = node; dom_html_document *doc = (dom_html_document *)ele->owner; dom_exception err = DOM_NO_ERR; /* Determine the kind of the node we have here. */ if (dom_string_caseless_isequal(ele->name, doc->memoised[hds_BUTTON])) { err = _dom_html_button_element_set_form( (dom_html_button_element *)node, form_ele); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Error in form_associate"); return HUBBUB_UNKNOWN; } } else if (dom_string_caseless_isequal(ele->name, doc->memoised[hds_INPUT])) { err = _dom_html_input_element_set_form( (dom_html_input_element *)node, form_ele); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Error in form_associate"); return HUBBUB_UNKNOWN; } } else if (dom_string_caseless_isequal(ele->name, doc->memoised[hds_TEXTAREA])) { err = _dom_html_text_area_element_set_form( (dom_html_text_area_element *)node, form_ele); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Error in form_associate"); return HUBBUB_UNKNOWN; } } return HUBBUB_OK; } static hubbub_error add_attributes(void *parser, void *node, const hubbub_attribute *attributes, uint32_t n_attributes) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_exception err; uint32_t i; for (i = 0; i < n_attributes; i++) { dom_string *name, *value; err = dom_string_create_interned(attributes[i].name.ptr, attributes[i].name.len, &name); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't create attribute name"); goto fail; } err = dom_string_create(attributes[i].value.ptr, attributes[i].value.len, &value); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't create attribute value"); dom_string_unref(name); goto fail; } if (attributes[i].ns == HUBBUB_NS_NULL) { err = dom_element_set_attribute( (struct dom_element *) node, name, value); dom_string_unref(name); dom_string_unref(value); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't add attribute"); } } else { err = dom_element_set_attribute_ns( (struct dom_element *) node, dom_namespaces[attributes[i].ns], name, value); dom_string_unref(name); dom_string_unref(value); if (err != DOM_NO_ERR) { dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, "Can't add attribute ns"); } } } return HUBBUB_OK; fail: return HUBBUB_UNKNOWN; } static hubbub_error set_quirks_mode(void *parser, hubbub_quirks_mode mode) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; switch (mode) { case HUBBUB_QUIRKS_MODE_NONE: dom_document_set_quirks_mode(dom_parser->doc, DOM_DOCUMENT_QUIRKS_MODE_NONE); break; case HUBBUB_QUIRKS_MODE_LIMITED: dom_document_set_quirks_mode(dom_parser->doc, DOM_DOCUMENT_QUIRKS_MODE_LIMITED); break; case HUBBUB_QUIRKS_MODE_FULL: dom_document_set_quirks_mode(dom_parser->doc, DOM_DOCUMENT_QUIRKS_MODE_FULL); break; } return HUBBUB_OK; } static hubbub_error change_encoding(void *parser, const char *charset) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; uint32_t source; const char *name; /* If we have an encoding here, it means we are *certain* */ if (dom_parser->encoding != NULL) { return HUBBUB_OK; } /* Find the confidence otherwise (can only be from a BOM) */ name = hubbub_parser_read_charset(dom_parser->parser, &source); if (source == HUBBUB_CHARSET_CONFIDENT) { dom_parser->encoding_source = DOM_HUBBUB_ENCODING_SOURCE_DETECTED; dom_parser->encoding = charset; return HUBBUB_OK; } /* So here we have something of confidence tentative... */ /* http://www.whatwg.org/specs/web-apps/current-work/#change */ /* 2. "If the new encoding is identical or equivalent to the encoding * that is already being used to interpret the input stream, then set * the confidence to confident and abort these steps." */ /* Whatever happens, the encoding should be set here; either for * reprocessing with a different charset, or for confirming that the * charset is in fact correct */ dom_parser->encoding = charset; dom_parser->encoding_source = DOM_HUBBUB_ENCODING_SOURCE_META; /* Equal encodings will have the same string pointers */ return (charset == name) ? HUBBUB_OK : HUBBUB_ENCODINGCHANGE; } static hubbub_error complete_script(void *parser, void *script) { dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; dom_hubbub_error err; err = dom_parser->script(dom_parser->mctx, (struct dom_node *)script); if (err != DOM_HUBBUB_OK) { return HUBBUB_UNKNOWN; } return HUBBUB_OK; } static hubbub_tree_handler tree_handler = { create_comment, create_doctype, create_element, create_text, ref_node, unref_node, append_child, insert_before, remove_child, clone_node, reparent_children, get_parent, has_children, form_associate, add_attributes, set_quirks_mode, change_encoding, NULL, complete_script }; /** * Memory allocator */ static void *dom_hubbub_alloc(void *ptr, size_t len, void *pw) { UNUSED(pw); if (ptr == NULL) return len > 0 ? malloc(len) : NULL; if (len == 0) { free(ptr); return NULL; } return realloc(ptr, len); } /** * Default message callback */ static void dom_hubbub_parser_default_msg(uint32_t severity, void *ctx, const char *msg, ...) { UNUSED(severity); UNUSED(ctx); UNUSED(msg); } /** * Default script callback. */ static dom_hubbub_error dom_hubbub_parser_default_script(void *ctx, struct dom_node *node) { UNUSED(ctx); UNUSED(node); return DOM_HUBBUB_OK; } /** * Create a Hubbub parser instance * * \param enc Source charset, or NULL * \param fix_enc Whether fix the encoding * \param enable_script Whether scripting should be enabled. * \param msg Informational message function * \param script Script callback function * \param mctx Pointer to client-specific private data * \return Pointer to instance, or NULL on memory exhaustion */ dom_hubbub_parser * dom_hubbub_parser_create(const char *enc, bool fix_enc, bool enable_script, dom_msg msg, dom_script script, void *mctx) { dom_hubbub_parser *parser; hubbub_parser_optparams params; hubbub_error error; dom_exception err; parser = malloc(sizeof(dom_hubbub_parser)); if (parser == NULL) { msg(DOM_MSG_CRITICAL, mctx, "No memory for parsing context"); return NULL; } parser->parser = NULL; parser->doc = NULL; parser->encoding = enc; parser->encoding_source = enc != NULL ? DOM_HUBBUB_ENCODING_SOURCE_HEADER : DOM_HUBBUB_ENCODING_SOURCE_DETECTED; parser->complete = false; if (msg == NULL) { msg = dom_hubbub_parser_default_msg; } parser->msg = msg; parser->mctx = mctx; /* ensure script function is valid or use the default */ if (script == NULL) { script = dom_hubbub_parser_default_script; } parser->script = script; error = hubbub_parser_create(enc, fix_enc, dom_hubbub_alloc, NULL, &parser->parser); if (error != HUBBUB_OK) { free(parser); msg(DOM_MSG_CRITICAL, mctx, "Can't create parser"); return NULL; } /* TODO: Just pass the dom_events_default_action_fetcher a NULL, * we should pass the real function when we integrate libDOM with * Netsurf */ err = dom_implementation_create_document(DOM_IMPLEMENTATION_HTML, NULL, NULL, NULL, NULL, &parser->doc); if (err != DOM_NO_ERR) { hubbub_parser_destroy(parser->parser); free(parser); msg(DOM_MSG_ERROR, mctx, "Can't create DOM document"); return NULL; } parser->tree_handler = tree_handler; parser->tree_handler.ctx = (void *) parser; params.tree_handler = &parser->tree_handler; hubbub_parser_setopt(parser->parser, HUBBUB_PARSER_TREE_HANDLER, ¶ms); dom_node_ref((struct dom_node *) parser->doc); params.document_node = parser->doc; hubbub_parser_setopt(parser->parser, HUBBUB_PARSER_DOCUMENT_NODE, ¶ms); params.enable_scripting = enable_script; hubbub_parser_setopt(parser->parser, HUBBUB_PARSER_ENABLE_SCRIPTING, ¶ms); return parser; } dom_hubbub_error dom_hubbub_parser_insert_chunk(dom_hubbub_parser *parser, const uint8_t *data, size_t length) { hubbub_parser_insert_chunk(parser->parser, data, length); return DOM_HUBBUB_OK; } /** * Destroy a Hubbub parser instance * * \param parser The Hubbub parser object */ void dom_hubbub_parser_destroy(dom_hubbub_parser *parser) { hubbub_parser_destroy(parser->parser); parser->parser = NULL; if (parser->doc != NULL) { dom_node_unref((struct dom_node *) parser->doc); parser->doc = NULL; } free(parser); } /** * Parse data with Hubbub parser * * \param parser The parser object * \param data The data to be parsed * \param len The length of the data to be parsed * \return DOM_HUBBUB_OK on success, * DOM_HUBBUB_HUBBUB_ERR | on failure */ dom_hubbub_error dom_hubbub_parser_parse_chunk(dom_hubbub_parser *parser, const uint8_t *data, size_t len) { hubbub_error err; err = hubbub_parser_parse_chunk(parser->parser, data, len); if (err != HUBBUB_OK) return DOM_HUBBUB_HUBBUB_ERR | err; return DOM_HUBBUB_OK; } /** * Notify the parser to complete parsing * * \param parser The parser object * \return DOM_HUBBUB_OK on success, * DOM_HUBBUB_HUBBUB_ERR | on underlaying parser failure * DOMHUBBUB_UNKNOWN | on libwapcaplet failure */ dom_hubbub_error dom_hubbub_parser_completed(dom_hubbub_parser *parser) { dom_exception derr; hubbub_error err; dom_string *name = NULL; err = hubbub_parser_completed(parser->parser); if (err != HUBBUB_OK) { parser->msg(DOM_MSG_ERROR, parser->mctx, "hubbub_parser_completed failed: %d", err); return DOM_HUBBUB_HUBBUB_ERR | err; } parser->complete = true; derr = dom_string_create_interned((const uint8_t *) "id", SLEN("id"), &name); if (derr != DOM_NO_ERR) return DOM_HUBBUB_HUBBUB_ERR | HUBBUB_UNKNOWN; _dom_document_set_id_name(parser->doc, name); dom_string_unref(name); return DOM_HUBBUB_OK; } /** * Fetch the Document object from the parser * * \param parser The parser object * \return the created document on success, NULL on failure */ dom_document *dom_hubbub_parser_get_document(dom_hubbub_parser *parser) { dom_document *doc = NULL; if (parser->complete) { doc = parser->doc; parser->doc = NULL; } return doc; } /** * Retrieve the encoding * * \param parser The parser object * \param source The encoding_source * \return the encoding name */ const char *dom_hubbub_parser_get_encoding(dom_hubbub_parser *parser, dom_hubbub_encoding_source *source) { *source = parser->encoding_source; return parser->encoding != NULL ? parser->encoding : "Windows-1252"; } /** * Set the Parse pause state. * * \param parser The parser object * \param pause The pause state to set. * \return DOM_HUBBUB_OK on success, * DOM_HUBBUB_HUBBUB_ERR | on failure */ dom_hubbub_error dom_hubbub_parser_pause(dom_hubbub_parser *parser, bool pause) { hubbub_error err; hubbub_parser_optparams params; params.pause_parse = pause; err = hubbub_parser_setopt(parser->parser, HUBBUB_PARSER_PAUSE, ¶ms); if (err != HUBBUB_OK) return DOM_HUBBUB_HUBBUB_ERR | err; return DOM_HUBBUB_OK; }